From a3730dc87bc61593514b830727e36e5d19e753cd Mon Sep 17 00:00:00 2001
From: Sing_chan <51314274+betterpig@users.noreply.github.com>
Date: Sun, 5 Jun 2022 11:11:31 +0800
Subject: [PATCH] [code format check upgrade] step2: clang-format (#42840)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .pre-commit-config.yaml | 4 +
 .../fluid/distributed/collective/HCCLTools.cc | 1 +
 .../fluid/distributed/collective/HCCLTools.h | 1 +
 .../fluid/distributed/collective/NCCLTools.cc | 1 +
 .../fluid/distributed/collective/NCCLTools.h | 4 +-
 .../distributed/collective/ProcessGroup.h | 1 -
 .../collective/ProcessGroupGloo.cc | 6 +-
 .../collective/ProcessGroupHCCL.cc | 20 +-
 .../distributed/collective/ProcessGroupHCCL.h | 5 +-
 .../collective/ProcessGroupHeter.cc | 27 +-
 .../collective/ProcessGroupNCCL.cc | 132 +--
 .../distributed/collective/ProcessGroupNCCL.h | 3 +-
 .../fluid/distributed/collective/reducer.cc | 5 +-
 paddle/fluid/distributed/collective/reducer.h | 1 +
 .../fluid/distributed/common/afs_warpper.cc | 8 +-
 paddle/fluid/distributed/common/afs_warpper.h | 1 +
 paddle/fluid/distributed/common/cost_timer.h | 1 +
 .../fluid/distributed/common/local_random.h | 1 +
 paddle/fluid/distributed/common/registerer.h | 1 +
 .../distributed/fleet_executor/carrier.cc | 8 +-
 .../distributed/fleet_executor/carrier.h | 2 +-
 .../fleet_executor/compute_interceptor.cc | 2 +-
 .../distributed/fleet_executor/dist_model.cc | 9 +-
 .../distributed/fleet_executor/dist_model.h | 2 +-
 .../dist_model_tensor_wrapper.cc | 1 +
 .../dist_model_tensor_wrapper.h | 1 +
 .../fleet_executor/fleet_executor.cc | 3 +-
 .../fleet_executor/fleet_executor.h | 2 +-
 .../distributed/fleet_executor/interceptor.cc | 1 +
 .../distributed/fleet_executor/interceptor.h | 2 +-
 .../distributed/fleet_executor/message_bus.cc | 8 +-
 .../fleet_executor/message_service.cc | 1 +
 .../fleet_executor/runtime_graph.cc | 1 +
 .../fleet_executor/runtime_graph.h | 1 +
 .../fleet_executor/sink_interceptor.cc | 1 +
 .../fleet_executor/source_interceptor.cc | 1 +
 .../fleet_executor/task_loop_thread.cc | 5 +-
 .../fleet_executor/task_loop_thread_pool.cc | 20 +-
 .../distributed/fleet_executor/task_node.cc | 11 +-
 .../distributed/fleet_executor/task_node.h | 2 +-
 .../test/compute_interceptor_run_op_test.cc | 1 -
 .../test/compute_interceptor_test.cc | 1 -
 .../test/interceptor_ping_pong_test.cc | 1 -
 .../interceptor_ping_pong_with_brpc_test.cc | 2 +-
 .../interceptor_pipeline_long_path_test.cc | 1 -
 .../interceptor_pipeline_short_path_test.cc | 1 -
 .../test/sink_interceptor_test.cc | 1 -
 .../test/source_interceptor_test.cc | 1 -
 .../index_dataset/index_sampler.cc | 1 +
 .../distributed/index_dataset/index_sampler.h | 1 +
 .../index_dataset/index_wrapper.cc | 5 +-
 .../distributed/index_dataset/index_wrapper.h | 10 +-
 .../distributed/ps/service/brpc_ps_client.cc | 13 +-
 .../distributed/ps/service/brpc_ps_client.h | 1 +
 .../distributed/ps/service/brpc_ps_server.cc | 2 +
 .../fluid/distributed/ps/service/brpc_utils.h | 1 +
 .../ps/service/communicator/communicator.cc | 2 +
 .../ps/service/communicator/communicator.h | 9 +-
 paddle/fluid/distributed/ps/service/env.h | 2 +
 .../ps/service/graph_brpc_client.cc | 8 +-
 .../ps/service/graph_brpc_client.h | 3 +-
 .../ps/service/graph_brpc_server.cc | 3 +-
 .../ps/service/graph_brpc_server.h | 6 +-
 .../fluid/distributed/ps/service/ps_client.cc | 1 +
 .../fluid/distributed/ps/service/ps_client.h | 1 +
 .../distributed/ps/service/ps_local_client.cc | 5 +-
 .../distributed/ps/service/ps_local_client.h | 4 +-
 .../distributed/ps/service/ps_local_server.h | 5 +-
 .../ps/service/ps_service/graph_py_service.cc | 6 +-
 .../ps/service/ps_service/graph_py_service.h | 16 +-
 .../ps/service/ps_service/service.cc | 2 +
 paddle/fluid/distributed/ps/service/server.h | 1 +
 paddle/fluid/distributed/ps/table/accessor.h | 2 +
 .../ps/table/common_graph_table.cc | 5 +-
 .../distributed/ps/table/common_graph_table.h | 4 +-
 .../fluid/distributed/ps/table/common_table.h | 3 +-
 .../distributed/ps/table/ctr_accessor.cc | 2 +
 .../fluid/distributed/ps/table/ctr_accessor.h | 2 +
 .../ps/table/ctr_double_accessor.cc | 2 +
 .../ps/table/ctr_double_accessor.h | 2 +
 .../distributed/ps/table/ctr_dymf_accessor.cc | 2 +
 .../distributed/ps/table/ctr_dymf_accessor.h | 2 +
 .../distributed/ps/table/depends/dense.h | 3 +-
 .../ps/table/depends/feature_value.h | 4 +-
 .../ps/table/depends/geo_recorder.h | 1 +
 .../ps/table/depends/initializers.h | 3 +-
 .../ps/table/depends/rocksdb_warpper.h | 5 +-
 .../distributed/ps/table/graph/graph_edge.cc | 5 +-
 .../distributed/ps/table/graph/graph_edge.h | 4 +-
 .../distributed/ps/table/graph/graph_node.cc | 1 +
 .../distributed/ps/table/graph/graph_node.h | 1 +
 .../ps/table/graph/graph_weighted_sampler.cc | 2 +
 .../ps/table/graph/graph_weighted_sampler.h | 1 +
 .../distributed/ps/table/memory_dense_table.h | 2 +
 .../ps/table/memory_sparse_geo_table.h | 1 +
 .../ps/table/memory_sparse_table.cc | 14 +-
 .../ps/table/memory_sparse_table.h | 2 +
 .../distributed/ps/table/sparse_accessor.cc | 2 +
 .../distributed/ps/table/sparse_accessor.h | 2 +
 .../distributed/ps/table/sparse_sgd_rule.cc | 2 +
 .../distributed/ps/table/sparse_sgd_rule.h | 2 +
 .../distributed/ps/table/ssd_sparse_table.cc | 11 +-
 paddle/fluid/distributed/ps/table/table.cc | 4 +-
 paddle/fluid/distributed/ps/table/table.h | 2 +
 .../distributed/ps/table/tensor_accessor.cc | 1 +
 .../distributed/ps/table/tensor_accessor.h | 1 +
 paddle/fluid/distributed/ps/wrapper/fleet.cc | 3 +-
 paddle/fluid/distributed/ps/wrapper/fleet.h | 2 +-
 .../fluid/distributed/ps/wrapper/ps_wrapper.h | 2 +-
 paddle/fluid/distributed/store/tcp_store.cc | 3 +-
 paddle/fluid/distributed/store/tcp_utils.cc | 24 +-
 paddle/fluid/distributed/store/tcp_utils.h | 15 +-
 .../distributed/test/barrier_table_test.cc | 2 +
 .../test/brpc_service_dense_sgd_test.cc | 1 +
 .../test/brpc_service_sparse_sgd_test.cc | 1 +
 .../fluid/distributed/test/brpc_utils_test.cc | 4 +-
 .../distributed/test/ctr_accessor_test.cc | 2 +
 .../test/ctr_dymf_accessor_test.cc | 2 +
 .../distributed/test/dense_table_test.cc | 2 +
 .../distributed/test/feature_value_test.cc | 2 +
 .../distributed/test/graph_node_split_test.cc | 3 +-
 .../fluid/distributed/test/graph_node_test.cc | 6 +-
 .../test/graph_table_sample_test.cc | 5 +-
 .../distributed/test/memory_geo_table_test.cc | 2 +-
 .../test/memory_sparse_table_test.cc | 5 +-
 .../distributed/test/sparse_sgd_rule_test.cc | 2 +
 paddle/fluid/distributed/test/table_test.cc | 2 +-
 .../eager/accumulation/accumulation_node.cc | 13 +-
 .../eager/accumulation/accumulation_node.h | 3 +-
 paddle/fluid/eager/amp_utils.h | 1 +
 .../eager_generated/backwards/scale_node.cc | 11 +-
 .../eager_generated/backwards/scale_node.h | 3 +-
 .../eager_generated/forwards/scale.cc | 2 +-
 paddle/fluid/eager/api/utils/global_utils.h | 6 +-
 paddle/fluid/eager/api/utils/hook_utils.cc | 1 +
 paddle/fluid/eager/api/utils/tensor_utils.cc | 5 +-
 paddle/fluid/eager/backward.cc | 18 +-
 .../custom_operator/custom_operator_node.cc | 1 +
 paddle/fluid/eager/grad_node_info.cc | 15 +-
 paddle/fluid/eager/grad_node_info.h | 19 +-
 paddle/fluid/eager/grad_tensor_holder.cc | 2 +-
 paddle/fluid/eager/hooks.h | 1 +
 paddle/fluid/eager/pylayer/py_layer_node.cc | 13 +-
 paddle/fluid/eager/pylayer/py_layer_node.h | 3 +-
 .../accumulation_node_test.cc | 4 +-
 .../autograd_meta_test.cc | 4 +-
 .../data_structure_tests/eager_tensor_test.cc | 6 +-
 .../grad_node_info_test.cc | 8 +-
 .../data_structure_tests/grad_node_test.h | 4 +-
 .../grad_tensor_holder_test.cc | 7 +-
 .../tensor_wrapper_test.cc | 4 +-
 .../performance_tests/benchmark_eager_cpu.cc | 8 +-
 .../performance_tests/benchmark_eager_cuda.cc | 8 +-
 .../performance_tests/benchmark_fluid_cpu.cc | 1 -
 .../performance_tests/benchmark_fluid_cuda.cc | 1 -
 .../tests/performance_tests/benchmark_utils.h | 1 +
 .../eager/tests/task_tests/backward_test.cc | 11 +-
 .../cross_batch_accumulation_test.cc | 11 +-
 .../tests/task_tests/eager_utils_test.cc | 2 -
 .../tests/task_tests/forward_autograd_test.cc | 5 +-
 .../tests/task_tests/fwd_bwd_joint_test.cc | 8 +-
 .../eager/tests/task_tests/generated_test.cc | 7 +-
 .../fluid/eager/tests/task_tests/grad_test.cc | 5 +-
 .../fluid/eager/tests/task_tests/hook_test.cc | 11 +-
 .../task_tests/hook_test_intermidiate.cc | 6 +-
 .../tests/task_tests/nan_inf_utils_test.cc | 4 +-
 .../tests/task_tests/tensor_utils_test.cc | 5 +-
 paddle/fluid/eager/tests/test_utils.h | 8 +-
 .../eager/to_static/run_program_op_node.h | 6 +-
 paddle/fluid/eager/utils.h | 14 +-
 paddle/fluid/framework/archive.h | 2 +
 paddle/fluid/framework/async_executor.cc | 4 +-
 paddle/fluid/framework/async_executor.h | 2 +
 paddle/fluid/framework/attribute.h | 1 +
 paddle/fluid/framework/attribute_test.cc | 6 +-
 paddle/fluid/framework/channel.h | 2 +
 paddle/fluid/framework/convert_utils_test.cc | 1 +
 .../fluid/framework/copy_same_tensor_test.cc | 1 +
 paddle/fluid/framework/custom_operator.cc | 60 +-
 .../framework/data_device_transform_test.cu | 4 +-
 paddle/fluid/framework/data_feed.cc | 19 +-
 paddle/fluid/framework/data_feed_factory.cc | 1 +
 paddle/fluid/framework/data_feed_test.cc | 3 +
 paddle/fluid/framework/data_set.cc | 1 +
 paddle/fluid/framework/data_set.h | 2 +
 paddle/fluid/framework/data_type_test.cc | 2 +-
 .../framework/data_type_transform_test.cu | 3 +-
 .../bind_threaded_ssa_graph_executor.cc | 2 +
 .../bind_threaded_ssa_graph_executor.h | 2 +
 .../fluid/framework/details/bkcl_op_handle.h | 3 +-
 .../fluid/framework/details/build_strategy.cc | 1 +
 .../framework/details/build_strategy_test.cc | 4 +-
 .../fluid/framework/details/cow_ptr_test.cc | 1 +
 .../framework/details/execution_strategy.h | 1 +
 .../fast_threaded_ssa_graph_executor.h | 2 +
 .../details/fused_all_reduce_op_handle.cc | 16 +-
 .../grad_merge_all_reduce_op_handle.cc | 1 +
 .../fluid/framework/details/graph_test_base.h | 1 +
 .../framework/details/nan_inf_utils_detail.cc | 10 +-
 .../framework/details/nan_inf_utils_detail.cu | 5 +-
 paddle/fluid/framework/details/op_registry.h | 38 +-
 .../details/parallel_ssa_graph_executor.cc | 7 +-
 .../details/parallel_ssa_graph_executor.h | 1 +
 .../framework/details/reduce_op_handle.cc | 17 +-
 .../fluid/framework/details/rpc_op_handle.cc | 1 +
 .../details/scope_buffered_monitor.cc | 1 +
 .../scope_buffered_ssa_graph_executor.h | 2 +
 .../details/sparse_all_reduce_op_handle.cc | 16 +-
 paddle/fluid/framework/device_worker.cc | 7 +-
 .../fluid/framework/device_worker_factory.cc | 1 +
 paddle/fluid/framework/dlpack_tensor.cc | 1 +
 paddle/fluid/framework/dlpack_tensor_test.cc | 3 +-
 .../fluid/framework/downpour_lite_worker.cc | 8 +-
 paddle/fluid/framework/downpour_worker.cc | 23 +-
 paddle/fluid/framework/eigen_test.cc | 3 +-
 paddle/fluid/framework/executor.cc | 7 +-
 paddle/fluid/framework/executor_cache.cc | 1 +
 .../fluid/framework/executor_thread_worker.cc | 19 +-
 .../fluid/framework/executor_thread_worker.h | 1 +
 paddle/fluid/framework/feed_fetch_method.cc | 2 +-
 paddle/fluid/framework/fleet/ascend_wrapper.h | 9 +-
 paddle/fluid/framework/fleet/box_wrapper.cc | 22 +-
 paddle/fluid/framework/fleet/box_wrapper.cu | 15 +-
 paddle/fluid/framework/fleet/box_wrapper.h | 37 +-
 .../fluid/framework/fleet/box_wrapper_impl.h | 10 +-
 paddle/fluid/framework/fleet/fleet_wrapper.h | 1 +
 paddle/fluid/framework/fleet/gloo_wrapper.cc | 1 +
 paddle/fluid/framework/fleet/gloo_wrapper.h | 5 +-
 paddle/fluid/framework/fleet/heter_context.h | 1 +
 .../cudf/concurrent_unordered_map.cuh.h | 3 +-
 .../framework/fleet/heter_ps/gpu_graph_node.h | 5 +-
 .../fleet/heter_ps/graph_gpu_ps_table.h | 6 +-
 .../fleet/heter_ps/graph_gpu_ps_table_inl.cu | 40 +-
 .../fleet/heter_ps/graph_gpu_wrapper.cu | 4 +-
 .../fleet/heter_ps/graph_gpu_wrapper.h | 5 +-
 .../framework/fleet/heter_ps/graph_sampler.h | 6 +-
 .../fleet/heter_ps/graph_sampler_inl.h | 4 +-
 .../framework/fleet/heter_ps/hashtable.h | 2 +
 .../fleet/heter_ps/hashtable_kernel.cu | 38 +-
 .../framework/fleet/heter_ps/heter_comm.h | 2 +
 .../framework/fleet/heter_ps/heter_comm_inl.h | 1 +
 .../fleet/heter_ps/heter_comm_kernel.cu | 8 +-
 .../framework/fleet/heter_ps/heter_ps.cc | 1 +
 .../framework/fleet/heter_ps/heter_ps.cu | 1 +
 .../fluid/framework/fleet/heter_ps/heter_ps.h | 1 +
 .../framework/fleet/heter_ps/heter_ps_base.h | 1 +
 .../framework/fleet/heter_ps/heter_resource.h | 1 +
 .../framework/fleet/heter_ps/optimizer.cuh.h | 1 +
 .../framework/fleet/heter_ps/test_comm.cu | 2 +
 .../fleet/heter_ps/test_cpu_graph_sample.cu | 2 +
 .../fleet/heter_ps/test_cpu_query.cu | 2 +
 .../framework/fleet/heter_ps/test_graph.cu | 2 +
 .../fleet/heter_ps/test_sample_rate.cu | 20 +-
 paddle/fluid/framework/fleet/metrics.cc | 21 +-
 paddle/fluid/framework/fleet/metrics.h | 2 +
 .../fluid/framework/fleet/ps_gpu_wrapper.cu | 1 +
 paddle/fluid/framework/fleet/ps_gpu_wrapper.h | 1 +
 paddle/fluid/framework/fleet/test_fleet.cc | 1 +
 paddle/fluid/framework/generator.cc | 1 +
 paddle/fluid/framework/generator.h | 1 +
 paddle/fluid/framework/gpu_utils.h | 17 +-
 paddle/fluid/framework/grad_op_desc_maker.h | 6 +-
 paddle/fluid/framework/heter_service.h | 1 +
 paddle/fluid/framework/hetercpu_worker.cc | 23 +-
 paddle/fluid/framework/heterxpu_trainer.cc | 1 +
 .../fluid/framework/infershape_utils_test.cc | 4 +-
 paddle/fluid/framework/inplace_op_inference.h | 1 +
 .../framework/io/crypto/aes_cipher_test.cc | 3 +
 paddle/fluid/framework/io/crypto/cipher.cc | 1 +
 .../fluid/framework/io/crypto/cipher_utils.cc | 1 +
 .../framework/io/crypto/cipher_utils_test.cc | 5 +-
 paddle/fluid/framework/io/fs.cc | 1 +
 paddle/fluid/framework/io/fs.h | 1 +
 paddle/fluid/framework/io/test_fs.cc | 2 +
 ...ptive_pool2d_convert_global_pass_tester.cc | 4 +-
 .../framework/ir/add_support_int8_pass.cc | 5 +-
 .../framework/ir/coalesce_grad_tensor_pass.cc | 2 +
 .../framework/ir/conv_bn_fuse_pass_tester.cc | 4 +-
 paddle/fluid/framework/ir/cost_model.cc | 1 +
 paddle/fluid/framework/ir/cost_model_test.cc | 1 +
 .../ir/cudnn_placement_pass_tester.cc | 4 +-
 .../framework/ir/delete_dropout_op_pass.cc | 4 +-
 .../ir/delete_fill_constant_op_pass.cc | 1 +
 .../ir/delete_quant_dequant_filter_op_pass.cc | 7 +-
 ...ding_eltwise_layernorm_fuse_pass_tester.cc | 4 +-
 .../ir/embedding_fc_lstm_fuse_pass.cc | 1 +
 ..._elementwise_layernorm_fuse_pass_tester.cc | 3 +-
 paddle/fluid/framework/ir/fc_fuse_pass.cc | 1 +
 .../fluid/framework/ir/fc_fuse_pass_tester.cc | 4 +-
 .../framework/ir/fc_gru_fuse_pass_tester.h | 4 +-
 .../fluid/framework/ir/fc_lstm_fuse_pass.cc | 1 +
 .../framework/ir/fc_lstm_fuse_pass_tester.h | 4 +-
 .../ir/fillconstant_elementwisemul_fuse.h | 1 +
 paddle/fluid/framework/ir/fuse_bn_act_pass.cc | 2 +
 .../framework/ir/fuse_bn_add_act_pass.cc | 2 +
 .../framework/ir/fuse_elewise_add_act_pass.cc | 2 +
 .../framework/ir/fuse_gemm_epilogue_pass.cc | 2 +
 .../fuse_adam_op_pass.cc | 1 +
 .../fuse_momentum_op_pass.cc | 5 +-
 .../fuse_optimizer_op_pass.cc | 1 +
 .../ir/fuse_relu_depthwise_conv_pass.cc | 2 +
 .../ir/fusion_group/code_generator.cc | 1 +
 .../ir/fusion_group/code_generator_helper.cc | 1 +
 .../ir/fusion_group/code_generator_tester.cc | 1 +
 .../elementwise_group_detector.cc | 1 +
 .../ir/fusion_group/fusion_group_pass.cc | 1 +
 .../fusion_group/fusion_group_pass_tester.cc | 4 +-
 .../framework/ir/fusion_group/operation.cc | 1 +
 .../framework/ir/fusion_group/subgraph.h | 1 +
 paddle/fluid/framework/ir/generate_pass.cc | 320 +++---
 .../framework/ir/generate_pass_tester.cc | 10 +-
 .../ir/gpu_cpu_map_matmul_to_mul_pass.cc | 2 +-
 paddle/fluid/framework/ir/graph.cc | 3 +-
 paddle/fluid/framework/ir/graph.h | 1 +
 paddle/fluid/framework/ir/graph_helper.cc | 7 +-
 .../fluid/framework/ir/graph_helper_test.cc | 5 +-
 .../framework/ir/graph_pattern_detector.cc | 15 +-
 .../ir/graph_pattern_detector_tester.cc | 12 +-
 paddle/fluid/framework/ir/graph_printer.h | 2 +
 paddle/fluid/framework/ir/graph_test.cc | 1 +
 .../framework/ir/graph_to_program_pass.cc | 1 +
 paddle/fluid/framework/ir/graph_traits.cc | 17 +-
 paddle/fluid/framework/ir/graph_viz_pass.cc | 2 +
 .../ir/identity_scale_op_clean_pass.cc | 67 +-
 .../fluid/framework/ir/ipu/avg_shard_pass.cc | 3 +-
 .../framework/ir/ipu/infer_shape_pass.cc | 1 +
 .../ir/ipu/inference_process_pass.cc | 5 +-
 .../ir/ipu/optimizer_state_align_pass.cc | 1 +
 .../fluid/framework/ir/is_test_pass_tester.cc | 4 +-
 .../framework/ir/layer_norm_fuse_pass.cc | 3 +-
 .../framework/ir/lock_free_optimize_pass.h | 50 +-
 .../framework/ir/matmul_scale_fuse_pass.cc | 2 +-
 ...uffer_shared_cross_op_memory_reuse_pass.cc | 10 +-
 .../buffer_shared_inplace_op_pass.cc | 5 +-
 .../memory_optimization_var_info.h | 1 +
 .../ir/memory_optimize_pass/op_graph_view.h | 10 +-
 .../recurrent_op_eager_deletion_pass.cc | 2 +-
 .../share_varinfo_into_cinn_pass.cc | 1 +
 .../share_varinfo_into_cinn_pass_test.cc | 1 +
 .../ir/mixed_precision_configure_pass.cc | 8 +-
 .../ir/mkldnn/batch_norm_act_fuse_pass.cc | 1 +
 .../mkldnn/batch_norm_act_fuse_pass_tester.cc | 57 +-
 .../compute_propagate_scales_mkldnn_pass.cc | 4 +-
 .../compute_propagate_scales_mkldnn_pass.h | 1 +
 ...conv_activation_mkldnn_fuse_pass_tester.cc | 5 +-
 .../ir/mkldnn/conv_bias_mkldnn_fuse_pass.h | 8 +-
 .../conv_bias_mkldnn_fuse_pass_tester.cc | 6 +-
 ...onv_concat_relu_mkldnn_fuse_pass_tester.cc | 4 +-
 .../framework/ir/mkldnn/cpu_bfloat16_pass.cc | 2 +-
 .../framework/ir/mkldnn/cpu_quantize_pass.cc | 3 +-
 .../ir/mkldnn/cpu_quantize_pass_tester.cc | 3 +-
 .../cpu_quantize_placement_pass_tester.cc | 4 +-
 .../depthwise_conv_mkldnn_pass_tester.cc | 3 +-
 .../ir/mkldnn/elt_act_mkldnn_fuse_pass.cc | 1 +
 .../ir/mkldnn/fc_act_mkldnn_fuse_pass.cc | 1 +
 .../mkldnn/fc_act_mkldnn_fuse_pass_tester.cc | 32 +-
 .../fc_elementwise_add_mkldnn_fuse_pass.cc | 1 +
 .../int8_scale_calculation_mkldnn_pass.cc | 16 +-
 ...t8_scale_calculation_mkldnn_pass_tester.cc | 3 +-
 .../ir/mkldnn/interpolate_mkldnn_pass.cc | 2 +
 .../matmul_transpose_reshape_fuse_pass.cc | 3 +
 ...tmul_transpose_reshape_fuse_pass_tester.cc | 1 +
 .../matmul_v2_transpose_reshape_fuse_pass.cc | 2 +
 .../mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc | 3 +-
 .../mkldnn/mkldnn_fc_rnn_fuse_pass_tester.cc | 1 +
 .../ir/mkldnn/mkldnn_inplace_pass.cc | 2 +
 .../framework/ir/mkldnn/mkldnn_inplace_pass.h | 1 +
 .../ir/mkldnn/mkldnn_inplace_pass_tester.cc | 5 +-
 .../framework/ir/mkldnn/mkldnn_pass_util.h | 1 +
 .../ir/mkldnn/mkldnn_placement_pass_tester.cc | 5 +-
 .../ir/mkldnn/multi_gru_fuse_pass.cc | 2 +
 .../framework/ir/mkldnn/multi_gru_fuse_pass.h | 1 +
 .../ir/mkldnn/multi_gru_fuse_pass_tester.cc | 3 +-
 .../ir/mkldnn/multi_gru_seq_fuse_pass.cc | 2 +
 .../ir/mkldnn/multi_gru_seq_fuse_pass.h | 1 +
 .../mkldnn/multi_gru_seq_fuse_pass_tester.cc | 4 +-
 .../ir/mkldnn/quant_dequant_mkldnn_pass.cc | 11 +-
 .../ir/mkldnn/quant_dequant_mkldnn_pass.h | 1 +
 ...shape_transpose_matmul_mkldnn_fuse_pass.cc | 2 +
 ...ranspose_matmul_mkldnn_fuse_pass_tester.cc | 4 +-
 ...pe_transpose_matmul_v2_mkldnn_fuse_pass.cc | 2 +
 .../mkldnn/scale_matmul_fuse_pass_tester.cc | 3 +-
 .../shuffle_channel_mkldnn_detect_pass.cc | 3 +-
 ...uffle_channel_mkldnn_detect_pass_tester.cc | 1 +
 .../softplus_activation_mkldnn_fuse_pass.cc | 1 +
 ...plus_activation_mkldnn_fuse_pass_tester.cc | 45 +-
 .../framework/ir/multi_batch_merge_pass.cc | 1 +
 .../add_reader_dependency_pass.cc | 1 +
 .../fix_op_run_order_pass.cc | 1 +
 .../fuse_all_reduce_op_pass.cc | 1 +
 .../multi_devices_graph_pass.cc | 32 +-
 .../multi_devices_graph_pass.h | 2 +-
 .../set_reader_device_info_utils.cc | 1 +
 .../ir/multihead_matmul_fuse_pass.cc | 37 +-
 .../ir/multihead_matmul_fuse_pass_tester.cc | 3 +-
 paddle/fluid/framework/ir/node_test.cc | 1 +
 .../framework/ir/op_compat_sensible_pass.cc | 2 +
 .../framework/ir/op_compat_sensible_pass.h | 1 +
 .../ir/op_compat_sensible_pass_tester.cc | 2 +-
 paddle/fluid/framework/ir/pass.cc | 8 +-
 paddle/fluid/framework/ir/pass_test.cc | 10 +-
 paddle/fluid/framework/ir/pass_test_util.cc | 3 +-
 .../fluid/framework/ir/pass_tester_helper.h | 1 +
 .../fluid/framework/ir/placement_pass_base.cc | 2 +
 .../ir/preln_skip_layernorm_fuse_pass.cc | 4 +-
 .../ir/repeated_fc_relu_fuse_pass.cc | 13 +-
 .../ir/repeated_fc_relu_fuse_pass_tester.cc | 4 +-
 .../ir/runtime_context_cache_pass.cc | 1 +
 .../ir/seqconv_eltadd_relu_fuse_pass.cc | 1 +
 .../framework/ir/seqpool_concat_fuse_pass.cc | 4 +-
 .../ir/seqpool_concat_fuse_pass_tester.cc | 3 +-
 .../ir/seqpool_cvm_concat_fuse_pass.cc | 36 +-
 .../ir/seqpool_cvm_concat_fuse_pass_tester.cc | 3 +-
 .../ir/shuffle_channel_detect_pass.cc | 3 +-
 .../ir/simplify_with_basic_ops_pass_tester.cc | 4 +-
 .../framework/ir/skip_layernorm_fuse_pass.cc | 4 +-
 .../ir/skip_layernorm_fuse_pass_tester.cc | 4 +-
 .../framework/ir/squared_mat_sub_fuse_pass.cc | 10 +-
 .../ir/sync_batch_norm_pass_tester.cc | 1 +
 .../ir/transpose_flatten_concat_fuse_pass.cc | 1 +
 .../ir/trt_map_matmul_to_mul_pass.cc | 2 +-
 .../ir/trt_multihead_matmul_fuse_pass.cc | 37 +-
 .../ir/trt_skip_layernorm_fuse_pass.cc | 4 +-
 .../ir/unsqueeze2_eltwise_fuse_pass_tester.cc | 4 +-
 .../fluid/framework/ir/yolo_box_fuse_pass.cc | 2 +
 paddle/fluid/framework/lod_tensor.h | 1 +
 paddle/fluid/framework/lod_tensor_array.h | 1 +
 paddle/fluid/framework/lod_tensor_test.cc | 3 +-
 paddle/fluid/framework/naive_executor.cc | 2 +
 paddle/fluid/framework/naive_executor_test.cc | 3 +
 .../framework/new_executor/data_transfer.cc | 7 +-
 .../framework/new_executor/event_manager.cc | 1 +
 .../new_executor/executor_statistics.cc | 6 +-
 .../new_executor/executor_statistics.h | 1 +
 .../event_garbage_collector.cc | 4 +-
 .../event_garbage_collector.h | 1 +
 .../garbage_collector/garbage_collector.cc | 1 +
 .../garbage_collector/garbage_collector.h | 1 +
 .../framework/new_executor/interpretercore.cc | 20 +-
 .../new_executor/interpretercore_util.cc | 28 +-
 .../new_executor/interpretercore_util.h | 3 +-
 .../new_executor/new_executor_defs.cc | 22 +-
 .../new_executor/standalone_executor.cc | 1 +
 .../new_executor/standalone_executor_test.cc | 1 +
 .../framework/new_executor/stream_analyzer.cc | 1 +
 .../framework/new_executor/stream_analyzer.h | 1 +
 .../new_executor/workqueue/event_count.h | 1 +
 .../new_executor/workqueue/events_waiter.cc | 2 +
 .../new_executor/workqueue/events_waiter.h | 1 +
 .../workqueue/nonblocking_threadpool.h | 1 +
 .../new_executor/workqueue/run_queue.h | 26 +-
 .../new_executor/workqueue/workqueue.cc | 15 +-
 .../new_executor/workqueue/workqueue.h | 16 +-
 .../new_executor/workqueue/workqueue_test.cc | 14 +-
 .../new_executor/workqueue/workqueue_utils.cc | 1 +
 .../new_executor/workqueue/workqueue_utils.h | 1 +
 .../no_need_buffer_vars_inference.cc | 2 +
 .../no_need_buffer_vars_inference_test.cc | 1 +
 paddle/fluid/framework/op_def_api.cc | 2 +
 paddle/fluid/framework/op_def_api.h | 4 +-
 paddle/fluid/framework/op_desc.cc | 28 +-
 paddle/fluid/framework/op_proto_maker.h | 1 +
 paddle/fluid/framework/op_registry_test.cc | 9 +-
 paddle/fluid/framework/op_version_proto.h | 1 +
 .../framework/op_version_registry_test.cc | 4 +-
 paddle/fluid/framework/operator.cc | 19 +-
 paddle/fluid/framework/operator.h | 17 +-
 .../framework/operator_exception_test.cc | 3 +-
 .../fluid/framework/operator_kernel_configs.h | 1 +
 paddle/fluid/framework/operator_test.cc | 4 +-
 .../framework/paddle2cinn/build_cinn_pass.cc | 26 +-
 .../paddle2cinn/build_cinn_pass_test.cc | 1 -
 .../framework/paddle2cinn/cinn_cache_key.cc | 2 +-
 .../paddle2cinn/cinn_cache_key_test.cc | 2 +
 .../framework/paddle2cinn/cinn_compiler.cc | 6 +-
 .../framework/paddle2cinn/cinn_compiler.h | 1 +
 .../paddle2cinn/cinn_compiler_test.cc | 2 +-
 .../paddle2cinn/cinn_graph_symbolization.cc | 2 +
 .../paddle2cinn/cinn_graph_symbolization.h | 2 +
 .../cinn_graph_symbolization_test.cc | 4 +-
 .../framework/paddle2cinn/transform_desc.h | 3 +
 .../paddle2cinn/transform_desc_test.cc | 2 +
 .../framework/paddle2cinn/transform_type.cc | 1 +
 .../framework/paddle2cinn/transform_type.h | 2 +-
 .../paddle2cinn/transform_type_test.cc | 1 +
 paddle/fluid/framework/parallel_executor.cc | 5 +-
 paddle/fluid/framework/parallel_executor.h | 2 +-
 paddle/fluid/framework/phi_utils.cc | 4 +-
 paddle/fluid/framework/phi_utils.h | 3 +-
 paddle/fluid/framework/phi_utils_test.cc | 1 +
 paddle/fluid/framework/program_desc.cc | 1 +
 paddle/fluid/framework/program_desc.h | 1 +
 paddle/fluid/framework/program_processing.cc | 1 +
 paddle/fluid/framework/prune.cc | 1 +
 paddle/fluid/framework/prune_test.cc | 1 +
 paddle/fluid/framework/ps_gpu_trainer.cc | 1 +
 paddle/fluid/framework/pull_dense_worker.cc | 1 +
 paddle/fluid/framework/reader.cc | 1 +
 paddle/fluid/framework/save_load_util.cc | 5 +-
 paddle/fluid/framework/save_load_util_test.cc | 3 +-
 paddle/fluid/framework/scope_guard.h | 13 +-
 paddle/fluid/framework/scope_guard_test.cc | 1 +
 paddle/fluid/framework/section_worker.cc | 1 +
 paddle/fluid/framework/selected_rows_utils.h | 3 +-
 .../framework/selected_rows_utils_test.cc | 4 +-
 paddle/fluid/framework/string_array.cc | 3 +-
 paddle/fluid/framework/tensor.h | 5 +-
 paddle/fluid/framework/tensor_impl.h | 13 +-
 paddle/fluid/framework/tensor_test.cc | 1 +
 paddle/fluid/framework/tensor_util.cc | 14 +-
 paddle/fluid/framework/tensor_util_test.cc | 98 +-
 paddle/fluid/framework/threadpool.cc | 5 +-
 paddle/fluid/framework/threadpool_test.cc | 2 +
 paddle/fluid/framework/trainer.cc | 1 +
 paddle/fluid/framework/trainer_factory.cc | 1 +
 paddle/fluid/framework/trainer_test.cc | 5 +-
 paddle/fluid/framework/type_defs.h | 1 +
 paddle/fluid/framework/unused_var_check.cc | 1 +
 paddle/fluid/framework/unused_var_check.h | 1 +
 paddle/fluid/framework/var_desc.cc | 14 +-
 .../framework/var_type_inference_test.cc | 9 +-
 paddle/fluid/framework/var_type_traits.cc | 2 +
 .../fluid/framework/var_type_traits_test.cc | 3 +-
 paddle/fluid/framework/version.cc | 4 +-
 paddle/fluid/framework/version_test.cc | 1 +
 paddle/fluid/imperative/all_reduce.cc | 1 +
 paddle/fluid/imperative/amp_auto_cast.cc | 7 +-
 paddle/fluid/imperative/basic_engine.h | 1 +
 paddle/fluid/imperative/bkcl_context.cc | 12 +-
 paddle/fluid/imperative/cncl_context.cc | 18 +-
 paddle/fluid/imperative/data_loader.cc | 1 +
 paddle/fluid/imperative/data_loader.h | 1 +
 paddle/fluid/imperative/execution_context.h | 1 +
 paddle/fluid/imperative/flags.cc | 1 +
 paddle/fluid/imperative/gloo_context.cc | 1 +
 paddle/fluid/imperative/gloo_context.h | 1 +
 .../fluid/imperative/gradient_accumulator.cc | 12 +-
 .../fluid/imperative/gradient_accumulator.h | 1 +
 paddle/fluid/imperative/hccl_context.cc | 18 +-
 .../fluid/imperative/infer_var_type_context.h | 1 +
 .../imperative/jit/program_desc_tracer.cc | 1 +
 paddle/fluid/imperative/layer.cc | 8 +-
 paddle/fluid/imperative/layout_autotune.cc | 6 +-
 paddle/fluid/imperative/layout_autotune.h | 2 +
 paddle/fluid/imperative/nccl_context.cc | 11 +-
 paddle/fluid/imperative/op_base.h | 1 +
 .../fluid/imperative/partial_grad_engine.cc | 1 +
 paddle/fluid/imperative/partial_grad_engine.h | 1 +
 paddle/fluid/imperative/prepared_operator.cc | 4 +-
 paddle/fluid/imperative/prepared_operator.h | 3 +-
 paddle/fluid/imperative/profiler.cc | 2 +
 paddle/fluid/imperative/py_layer_fwd.h | 6 +-
 paddle/fluid/imperative/reducer.cc | 12 +-
 paddle/fluid/imperative/reducer.h | 1 +
 .../imperative/tests/bkcl_context_test.cc | 4 +-
 .../imperative/tests/cncl_context_test.cc | 6 +-
 .../tests/heter_ccl_context_test.cc | 6 +-
 .../imperative/tests/nccl_context_test.cc | 6 +-
 paddle/fluid/imperative/tests/test_eager.cc | 5 +-
 .../tests/test_gradient_accmulator.cc | 4 +-
 paddle/fluid/imperative/tests/test_group.cc | 2 +-
 .../fluid/imperative/tests/test_prepare_op.cc | 2 +
 paddle/fluid/imperative/tracer.cc | 2 +
 paddle/fluid/imperative/tracer.h | 1 +
 paddle/fluid/imperative/var_helper.h | 1 +
 .../fluid/inference/analysis/analysis_pass.h | 1 +
 paddle/fluid/inference/analysis/analyzer.cc | 2 +
 paddle/fluid/inference/analysis/analyzer.h | 1 +
 .../inference/analysis/analyzer_tester.cc | 4 +-
 paddle/fluid/inference/analysis/dot.h | 1 +
 paddle/fluid/inference/analysis/dot_tester.cc | 5 +-
 paddle/fluid/inference/analysis/helper.h | 6 +-
 .../inference/analysis/ir_pass_manager.cc | 4 +-
 .../inference/analysis/ir_pass_manager.h | 1 +
 .../analysis/ir_passes/dlnne_subgraph_pass.cc | 41 +-
 .../analysis/ir_passes/lite_subgraph_pass.cc | 20 +-
 .../analysis/ir_passes/lite_subgraph_pass.h | 2 +
 .../ir_passes/lite_subgraph_pass_tester.cc | 5 +-
 .../analysis/passes/ir_analysis_pass.cc | 2 +
 .../analysis/passes/ir_analysis_pass.h | 1 +
 .../analysis/passes/ir_graph_build_pass.cc | 2 +
 .../analysis/passes/ir_graph_build_pass.h | 1 +
 .../passes/ir_graph_to_program_pass.cc | 1 +
 .../passes/ir_graph_to_program_pass.h | 1 +
 .../ir_params_sync_among_devices_pass.cc | 1 +
 .../analysis/passes/memory_optimize_pass.cc | 3 +-
 .../analysis/passes/memory_optimize_pass.h | 19 +-
 .../fluid/inference/analysis/passes/passes.cc | 1 +
 .../fluid/inference/analysis/passes/passes.h | 1 +
 paddle/fluid/inference/analysis/ut_helper.h | 2 +
 paddle/fluid/inference/api/analysis_config.cc | 6 +-
 .../fluid/inference/api/analysis_predictor.cc | 7 +-
 .../fluid/inference/api/analysis_predictor.h | 8 +-
 .../api/analysis_predictor_tester.cc | 2 +
 paddle/fluid/inference/api/api.cc | 1 +
 paddle/fluid/inference/api/api_impl.cc | 9 +-
 paddle/fluid/inference/api/api_impl.h | 1 +
 paddle/fluid/inference/api/api_tester.cc | 1 +
 .../api/demo_ci/onnxruntime_mobilenet_demo.cc | 2 +
 .../api/demo_ci/trt_mobilenet_demo.cc | 1 +
 paddle/fluid/inference/api/demo_ci/utils.h | 2 +
 .../fluid/inference/api/demo_ci/vis_demo.cc | 1 +
 .../api/demo_ci/windows_mobilenet.cc | 3 +-
 .../inference/api/details/zero_copy_tensor.cc | 10 +-
 paddle/fluid/inference/api/helper.cc | 1 +
 paddle/fluid/inference/api/helper.h | 6 +-
 paddle/fluid/inference/api/infer_context.h | 10 +-
 .../fluid/inference/api/mkldnn_quantizer.cc | 17 +-
 paddle/fluid/inference/api/mkldnn_quantizer.h | 1 +
 .../inference/api/mkldnn_quantizer_tester.cc | 3 +-
 .../inference/api/onnxruntime_predictor.h | 6 +-
 .../api/onnxruntime_predictor_tester.cc | 5 +-
 .../inference/api/paddle_analysis_config.h | 13 +-
 paddle/fluid/inference/api/paddle_api.h | 11 +-
 .../inference/api/paddle_infer_contrib.cc | 1 +
 .../inference/api/paddle_pass_builder.cc | 1 +
 paddle/fluid/inference/api/resource_manager.h | 1 +
 paddle/fluid/inference/capi/c_api.cc | 1 +
 paddle/fluid/inference/capi/c_api_internal.h | 1 +
 paddle/fluid/inference/capi/pd_config.cc | 1 +
 paddle/fluid/inference/capi/pd_predictor.cc | 1 +
 paddle/fluid/inference/capi/pd_tensor.cc | 1 +
 paddle/fluid/inference/capi_exp/lod_demo.cc | 2 +
 paddle/fluid/inference/capi_exp/pd_config.cc | 1 +
 .../fluid/inference/capi_exp/pd_predictor.cc | 1 +
 paddle/fluid/inference/capi_exp/pd_tensor.cc | 1 +
 paddle/fluid/inference/capi_exp/pd_utils.cc | 3 +-
 .../com_baidu_paddle_inference_Config.cpp | 3 +-
 .../com_baidu_paddle_inference_Predictor.cpp | 2 +
 .../com_baidu_paddle_inference_Tensor.cpp | 2 +
 .../javaapi/native/jni_convert_util.h | 5 +-
 paddle/fluid/inference/io.h | 1 +
 paddle/fluid/inference/lite/engine.cc | 1 +
 paddle/fluid/inference/lite/op_teller.cc | 3 +-
 paddle/fluid/inference/lite/op_teller.h | 1 +
 paddle/fluid/inference/lite/tensor_utils.cc | 6 +-
 .../fluid/inference/lite/test_engine_lite.cc | 6 +-
 .../fluid/inference/lite/test_tensor_utils.cc | 5 +-
 .../tensorrt/convert/activation_op.cc | 1 +
 .../tensorrt/convert/deformable_conv_op.cc | 1 +
 .../convert/flatten_contiguous_range_op.cc | 9 +-
 .../tensorrt/convert/group_norm_op.cc | 1 +
 .../tensorrt/convert/io_converter.cc | 19 +-
 .../inference/tensorrt/convert/io_converter.h | 1 +
 .../tensorrt/convert/multiclass_nms3_op.cc | 1 +
 .../tensorrt/convert/multiclass_nms_op.cc | 1 +
 .../inference/tensorrt/convert/op_converter.h | 11 +-
 .../inference/tensorrt/convert/softmax_op.cc | 1 +
 .../tensorrt/convert/test_activation_op.cc | 1 +
 .../tensorrt/convert/test_batch_norm_op.cc | 1 +
 .../tensorrt/convert/test_concat_op.cc | 1 +
 .../tensorrt/convert/test_conv2d_op.cc | 1 +
 .../tensorrt/convert/test_dropout_op.cc | 1 +
 .../tensorrt/convert/test_elementwise_op.cc | 1 +
 .../inference/tensorrt/convert/test_fc_op.cc | 1 +
 .../tensorrt/convert/test_io_converter.cc | 1 +
 .../tensorrt/convert/test_leaky_relu_op.cc | 1 +
 .../tensorrt/convert/test_mish_op.cc | 1 +
 .../inference/tensorrt/convert/test_mul_op.cc | 1 +
 .../convert/test_nearest_interp_v2_op.cc | 1 +
 .../tensorrt/convert/test_op_converter.cc | 3 +-
 .../inference/tensorrt/convert/test_pad_op.cc | 1 +
 .../tensorrt/convert/test_pool2d_op.cc | 2 +
 .../tensorrt/convert/test_prelu_op.cc | 1 +
 .../convert/test_shuffle_channel_op.cc | 1 +
 .../tensorrt/convert/test_softmax_op.cc | 1 +
 .../tensorrt/convert/test_split_op.cc | 1 +
 .../tensorrt/convert/test_swish_op.cc | 1 +
 .../inference/tensorrt/convert/unary_op.cc | 2 +
 .../inference/tensorrt/convert/yolo_box_op.cc | 1 +
 paddle/fluid/inference/tensorrt/engine.cc | 1 +
 paddle/fluid/inference/tensorrt/engine.h | 3 +-
 paddle/fluid/inference/tensorrt/helper.h | 2 +
 paddle/fluid/inference/tensorrt/op_teller.cc | 2 +
 paddle/fluid/inference/tensorrt/op_teller.h | 1 +
 .../plugin/anchor_generator_op_plugin.cu | 1 +
 .../plugin/deformable_conv_op_plugin.cu | 35 +-
 .../tensorrt/plugin/elementwise_op_plugin.cu | 19 +-
 .../tensorrt/plugin/elementwise_op_plugin.h | 1 +
 .../plugin/emb_eltwise_layernorm_plugin.cu | 11 +-
 .../tensorrt/plugin/gather_nd_op_plugin.h | 2 +
 .../tensorrt/plugin/gelu_op_plugin.cu | 30 +-
 .../tensorrt/plugin/gelu_op_plugin.h | 2 +
 .../tensorrt/plugin/hard_swish_op_plugin.cu | 1 +
 .../tensorrt/plugin/hard_swish_op_plugin.h | 2 +
 .../plugin/instance_norm_op_plugin.cu | 2 +
 .../tensorrt/plugin/layer_norm_op_plugin.cu | 2 +
 .../tensorrt/plugin/layer_norm_op_plugin.h | 1 +
 .../tensorrt/plugin/matmul_op_int8_plugin.h | 2 +-
 .../tensorrt/plugin/mish_op_plugin.cu | 37 +-
 .../tensorrt/plugin/mish_op_plugin.h | 2 +
 .../tensorrt/plugin/pool3d_op_plugin.cu | 9 +-
 .../tensorrt/plugin/pool3d_op_plugin.h | 2 +
 .../tensorrt/plugin/pool_op_plugin.cu | 9 +-
 .../tensorrt/plugin/pool_op_plugin.h | 2 +
 .../tensorrt/plugin/prelu_op_plugin.cu | 9 +-
 .../tensorrt/plugin/prelu_op_plugin.h | 2 +-
 .../tensorrt/plugin/qkv_to_context_plugin.cu | 19 +-
 .../tensorrt/plugin/recover_padding_plugin.h | 2 +-
 .../tensorrt/plugin/remove_padding_plugin.h | 2 +-
 .../tensorrt/plugin/roi_align_op_plugin.cu | 14 +-
 .../plugin/skip_layernorm_op_plugin.cu | 7 +-
 .../tensorrt/plugin/slice_op_plugin.cu | 16 +-
 .../tensorrt/plugin/split_op_plugin.cu | 2 +
 .../tensorrt/plugin/split_op_plugin.h | 2 +
 .../tensorrt/plugin/stack_op_plugin.cu | 6 +-
 .../tensorrt/plugin/stack_op_plugin.h | 2 +
 .../tensorrt/plugin/swish_op_plugin.cu | 15 +-
 .../tensorrt/plugin/test_split_plugin.cc | 1 +
 .../plugin/transformer_input_convert_plugin.h | 2 +-
 .../inference/tensorrt/plugin/trt_plugin.h | 1 +
 .../tensorrt/plugin/trt_plugin_utils.h | 1 +
 .../tensorrt/plugin/yolo_box_head_op_plugin.h | 1 +
 .../fluid/inference/tensorrt/test_tensorrt.cc | 1 +
 .../inference/tensorrt/trt_int8_calibrator.h | 1 +
 .../tests/api/analyzer_capi_exp_gpu_tester.cc | 2 +
 .../tests/api/analyzer_capi_exp_int_tester.cc | 2 +
 .../tests/api/analyzer_capi_exp_ner_tester.cc | 2 +
 .../api/analyzer_capi_exp_pd_config_tester.cc | 2 +
 .../api/analyzer_capi_exp_pd_tensor_tester.cc | 2 +
 .../analyzer_capi_exp_pd_threads_tester.cc | 2 +
 .../tests/api/analyzer_capi_exp_xpu_tester.cc | 2 +
 .../tests/api/analyzer_capi_gpu_tester.cc | 2 +
 .../tests/api/analyzer_capi_int_tester.cc | 2 +
 .../tests/api/analyzer_capi_ner_tester.cc | 2 +
 .../api/analyzer_capi_pd_tensor_tester.cc | 7 +-
 .../tests/api/analyzer_capi_tester.cc | 2 +
 .../tests/api/analyzer_capi_xpu_tester.cc | 2 +
 .../tests/api/analyzer_dam_tester.cc | 1 +
 ...nalyzer_detect_functional_mkldnn_tester.cc | 2 +
 .../tests/api/analyzer_detect_tester.cc | 2 +
 .../analyzer_image_classification_tester.cc | 1 +
 ...alyzer_int8_image_classification_tester.cc | 1 +
 .../analyzer_int8_object_detection_tester.cc | 1 +
 .../tests/api/analyzer_lac_tester.cc | 5 +-
 .../analyzer_lexical_analysis_gru_tester.cc | 22 +-
 .../tests/api/analyzer_mmp_tester.cc | 9 +-
 .../api/analyzer_paddle_tensor_tester.cc | 3 +-
 ...lyzer_quant_image_classification_tester.cc | 1 +
 .../tests/api/analyzer_seq_conv1_tester.cc | 5 +-
 ...yzer_seq_pool1_compare_determine_tester.cc | 1 +
 .../api/analyzer_seq_pool1_compare_tester.cc | 1 +
 ...seq_pool1_fuse_compare_zero_copy_tester.cc | 1 +
 .../analyzer_seq_pool1_fuse_statis_tester.cc | 1 +
 .../api/analyzer_seq_pool1_profile_tester.cc | 1 +
 .../api/analyzer_seq_pool1_tester_helper.h | 1 +
 .../api/analyzer_transformer_tester_helper.h | 1 +
 .../tests/api/analyzer_vis_tester.cc | 2 +
 .../tests/api/analyzer_vit_ocr_tester.cc | 1 +
 .../api/analyzer_zerocopy_tensor_tester.cc | 3 +-
 .../inference/tests/api/config_printer.h | 1 +
 .../tests/api/ipu_resnet50_fp16_test.cc | 1 +
 .../inference/tests/api/ipu_resnet50_test.cc | 1 +
 .../tests/api/ipu_word2vec_sample.cc | 2 +-
 .../tests/api/lite_mul_model_test.cc | 3 +-
 .../inference/tests/api/lite_resnet50_test.cc | 1 +
 .../api/mkldnn_quantizer_config_tester.cc | 8 +-
 .../paddle_infer_api_copy_tensor_tester.cc | 2 +
 .../api/paddle_infer_api_errors_tester.cc | 1 -
 .../tests/api/paddle_infer_api_test.cc | 3 +-
 .../fluid/inference/tests/api/tester_helper.h | 4 +-
 .../tests/api/trt_cascade_rcnn_test.cc | 2 +-
 ...e_ernie_fp16_serialize_deserialize_test.cc | 2 +-
 ..._shape_ernie_serialize_deserialize_test.cc | 2 +-
 ...c_shape_ernie_serialize_deserialize_test.h | 2 +-
 .../tests/api/trt_dynamic_shape_ernie_test.cc | 79 +-
 .../tests/api/trt_dynamic_shape_test.cc | 2 +-
 ...rt_dynamic_shape_transformer_prune_test.cc | 2 +-
 .../inference/tests/api/trt_fc_prelu_test.cc | 2 +-
 .../api/trt_instance_norm_converter_test.cc | 2 +-
 .../inference/tests/api/trt_mobilenet_test.cc | 2 +-
 .../tests/api/trt_quant_int8_test.cc | 3 +-
 .../api/trt_quant_int8_yolov3_r50_test.cc | 3 +-
 .../inference/tests/api/trt_resnet50_test.cc | 2 +-
 .../inference/tests/api/trt_resnext_test.cc | 2 +-
 .../tests/api/trt_split_converter_test.cc | 2 +-
 .../inference/tests/api/trt_test_helper.h | 2 +-
 .../inference/tests/infer_ut/test_suite.h | 8 +-
 .../fluid/inference/utils/benchmark_tester.cc | 3 +-
 paddle/fluid/inference/utils/io_utils.cc | 5 +-
 .../fluid/inference/utils/io_utils_tester.cc | 4 +-
 paddle/fluid/inference/utils/singleton.h | 1 +
 .../inference/utils/table_printer_tester.cc | 3 +-
 .../memory/allocation/allocator_facade.cc | 1 +
 .../memory/allocation/allocator_facade.h | 1 +
 .../allocator_facade_abs_flags_test.cc | 3 +-
 .../auto_growth_best_fit_allocator.cc | 1 +
 ...o_growth_best_fit_allocator_facade_test.cc | 2 +
 .../auto_growth_best_fit_allocator_test.cc | 6 +-
 .../memory/allocation/best_fit_allocator.cc | 1 +
 .../memory/allocation/best_fit_allocator.h | 1 +
 .../fluid/memory/allocation/cuda_allocator.cc | 1 +
 .../fluid/memory/allocation/cuda_allocator.h | 1 +
 .../memory/allocation/cuda_ipc_allocator.cc | 3 +-
 .../allocation/cuda_managed_allocator.cc | 1 +
 .../allocation/cuda_virtual_mem_allocator.cc | 1 +
 .../allocation/cuda_virtual_mem_allocator.h | 2 +
 .../memory/allocation/custom_allocator.cc | 1 +
 .../memory/allocation/custom_allocator.h | 1 +
 .../fluid/memory/allocation/mmap_allocator.cc | 13 +-
 .../allocation/naive_best_fit_allocator.cc | 1 -
 .../allocation/naive_best_fit_allocator.h | 1 +
 .../fluid/memory/allocation/npu_allocator.cc | 2 +
 .../fluid/memory/allocation/npu_allocator.h | 1 +
 .../memory/allocation/pinned_allocator.cc | 1 +
 .../memory/allocation/retry_allocator.cc | 5 +-
 .../memory/allocation/retry_allocator_test.cc | 1 +
 .../allocation/stream_safe_cuda_allocator.cc | 1 +
 .../allocation/stream_safe_cuda_allocator.h | 1 +
 .../allocation/thread_local_allocator_test.cc | 2 +
 ...l_memory_auto_growth_best_fit_allocator.cc | 3 +-
 paddle/fluid/memory/buffer.h | 1 +
 .../fluid/memory/detail/system_allocator.cc | 30 +-
 paddle/fluid/memory/detail/system_allocator.h | 1 +
 paddle/fluid/memory/get_base_ptr_test.cu | 1 +
 paddle/fluid/memory/malloc.h | 4 +-
 paddle/fluid/memory/memory_stats_test.cc | 3 +-
 paddle/fluid/memory/pinned_memory_test.cu | 2 +-
 paddle/fluid/memory/stats.h | 20 +-
 paddle/fluid/memory/stats_test.cc | 2 +
 .../memory/stream_safe_cuda_alloc_test.cu | 7 +-
 paddle/fluid/operators/abs_op.cc | 1 +
 .../fluid/operators/activation_cudnn_op.cu.cc | 2 +-
 paddle/fluid/operators/activation_op.cc | 50 +-
 paddle/fluid/operators/activation_op.h | 10 +-
 paddle/fluid/operators/activation_op_xpu.cc | 20 +-
 .../operators/add_position_encoding_op.cc | 1 +
 paddle/fluid/operators/addmm_op.cc | 1 +
 paddle/fluid/operators/affine_channel_op.cc | 1 +
 paddle/fluid/operators/affine_channel_op.cu | 40 +-
 .../fluid/operators/affine_channel_op_xpu.cc | 1 +
 .../operators/affine_grid_cudnn_op.cu.cc | 5 +-
 paddle/fluid/operators/affine_grid_op.cc | 2 +
 paddle/fluid/operators/affine_grid_op.cu | 4 +-
 paddle/fluid/operators/affine_grid_op.h | 1 +
 .../amp/alloc_float_status_op_npu.cc | 1 +
 .../amp/check_finite_and_unscale_op.cu | 8 +-
 .../check_finite_and_unscale_op_npu_test.cc | 1 +
 .../amp/check_finite_and_unscale_op_xpu.cc | 61 +-
 .../amp/clear_float_status_op_npu.cc | 1 +
 .../operators/amp/get_float_status_op_npu.cc | 1 +
 .../operators/amp/update_loss_scaling_op.cc | 2 +
 .../operators/amp/update_loss_scaling_op.cu | 1 +
 .../operators/amp/update_loss_scaling_op.h | 1 +
 .../amp/update_loss_scaling_op_npu.cc | 3 +-
 .../amp/update_loss_scaling_op_xpu.cc | 12 +-
 paddle/fluid/operators/angle_op.h | 2 +-
 paddle/fluid/operators/arg_max_op.cc | 33 +-
 paddle/fluid/operators/arg_min_max_op_base.h | 1 +
 paddle/fluid/operators/arg_min_op.cc | 30 +-
 paddle/fluid/operators/array_operator.h | 1 +
 .../fluid/operators/array_to_lod_tensor_op.cc | 1 +
 paddle/fluid/operators/ascend_trigger_op.h | 1 +
 paddle/fluid/operators/assign_op_xpu.cc | 4 +-
 paddle/fluid/operators/attention_lstm_op.cc | 34 +-
 .../fluid/operators/average_accumulates_op.h | 1 +
 paddle/fluid/operators/batch_fc_op.cc | 6 +-
 paddle/fluid/operators/batch_fc_op.cu | 1 +
 paddle/fluid/operators/batch_norm_op.cc | 11 +-
 paddle/fluid/operators/batch_norm_op.h | 1 +
 paddle/fluid/operators/batch_norm_op_mlu.cc | 2 +-
 paddle/fluid/operators/batch_norm_op_npu.cc | 14 +-
 paddle/fluid/operators/batch_norm_op_xpu.cc | 44 +-
 paddle/fluid/operators/batch_size_like.h | 1 +
 .../fluid/operators/beam_search_decode_op.cc | 3 +-
 .../operators/beam_search_decode_op_test.cc | 8 +-
 paddle/fluid/operators/beam_search_op.cc | 1 +
 paddle/fluid/operators/beam_search_op.cu.cc | 1 +
 paddle/fluid/operators/beam_search_op_npu.cc | 2 +-
 paddle/fluid/operators/benchmark/op_tester.cc | 2 +
 paddle/fluid/operators/benchmark/op_tester.h | 1 +
 .../operators/benchmark/op_tester_config.cc | 2 +
 paddle/fluid/operators/bilateral_slice_op.cc | 2 +
 paddle/fluid/operators/bilateral_slice_op.cu | 41 +-
 paddle/fluid/operators/bilateral_slice_op.h | 1 +
 paddle/fluid/operators/bmm_op.cc | 1 +
 paddle/fluid/operators/bmm_op.h | 1 +
 paddle/fluid/operators/bmm_op_xpu.cc | 2 +-
 paddle/fluid/operators/bpr_loss_op.cc | 1 +
 paddle/fluid/operators/bpr_loss_op.h | 5 +-
 .../fluid/operators/broadcast_tensors_op.cc | 2 +-
 paddle/fluid/operators/cast_op.cc | 2 +
 paddle/fluid/operators/cast_op.h | 1 -
 paddle/fluid/operators/cast_op_xpu.cc | 3 +-
 paddle/fluid/operators/center_loss_op.cc | 1 +
 paddle/fluid/operators/center_loss_op.cu | 1 +
 paddle/fluid/operators/center_loss_op.h | 1 +
 paddle/fluid/operators/chunk_eval_op.cc | 12 +-
 .../operators/cinn/cinn_instruction_run_op.cc | 13 +-
 .../cinn/cinn_instruction_run_op.cu.cc | 1 +
 .../operators/cinn/cinn_instruction_run_op.h | 1 +
 .../cinn/cinn_instruction_run_op_test.cc | 2 +
 .../operators/cinn/cinn_launch_context.cc | 6 +-
 .../operators/cinn/cinn_launch_context.h | 1 +
 .../cinn/cinn_launch_context_test.cc | 6 +-
 paddle/fluid/operators/cinn/cinn_launch_op.cc | 2 +
 .../fluid/operators/cinn/cinn_launch_op.cu.cc | 1 +
 paddle/fluid/operators/cinn/cinn_launch_op.h | 20 +-
 .../operators/cinn/cinn_launch_op_test.cc | 3 +
 paddle/fluid/operators/cinn/cinn_op_helper.cc | 1 +
 paddle/fluid/operators/cinn/cinn_op_helper.h | 1 +
 paddle/fluid/operators/cinn/test_helper.h | 1 +
 .../fluid/operators/class_center_sample_op.cu | 3 +
 .../fluid/operators/class_center_sample_op.h | 1 +
 paddle/fluid/operators/clip_by_norm_op.h | 9 +-
 paddle/fluid/operators/clip_by_norm_op_xpu.cc | 3 +-
 paddle/fluid/operators/clip_op.cc | 20 +-
 paddle/fluid/operators/clip_op_xpu.cc | 9 +-
 paddle/fluid/operators/coalesce_tensor_op.cc | 21 +-
 .../operators/collective/allreduce_op.cc | 4 +-
 .../fluid/operators/collective/barrier_op.h | 1 +
 .../operators/collective/broadcast_op.cc | 1 +
 .../operators/collective/c_allgather_op.cc | 5 +-
 .../operators/collective/c_allgather_op.h | 1 +
 .../collective/c_allgather_op_npu.cc | 4 +-
 .../collective/c_allgather_op_npu_test.cc | 7 +-
 .../collective/c_allreduce_max_op_npu_test.cc | 7 +-
 .../operators/collective/c_allreduce_op.h | 10 +-
 .../collective/c_allreduce_sum_op_npu_test.cc | 7 +-
 .../operators/collective/c_broadcast_op.h | 1 +
 .../collective/c_broadcast_op_npu_test.cc | 7 +-
 .../collective/c_comm_init_all_op.cc | 7 +-
 .../collective/c_comm_init_multitrainer_op.cc | 1 +
 .../operators/collective/c_comm_init_op.cc | 5 +-
 .../fluid/operators/collective/c_concat_op.cc | 14 +-
 .../operators/collective/c_concat_op.cu.cc | 3 +-
 .../operators/collective/c_gen_bkcl_id_op.cc | 3 +-
 .../operators/collective/c_gen_cncl_id_op.cc | 4 +-
 .../operators/collective/c_gen_hccl_id_op.cc | 5 +-
 .../operators/collective/c_gen_nccl_id_op.cc | 3 +-
 .../fluid/operators/collective/c_reduce_op.h | 17 +-
 .../collective/c_reduce_sum_op_npu_test.cc | 7 +-
 .../collective/c_reducescatter_op_npu_test.cc | 7 +-
 .../fluid/operators/collective/c_scatter_op.h | 1 +
 .../c_softmax_with_cross_entropy_op.cu | 16 +-
 .../fluid/operators/collective/c_split_op.cc | 14 +-
 .../fluid/operators/collective/c_split_op.cu | 9 +-
 .../c_sync_comm_stream_op_npu_test.cc | 5 +-
 .../collective/checknumeric_npu_test.cc | 7 +-
 .../operators/collective/gen_bkcl_id_op.cc | 10 +-
 .../operators/collective/gen_hccl_id_op.cc | 10 +-
 .../collective/gen_hccl_id_op_helper.cc | 1 +
 .../operators/collective/gen_nccl_id_op.cc | 7 +-
 .../collective/partial_allgather_op.cc | 5 +-
 .../collective/partial_allgather_op_npu.cc | 2 +-
 .../operators/collective/partial_recv_op.cc | 1 +
 .../collective/partial_recv_op_npu.cc | 6 +-
 .../collective/partial_send_op_npu.cc | 6 +-
 .../fluid/operators/collective/recv_v2_op.cc | 1 +
 .../operators/collective/recv_v2_op_npu.cc | 5 +-
 .../collective/recv_v2_op_npu_test.cc | 7 +-
 .../operators/collective/send_v2_op_npu.cc | 5 +-
 .../collective/send_v2_op_npu_test.cc | 8 +-
 .../operators/common_infer_shape_functions.cc | 13 +-
 paddle/fluid/operators/complex_op.cc | 1 +
 paddle/fluid/operators/complex_view_op.cc | 1 +
 paddle/fluid/operators/complex_view_op.cu | 3 +-
 paddle/fluid/operators/concat_op.cc | 3 +-
 paddle/fluid/operators/concat_op.h | 2 +-
 paddle/fluid/operators/concat_op_mlu.cc | 9 +-
 paddle/fluid/operators/concat_op_xpu.cc | 37 +-
 paddle/fluid/operators/conj_op.cc | 5 +-
 paddle/fluid/operators/conj_op.cu | 5 +-
 .../fluid/operators/controlflow/bitwise_op.cc | 1 +
 .../fluid/operators/controlflow/compare_op.cc | 14 +-
 .../controlflow/conditional_block_op.h | 9 +-
 .../fluid/operators/controlflow/fetch_op.cc | 20 +-
 .../operators/controlflow/fetch_v2_op.cc | 20 +-
 .../operators/controlflow/get_places_op.cc | 7 +-
 .../fluid/operators/controlflow/logical_op.cc | 1 +
 .../fluid/operators/controlflow/op_variant.h | 5 +-
 .../controlflow/recurrent_op_helper.cc | 1 +
 .../fluid/operators/controlflow/while_op.cc | 11 +-
 .../operators/controlflow/while_op_helper.cc | 1 +
 paddle/fluid/operators/conv_base_helper.h | 1 +
 paddle/fluid/operators/conv_cudnn_op_cache.h | 1 +
 paddle/fluid/operators/conv_op.cc | 34 +-
 paddle/fluid/operators/conv_op.h | 1 +
 paddle/fluid/operators/conv_op_npu.cc | 98 +-
 paddle/fluid/operators/conv_op_xpu.cc | 3 +-
 paddle/fluid/operators/conv_shift_op.cc | 2 +
 paddle/fluid/operators/conv_transpose_op.cc | 1 +
 .../fluid/operators/conv_transpose_op_npu.cc | 40 +-
 .../fluid/operators/conv_transpose_op_xpu.cc | 4 +-
 paddle/fluid/operators/correlation_op.cc | 1 +
 paddle/fluid/operators/correlation_op.cu | 31 +-
 paddle/fluid/operators/cos_sim_op.cc | 1 +
 paddle/fluid/operators/crf_decoding_op.cc | 11 +-
 paddle/fluid/operators/crf_decoding_op.h | 3 +-
 paddle/fluid/operators/crop_op.cc | 1 +
 paddle/fluid/operators/crop_op.h | 23 +-
 paddle/fluid/operators/crop_tensor_op.cc | 1 +
 paddle/fluid/operators/crop_tensor_op.h | 23 +-
 paddle/fluid/operators/cross_entropy_op.cc | 1 +
 paddle/fluid/operators/cross_op.cc | 3 +-
 paddle/fluid/operators/ctc_align_op.cu | 10 +-
 paddle/fluid/operators/ctc_align_op.h | 2 +
 paddle/fluid/operators/cudnn_lstm_cache.h | 1 +
 paddle/fluid/operators/cudnn_lstm_op.cc | 1 +
 paddle/fluid/operators/cudnn_rnn_cache.h | 1 +
 paddle/fluid/operators/cumsum_op.cc | 15 +-
 paddle/fluid/operators/cvm_op.cc | 2 +
 paddle/fluid/operators/data_norm_op.cc | 32 +-
 paddle/fluid/operators/data_norm_op.cu | 11 +-
 paddle/fluid/operators/decode_jpeg_op.cu | 1 +
 paddle/fluid/operators/deformable_conv_op.cc | 1 +
 .../fluid/operators/deformable_conv_op_xpu.cc | 46 +-
 .../fluid/operators/deformable_conv_v1_op.cc | 1 +
 .../operators/deformable_psroi_pooling_op.cc | 13 +-
 .../operators/deformable_psroi_pooling_op.cu | 2 +
 .../operators/deformable_psroi_pooling_op.h | 1 +
 paddle/fluid/operators/dequantize_op.cc | 11 +-
 paddle/fluid/operators/dequantize_op.h | 1 +
 paddle/fluid/operators/dequeue_op.cc | 1 +
 .../operators/detection/anchor_generator_op.h | 1 +
 paddle/fluid/operators/detection/bbox_util.h | 6 +-
 .../fluid/operators/detection/box_clip_op.cc | 1 +
 .../fluid/operators/detection/box_clip_op.cu | 1 +
 .../fluid/operators/detection/box_clip_op.h | 1 +
 .../fluid/operators/detection/box_coder_op.cc | 1 +
 .../fluid/operators/detection/box_coder_op.cu | 1 +
 .../fluid/operators/detection/box_coder_op.h | 1 +
 .../detection/box_decoder_and_assign_op.h | 1 +
 .../detection/collect_fpn_proposals_op.cc | 1 +
 .../detection/collect_fpn_proposals_op.cu | 1 +
 .../detection/collect_fpn_proposals_op.h | 1 +
 .../detection/density_prior_box_op.h | 1 +
 .../detection/distribute_fpn_proposals_op.cc | 1 +
 .../detection/distribute_fpn_proposals_op.cu | 1 +
 .../detection/distribute_fpn_proposals_op.h | 1 +
 .../detection/generate_mask_labels_op.cc | 2 +
 .../detection/generate_proposal_labels_op.cc | 2 +
 .../detection/generate_proposals_op.cc | 1 +
 .../detection/generate_proposals_op.cu | 2 +
 .../detection/generate_proposals_v2_op.cc | 1 +
 .../detection/generate_proposals_v2_op.cu | 2 +
 paddle/fluid/operators/detection/gpc.cc | 1 +
 .../detection/locality_aware_nms_op.cc | 22 +-
 paddle/fluid/operators/detection/mask_util.cc | 2 +
 paddle/fluid/operators/detection/mask_util.h | 1 +
 .../operators/detection/mask_util_test.cc | 2 +
 .../operators/detection/matrix_nms_op.cc | 7 +-
 .../operators/detection/multiclass_nms_op.cc | 26 +-
 paddle/fluid/operators/detection/nms_op.cc | 1 +
 paddle/fluid/operators/detection/nms_op.cu | 1 +
 paddle/fluid/operators/detection/nms_util.h | 1 +
 paddle/fluid/operators/detection/poly_util.cc | 3 +-
 paddle/fluid/operators/detection/poly_util.h | 1 +
 .../fluid/operators/detection/prior_box_op.h | 1 +
 .../retinanet_detection_output_op.cc | 34 +-
 .../detection/roi_perspective_transform_op.cc | 12 +-
 .../detection/roi_perspective_transform_op.cu | 7 +-
 .../detection/rpn_target_assign_op.cc | 1 +
 .../detection/sigmoid_focal_loss_op.cc | 1 +
 .../detection/sigmoid_focal_loss_op.h | 1 +
 .../fluid/operators/detection/yolo_box_op.cc | 20 +-
 .../operators/detection/yolov3_loss_op.cc | 1 +
 paddle/fluid/operators/detection_map_op.cc | 1 +
 paddle/fluid/operators/detection_map_op.h | 1 +
 paddle/fluid/operators/determinant_op.cc | 1 +
 paddle/fluid/operators/determinant_op.h | 1 +
 paddle/fluid/operators/dgc_clip_by_norm_op.cc | 4 +-
 paddle/fluid/operators/dgc_op.cc | 2 +
 paddle/fluid/operators/dgc_op.h | 12 +-
 paddle/fluid/operators/diag_embed_op.cu | 1 +
 paddle/fluid/operators/diag_embed_op.h | 1 +
 paddle/fluid/operators/dirichlet_op.h | 1 +
 paddle/fluid/operators/dist_op.cc | 1 +
 .../fluid/operators/dlnne/dlnne_engine_op.h | 12 +-
 .../operators/dlnne/dlnne_engine_op_test.cc | 2 +
 paddle/fluid/operators/dropout_impl.cu.h | 2 +
 paddle/fluid/operators/dropout_op.cc | 1 +
 paddle/fluid/operators/dropout_op_xpu.cc | 1 +
 paddle/fluid/operators/edit_distance_op.cc | 13 +-
 paddle/fluid/operators/edit_distance_op.cu | 1 +
 paddle/fluid/operators/edit_distance_op.h | 1 +
 paddle/fluid/operators/eig_op.cc | 11 +-
 paddle/fluid/operators/eig_op.h | 2 +
 paddle/fluid/operators/eigvals_op.cc | 1 +
 paddle/fluid/operators/eigvals_op.h | 17 +-
 paddle/fluid/operators/einsum_op.cc | 1 +
 .../elementwise/elementwise_add_op_xpu.cc | 2 +-
 .../elementwise/elementwise_div_op.cc | 1 +
 .../elementwise/elementwise_div_op.h | 1 +
 .../elementwise/elementwise_heaviside_op.cc | 1 +
 .../operators/elementwise/elementwise_mlu.h | 1 +
 .../elementwise/elementwise_mod_op_xpu.cc | 2 +-
 .../elementwise/elementwise_mul_op.cc | 2 +
 .../elementwise/elementwise_mul_op.h | 2 +-
 .../elementwise/elementwise_op_function.h | 115 +--
 .../operators/elementwise/elementwise_xpu.h | 6 +-
 .../test_elementwise_div_grad_grad.cc | 12 +-
 .../test_elementwise_op_grad_grad.h | 1 +
 paddle/fluid/operators/empty_op.cc | 3 +-
 paddle/fluid/operators/expand_as_op.cc | 1 +
 paddle/fluid/operators/expand_as_v2_op.cc | 9 +-
 paddle/fluid/operators/expand_as_v2_op_npu.cc | 9 +-
 paddle/fluid/operators/expand_as_v2_op_xpu.cc | 9 +-
 paddle/fluid/operators/expand_op.cc | 1 +
 paddle/fluid/operators/expand_op.h | 13 +-
 paddle/fluid/operators/expand_v2_op.cc | 1 +
 paddle/fluid/operators/expand_v2_op_npu.cc | 2 +-
 paddle/fluid/operators/expand_v2_op_xpu.cc | 11 +-
 paddle/fluid/operators/fake_dequantize_op.cc | 2 +
 .../fluid/operators/fake_dequantize_op.cu.h | 8 +-
 paddle/fluid/operators/fake_dequantize_op.h | 1 +
 paddle/fluid/operators/fake_quantize_op.cc | 10 +-
 paddle/fluid/operators/fake_quantize_op.cu.h | 39 +-
 paddle/fluid/operators/fake_quantize_op.h | 1 +
 paddle/fluid/operators/fc_op.cc | 1 +
 paddle/fluid/operators/fc_op.h | 1 +
 .../fluid/operators/fill_any_like_op_xpu.cc | 1 -
 paddle/fluid/operators/fill_constant_op.cc | 2 +
 .../fluid/operators/fill_constant_op_npu.cc | 7 +-
 .../fluid/operators/fill_diagonal_tensor_op.h | 1 +
 paddle/fluid/operators/fill_op.cc | 1 +
 paddle/fluid/operators/fill_op.h | 2 +-
 paddle/fluid/operators/fill_zeros_like_op.cc | 1 +
 .../fluid/operators/fill_zeros_like_op.cu.cc | 1 +
 paddle/fluid/operators/filter_by_instag_op.cc | 1 +
 paddle/fluid/operators/filter_by_instag_op.cu | 8 +-
 paddle/fluid/operators/filter_by_instag_op.h | 1 +
 paddle/fluid/operators/flatten_op.cc | 2 +
 paddle/fluid/operators/flatten_op.h | 1 +
 paddle/fluid/operators/flip_op.cc | 13 +-
 paddle/fluid/operators/fold_op.h | 1 +
 paddle/fluid/operators/frame_op.cc | 9 +-
 paddle/fluid/operators/fsp_op.cc | 1 +
 .../operators/fused/attention_layer_norm.h | 9 +-
 .../fluid/operators/fused/attn_bias_add.cu.h | 32 +-
 paddle/fluid/operators/fused/attn_gemm.h | 6 +-
 .../fluid/operators/fused/conv_fusion_op.cc | 1 +
 .../fluid/operators/fused/conv_fusion_op.cu | 1 +
 .../operators/fused/cudnn_bn_add_relu_test.cc | 42 +-
 .../operators/fused/cudnn_fusion_helper.h | 1 +
 .../operators/fused/cudnn_norm_conv_test.cc | 7 +-
 paddle/fluid/operators/fused/fmha_ref.h | 5 +-
 .../operators/fused/fused_attention_op.cc | 14 +-
 .../operators/fused/fused_attention_op.cu | 22 +-
 ...sed_bias_dropout_residual_layer_norm_op.cc | 1 +
 ...sed_bias_dropout_residual_layer_norm_op.cu | 2 +
 .../operators/fused/fused_bn_activation_op.cc | 68 +-
 .../operators/fused/fused_bn_activation_op.cu | 16 +-
 .../operators/fused/fused_bn_activation_op.h | 1 +
 .../fused/fused_bn_add_activation_op.cc | 32 +-
 .../fused/fused_bn_add_activation_op.cu | 6 +-
 .../fused/fused_bn_add_activation_op.h | 1 +
 .../operators/fused/fused_dropout_act_bias.h | 44 +-
 .../operators/fused/fused_dropout_helper.h | 6 +-
 .../fused/fused_elemwise_activation_op.cc | 1 +
 .../fused/fused_elemwise_activation_op.h | 16 +-
 .../fused_embedding_eltwise_layernorm_op.cc | 1 +
 .../fused_embedding_eltwise_layernorm_op.cu | 2 +
 .../fused/fused_embedding_fc_lstm_op.cc | 41 +-
 .../fused/fused_embedding_seq_pool_op.cc | 2 +
 .../fused_fc_elementwise_layernorm_op.cu | 22 +-
 .../operators/fused/fused_feedforward_op.cc | 1 +
 .../operators/fused/fused_feedforward_op.cu | 32 +-
 .../fused/fused_gate_attention_op.cc | 1 +
 .../fused/fused_gate_attention_op.cu | 6 +-
 .../operators/fused/fused_gemm_epilogue_op.cc | 6 +-
 .../operators/fused/fused_gemm_epilogue_op.h | 2 +
 .../fused_layernorm_residual_dropout_bias.h | 42 +-
 .../fused/fused_multi_transformer_op.cc | 14 +-
 .../fused/fused_multi_transformer_op.cu | 17 +-
 .../fused/fused_residual_dropout_bias.h | 42 +-
 .../operators/fused/fused_seqpool_cvm_op.cc | 8 +-
 .../operators/fused/fused_seqpool_cvm_op.cu | 1 +
 .../operators/fused/fused_seqpool_cvm_op.h | 1 +
 .../operators/fused/fused_softmax_mask.cu.h | 7 +-
 .../operators/fused/fused_transformer_op.cc | 5 +-
 .../operators/fused/fused_transformer_op.h | 4 +-
 .../fused/fusion_conv_inception_op.cc | 6 +-
 .../operators/fused/fusion_group_op.cu.cc | 1 +
 .../fluid/operators/fused/fusion_group_op.h | 1 +
 paddle/fluid/operators/fused/fusion_gru_op.cc | 2 +
 .../fluid/operators/fused/fusion_lstm_op.cc | 2 +
 .../fused/fusion_repeated_fc_relu_op.cc | 11 +-
 .../fused/fusion_seqconv_eltadd_relu_op.cc | 2 +
 .../fused/fusion_seqexpand_concat_fc_op.cc | 7 +-
 .../fused/fusion_seqpool_concat_op.cc | 20 +-
 .../fused/fusion_seqpool_cvm_concat_op.cc | 25 +-
 .../fused/fusion_squared_mat_sub_op.cc | 2 +
 .../fusion_transpose_flatten_concat_op.cc | 2 +
 .../fusion_transpose_flatten_concat_op.cu.cc | 1 +
 .../fusion_transpose_flatten_concat_op.h | 1 +
 .../fused/mkldnn/multi_gru_mkldnn_op.cc | 3 +-
 paddle/fluid/operators/fused/multi_gru_op.cc | 1 +
 paddle/fluid/operators/fused/multi_gru_op.h | 2 +-
 .../operators/fused/multihead_matmul_op.cc | 1 +
 .../operators/fused/multihead_matmul_op.cu | 14 +-
 .../fluid/operators/fused/resnet_unit_op.cc | 33 +-
 .../operators/fused/skip_layernorm_op.cc | 1 +
 .../operators/fused/skip_layernorm_op.cu | 2 +
 .../fluid/operators/fused_softmax_mask_op.cc | 1 +
 .../fluid/operators/fused_softmax_mask_op.cu | 1 +
 .../fused_softmax_mask_upper_triangle_op.cc | 1 +
 .../fused_softmax_mask_upper_triangle_op.cu | 127 +--
 paddle/fluid/operators/gather_op.cc | 8 +-
 paddle/fluid/operators/gather_op_xpu.cc | 9 +-
 .../fluid/operators/gather_scatter_kernel.cu | 9 +-
 paddle/fluid/operators/gather_test.cc | 3 +-
 paddle/fluid/operators/gaussian_random_op.cu | 1 +
 .../fluid/operators/gaussian_random_op_xpu.cc | 1 +
 paddle/fluid/operators/gelu_op.cc | 1 +
 paddle/fluid/operators/gelu_op_xpu.cc | 1 +
 .../fluid/operators/graph_khop_sampler_op.cc | 9 +-
 .../fluid/operators/graph_khop_sampler_op.cu | 99 +-
 .../fluid/operators/graph_khop_sampler_op.h | 2 +
 paddle/fluid/operators/group_norm_op.cc | 1 +
 paddle/fluid/operators/group_norm_op.cu | 28 +-
 paddle/fluid/operators/group_norm_op.h | 1 +
 paddle/fluid/operators/group_norm_op_npu.cc | 3 +-
 paddle/fluid/operators/gru_op.cc | 2 +
 paddle/fluid/operators/gru_op.h | 1 +
 paddle/fluid/operators/gru_unit_op.cc | 1 +
 paddle/fluid/operators/gru_unit_op.h | 6 +-
 paddle/fluid/operators/hinge_loss_op.cc | 1 +
 paddle/fluid/operators/huber_loss_op_xpu.cc | 9 +-
 paddle/fluid/operators/im2sequence_op.cc | 1 +
 paddle/fluid/operators/im2sequence_op.h | 1 +
 paddle/fluid/operators/index_impl.cu.h | 13 +-
 paddle/fluid/operators/index_sample_op.cc | 4 +-
 paddle/fluid/operators/index_select_op.h | 1 +
 paddle/fluid/operators/inplace_abn_op.cc | 34 +-
 paddle/fluid/operators/inplace_abn_op.cu | 5 +-
 paddle/fluid/operators/inplace_abn_op.h | 1 +
 paddle/fluid/operators/instance_norm_op.cc | 20 +-
 paddle/fluid/operators/instance_norm_op.h | 1 +
 paddle/fluid/operators/interpolate_op.cc | 13 +-
 paddle/fluid/operators/interpolate_op.cu | 79 +-
 paddle/fluid/operators/interpolate_op.h | 43 +-
 paddle/fluid/operators/interpolate_op_npu.cc | 3 +-
 paddle/fluid/operators/interpolate_op_xpu.cc | 18 +-
 paddle/fluid/operators/interpolate_v2_op.cc | 27 +-
 .../fluid/operators/interpolate_v2_op_npu.cc | 3 +-
 .../fluid/operators/interpolate_v2_op_xpu.cc | 18 +-
 paddle/fluid/operators/inverse_op.cc | 1 +
 paddle/fluid/operators/isfinite_op.cc | 19 +-
 paddle/fluid/operators/isfinite_op.cu | 10 +-
 paddle/fluid/operators/jit/benchmark.cc | 5 +-
 paddle/fluid/operators/jit/gen/act.cc | 5 +-
paddle/fluid/operators/jit/gen/jitcode.h | 1 + paddle/fluid/operators/jit/gen/matmul.cc | 27 +- paddle/fluid/operators/jit/gen/matmul.h | 10 +- paddle/fluid/operators/jit/gen/seqpool.cc | 27 +- paddle/fluid/operators/jit/gen_base.cc | 1 + paddle/fluid/operators/jit/gen_base.h | 2 +- paddle/fluid/operators/jit/helper.cc | 11 +- paddle/fluid/operators/jit/kernel_base.h | 1 + paddle/fluid/operators/jit/kernel_key.cc | 1 + .../jit/more/intrinsic/crf_decoding.cc | 2 + .../jit/more/intrinsic/layer_norm.cc | 2 + paddle/fluid/operators/jit/more/mix/mix.cc | 1 + paddle/fluid/operators/jit/more/mkl/mkl.cc | 1 + paddle/fluid/operators/jit/more/mkl/mkl.h | 31 +- paddle/fluid/operators/jit/refer/refer.cc | 1 + paddle/fluid/operators/jit/refer/refer.h | 20 +- paddle/fluid/operators/jit/registry.h | 1 + paddle/fluid/operators/jit/test.cc | 244 ++--- .../kernel_primitives/kernel_primitives.h | 2 +- paddle/fluid/operators/kldiv_loss_op.cc | 1 + paddle/fluid/operators/kldiv_loss_op_npu.cc | 1 + paddle/fluid/operators/kthvalue_op.cc | 1 + paddle/fluid/operators/l1_norm_op.cc | 1 + paddle/fluid/operators/label_smooth_op.cc | 1 + paddle/fluid/operators/layer_norm_kernel.cu.h | 144 +-- paddle/fluid/operators/layer_norm_op.cc | 1 + paddle/fluid/operators/layer_norm_op_xpu.cc | 5 +- paddle/fluid/operators/layout_utils.h | 1 + paddle/fluid/operators/linear_chain_crf_op.h | 9 +- paddle/fluid/operators/linspace_op.cc | 9 +- paddle/fluid/operators/lite/lite_engine_op.cc | 1 + paddle/fluid/operators/lite/lite_engine_op.h | 3 +- .../operators/lite/lite_engine_op_test.cc | 5 +- paddle/fluid/operators/load_combine_op.cc | 4 +- paddle/fluid/operators/load_op.cc | 4 +- paddle/fluid/operators/lod_reset_op.cc | 1 + paddle/fluid/operators/lod_reset_op.h | 1 + paddle/fluid/operators/log_loss_op.cc | 1 + paddle/fluid/operators/log_loss_op_npu.cc | 1 + paddle/fluid/operators/log_loss_op_xpu.cc | 1 + paddle/fluid/operators/log_softmax_op.cc | 1 + paddle/fluid/operators/lookup_table_op.cu | 28 +- paddle/fluid/operators/lookup_table_v2_op.cc | 1 + .../fluid/operators/lookup_table_v2_op_npu.cc | 1 + .../fluid/operators/lookup_table_v2_op_xpu.cc | 3 +- paddle/fluid/operators/lrn_op.cc | 29 +- paddle/fluid/operators/lrn_op.h | 28 +- paddle/fluid/operators/lstm_op.cc | 1 + paddle/fluid/operators/lstm_op.h | 8 +- paddle/fluid/operators/lstm_unit_op.cc | 1 + paddle/fluid/operators/lstmp_op.cc | 1 + paddle/fluid/operators/lstmp_op.h | 8 +- paddle/fluid/operators/lstsq_op.cc | 4 +- paddle/fluid/operators/lstsq_op.cu | 1 + paddle/fluid/operators/lstsq_op.h | 2 + paddle/fluid/operators/lu_op.cc | 5 +- paddle/fluid/operators/lu_unpack_op.cc | 5 +- .../operators/margin_cross_entropy_op.cu | 39 +- paddle/fluid/operators/margin_rank_loss_op.cc | 2 + paddle/fluid/operators/marker_op.cu | 4 +- .../fluid/operators/match_matrix_tensor_op.cc | 3 +- paddle/fluid/operators/math.h | 3 +- paddle/fluid/operators/math/beam_search.cu | 17 +- paddle/fluid/operators/math/beam_search.h | 1 + .../fluid/operators/math/beam_search_test.cc | 1 + .../operators/math/bert_encoder_functor.cu | 28 +- .../operators/math/bert_encoder_functor.h | 2 + paddle/fluid/operators/math/bloomfilter.h | 4 +- .../fluid/operators/math/concat_and_split.cu | 1 - .../fluid/operators/math/concat_and_split.h | 1 + paddle/fluid/operators/math/concat_test.cc | 44 +- paddle/fluid/operators/math/cross_entropy.cc | 1 + paddle/fluid/operators/math/cross_entropy.h | 1 + .../operators/math/eigen_values_vectors.h | 7 +- paddle/fluid/operators/math/gru_compute.cu | 127 +-- 
paddle/fluid/operators/math/im2col.cc | 21 +- paddle/fluid/operators/math/im2col.cu | 45 +- paddle/fluid/operators/math/im2col.h | 1 + paddle/fluid/operators/math/im2col_cfo_cpu.h | 1 + paddle/fluid/operators/math/im2col_test.cc | 2 + paddle/fluid/operators/math/inclusive_scan.h | 17 +- paddle/fluid/operators/math/math_function.cc | 1 + paddle/fluid/operators/math/matrix_bit_code.h | 1 + paddle/fluid/operators/math/matrix_solve.cc | 1 + .../fluid/operators/math/matrix_solve.cu.cc | 1 + paddle/fluid/operators/math/matrix_solve.h | 1 + paddle/fluid/operators/math/sample_prob.cu | 1 + paddle/fluid/operators/math/sampler.cc | 1 + .../operators/math/selected_rows_functor.cc | 1 + .../operators/math/selected_rows_functor.cu | 38 +- .../math/selected_rows_functor_test.cc | 5 +- .../math/selected_rows_functor_test.cu.cc | 1 + .../fluid/operators/math/sequence_padding.cc | 1 + .../fluid/operators/math/sequence_padding.cu | 1 + .../fluid/operators/math/sequence_padding.h | 16 +- .../fluid/operators/math/sequence_pooling.cc | 3 +- .../fluid/operators/math/sequence_pooling.cu | 121 +-- .../fluid/operators/math/sequence_pooling.h | 1 + .../operators/math/sequence_pooling_test.cc | 1 + paddle/fluid/operators/math/sequence_scale.cc | 1 + paddle/fluid/operators/math/sequence_scale.cu | 16 +- paddle/fluid/operators/math/softmax.cc | 1 + paddle/fluid/operators/math/softmax_impl.h | 91 +- paddle/fluid/operators/math/sparse_impl.cu.h | 3 +- paddle/fluid/operators/math/tree2col.cc | 1 + paddle/fluid/operators/math/tree2col.cu | 1 + paddle/fluid/operators/math/tree2col.h | 1 + paddle/fluid/operators/math/vol2col.cu | 1 + paddle/fluid/operators/math/vol2col.h | 1 + paddle/fluid/operators/math/vol2col_test.cc | 1 + paddle/fluid/operators/matmul_op.cc | 61 +- paddle/fluid/operators/matmul_op_xpu.cc | 17 +- paddle/fluid/operators/matmul_v2_op.cc | 1 + paddle/fluid/operators/matmul_v2_op.h | 1 + paddle/fluid/operators/matmul_v2_op_xpu.cc | 2 +- paddle/fluid/operators/matrix_power_op.cc | 1 + paddle/fluid/operators/matrix_rank_op.cc | 1 + paddle/fluid/operators/mean_iou_op.h | 1 + paddle/fluid/operators/mean_op_xpu.cc | 5 +- paddle/fluid/operators/merge_lod_tensor_op.cc | 1 - .../fluid/operators/merge_selected_rows_op.cc | 1 + .../fluid/operators/merge_selected_rows_op.h | 1 + paddle/fluid/operators/meshgrid_op.cc | 3 +- paddle/fluid/operators/miopen_lstm_cache.h | 1 + paddle/fluid/operators/miopen_rnn_cache.h | 1 + .../operators/mkldnn/activation_mkldnn_op.cc | 4 +- paddle/fluid/operators/mkldnn/axpy_handler.cc | 5 +- .../operators/mkldnn/concat_mkldnn_op.cc | 9 +- .../fluid/operators/mkldnn/conv_mkldnn_op.cc | 18 +- .../operators/mkldnn/dequantize_mkldnn_op.cc | 2 +- .../operators/mkldnn/expand_v2_mkldnn_op.cc | 6 +- paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc | 16 +- .../operators/mkldnn/interpolate_mkldnn_op.cc | 11 +- .../operators/mkldnn/matmul_mkldnn_op.cc | 6 +- .../fluid/operators/mkldnn/matmul_mkldnn_op.h | 2 +- .../operators/mkldnn/matmul_v2_mkldnn_op.cc | 13 +- .../fluid/operators/mkldnn/pool_mkldnn_op.cc | 4 +- .../operators/mkldnn/quantize_mkldnn_op.cc | 2 +- .../operators/mkldnn/requantize_mkldnn_op.cc | 10 +- .../operators/mkldnn/reshape_mkldnn_op.cc | 2 +- .../fluid/operators/mkldnn/stack_mkldnn_op.cc | 8 +- .../fluid/operators/mkldnn/sum_mkldnn_op.cc | 5 +- .../operators/mkldnn/test_mkldnn_caching.cc | 6 +- .../mkldnn/test_mkldnn_op_inplace.cc | 1 + .../operators/mkldnn/test_mkldnn_op_nhwc.cc | 1 + .../operators/mkldnn/transpose_mkldnn_op.cc | 2 +- paddle/fluid/operators/mlu/mlu_baseop.cc | 1 + 
paddle/fluid/operators/mode_op.cc | 3 +- .../fluid/operators/modified_huber_loss_op.cc | 10 +- .../fluid/operators/modified_huber_loss_op.cu | 1 + paddle/fluid/operators/mul_op.cc | 1 + paddle/fluid/operators/mul_op_xpu.cc | 1 + paddle/fluid/operators/multiplex_op.cc | 1 - paddle/fluid/operators/nanmedian_op.cc | 1 + .../fluid/operators/nccl/nccl_gpu_common.cc | 2 +- paddle/fluid/operators/nccl/nccl_op.cu.cc | 2 +- .../fluid/operators/nccl/nccl_op_test.cu.cc | 1 + paddle/fluid/operators/nce_op.h | 2 + paddle/fluid/operators/nll_loss_op.cc | 1 + paddle/fluid/operators/norm_op.cc | 1 + paddle/fluid/operators/norm_utils.cu.h | 98 +- paddle/fluid/operators/norm_utils.h | 1 + paddle/fluid/operators/number_count_op.cu | 12 +- paddle/fluid/operators/one_hot_op.cc | 1 + paddle/fluid/operators/one_hot_op_npu.cc | 1 - paddle/fluid/operators/one_hot_v2_op.cc | 1 + paddle/fluid/operators/one_hot_v2_op_npu.cc | 1 - .../fluid/operators/optimizers/adagrad_op.cc | 5 +- .../fluid/operators/optimizers/adam_op_npu.cc | 29 +- .../fluid/operators/optimizers/adam_op_xpu.cc | 5 +- paddle/fluid/operators/optimizers/adamw_op.cc | 3 +- .../operators/optimizers/adamw_op_xpu.cc | 5 +- .../operators/optimizers/cast_with_ptr.h | 6 +- .../operators/optimizers/dgc_momentum_op.cc | 4 +- .../distributed_fused_lamb_init_op.cu | 29 +- .../optimizers/distributed_fused_lamb_op.cu | 99 +- paddle/fluid/operators/optimizers/dpsgd_op.h | 7 +- paddle/fluid/operators/optimizers/ftrl_op.h | 5 +- paddle/fluid/operators/optimizers/lamb_op.cc | 21 +- paddle/fluid/operators/optimizers/lamb_op.cu | 5 +- paddle/fluid/operators/optimizers/lamb_op.h | 22 +- .../fluid/operators/optimizers/lamb_op_xpu.cc | 2 +- .../operators/optimizers/lars_momentum_op.cu | 13 +- .../optimizers/merged_momentum_op_mlu.cc | 2 +- .../optimizers/merged_momentum_op_npu.cc | 10 +- .../fluid/operators/optimizers/momentum_op.cc | 43 +- .../fluid/operators/optimizers/momentum_op.h | 1 + .../operators/optimizers/momentum_op_mlu.cc | 7 +- .../operators/optimizers/momentum_op_npu.cc | 15 +- .../operators/optimizers/momentum_op_xpu.cc | 1 + .../operators/optimizers/multi_tensor_apply.h | 13 +- .../pow2_decay_with_linear_warmup_op.cc | 1 + .../pow2_decay_with_linear_warmup_op.h | 12 +- .../fluid/operators/optimizers/rmsprop_op.cc | 3 +- .../operators/optimizers/rmsprop_op_xpu.cc | 2 + paddle/fluid/operators/optimizers/sgd_op.cc | 13 +- paddle/fluid/operators/optimizers/sgd_op.cu | 11 +- .../fluid/operators/optimizers/sgd_op_xpu.cc | 3 +- .../optimizers/sparse_momentum_op.cc | 1 + .../operators/optimizers/sparse_momentum_op.h | 1 + paddle/fluid/operators/p_norm_op.cc | 12 +- paddle/fluid/operators/pad2d_op.cc | 1 + paddle/fluid/operators/pad2d_op.cu | 1 + paddle/fluid/operators/pad3d_op.cc | 1 + .../fluid/operators/pad_constant_like_op.cc | 1 + paddle/fluid/operators/pad_constant_like_op.h | 1 + paddle/fluid/operators/pad_op.cc | 1 + paddle/fluid/operators/partial_concat_op.cc | 6 +- paddle/fluid/operators/partial_concat_op.cu | 1 + paddle/fluid/operators/partial_concat_op.h | 1 + paddle/fluid/operators/partial_sum_op.cc | 6 +- paddle/fluid/operators/partial_sum_op.cu | 1 + paddle/fluid/operators/partial_sum_op.h | 1 + paddle/fluid/operators/pixel_shuffle_op.cc | 1 + paddle/fluid/operators/poisson_op.cc | 1 + paddle/fluid/operators/pool_op.cc | 4 +- paddle/fluid/operators/pool_op_xpu.cc | 12 +- paddle/fluid/operators/pool_with_index_op.cc | 1 + .../operators/positive_negative_pair_op.cc | 12 +- .../operators/positive_negative_pair_op.h | 1 + 
paddle/fluid/operators/prelu_op.cc | 1 + .../fluid/operators/prim_ops/prim_op_test.cc | 1 - paddle/fluid/operators/print_op.cc | 10 +- paddle/fluid/operators/prroi_pool_op.cc | 1 + paddle/fluid/operators/prroi_pool_op.h | 1 + .../operators/prune_gate_by_capacity_op.cu | 7 +- .../pscore/distributed_lookup_table_op.cc | 3 +- .../pscore/distributed_lookup_table_op.h | 1 + .../pscore/distributed_push_sparse_op.cc | 3 +- .../pscore/distributed_push_sparse_op.h | 1 + .../pscore/heter_listen_and_serv_op.cc | 6 +- .../pscore/heter_listen_and_serv_op.h | 1 + .../pscore/heter_listen_and_server_test.cc | 4 +- .../operators/pscore/heter_server_test.cc | 19 +- .../pscore/send_and_recv_op_cpu_test.cc | 10 +- .../pscore/send_and_recv_op_gpu_test.cc | 11 +- .../operators/pull_box_extended_sparse_op.h | 1 + paddle/fluid/operators/pull_gpups_sparse_op.h | 1 + paddle/fluid/operators/pull_sparse_op.cc | 1 + paddle/fluid/operators/pull_sparse_op.h | 1 + paddle/fluid/operators/pull_sparse_v2_op.cc | 1 + paddle/fluid/operators/pull_sparse_v2_op.h | 1 + paddle/fluid/operators/push_dense_op.cc | 1 + paddle/fluid/operators/push_dense_op.h | 1 + paddle/fluid/operators/py_func_op.cc | 1 + paddle/fluid/operators/py_layer_op.cc | 4 +- paddle/fluid/operators/py_layer_op.h | 1 + paddle/fluid/operators/pyramid_hash_op.cc | 7 +- paddle/fluid/operators/qr_op.cc | 2 + paddle/fluid/operators/qr_op.cu | 7 +- paddle/fluid/operators/qr_op.h | 11 +- paddle/fluid/operators/quantize_linear_op.cc | 2 + paddle/fluid/operators/quantize_linear_op.cu | 9 +- paddle/fluid/operators/quantize_linear_op.h | 1 + paddle/fluid/operators/quantize_op.cc | 21 +- paddle/fluid/operators/quantize_op.h | 1 + paddle/fluid/operators/queue_generator_op.cc | 7 +- paddle/fluid/operators/random_crop_op.h | 1 + paddle/fluid/operators/random_routing_op.cu | 6 +- paddle/fluid/operators/randperm_op.cc | 10 +- paddle/fluid/operators/randperm_op_npu.cc | 2 +- paddle/fluid/operators/range_op.cc | 2 + paddle/fluid/operators/range_op.h | 11 +- paddle/fluid/operators/range_op_xpu.cc | 2 +- paddle/fluid/operators/rank_attention_op.cc | 2 + paddle/fluid/operators/rank_attention_op.cu | 1 + .../fluid/operators/reader/blocking_queue.h | 7 +- .../fluid/operators/reader/buffered_reader.cc | 1 + .../operators/reader/create_ctr_reader_op.cc | 1 - paddle/fluid/operators/recurrent_op.cc | 53 +- .../operators/reduce_ops/frobenius_norm_op.cc | 1 + .../operators/reduce_ops/logsumexp_op.cc | 1 + .../operators/reduce_ops/reduce_amax_op.cc | 10 +- .../reduce_ops/reduce_amax_op.part.cu | 5 +- .../operators/reduce_ops/reduce_amin_op.cc | 10 +- .../reduce_ops/reduce_amin_op.part.cu | 5 +- .../operators/reduce_ops/reduce_max_op.cc | 3 +- .../operators/reduce_ops/reduce_max_op_xpu.cc | 1 + .../operators/reduce_ops/reduce_mean_op.cc | 1 + .../reduce_ops/reduce_mean_op_npu.cc | 2 +- .../operators/reduce_ops/reduce_min_op.cc | 3 +- .../fluid/operators/reduce_ops/reduce_op.cu.h | 1 - paddle/fluid/operators/reduce_ops/reduce_op.h | 6 +- .../operators/reduce_ops/reduce_op_function.h | 1 + .../operators/reduce_ops/reduce_op_mlu.h | 1 + .../operators/reduce_ops/reduce_op_xpu.h | 1 + .../operators/reduce_ops/reduce_sum_op_xpu.cc | 1 + paddle/fluid/operators/renorm_op.cu | 11 +- .../fluid/operators/repeat_interleave_op.cc | 12 +- .../fluid/operators/repeat_interleave_op.cu | 62 +- paddle/fluid/operators/repeat_interleave_op.h | 4 +- paddle/fluid/operators/requantize_op.cc | 1 + paddle/fluid/operators/requantize_op.h | 1 + paddle/fluid/operators/rnn_op.cc | 1 + 
paddle/fluid/operators/roi_align_op.cc | 1 + paddle/fluid/operators/roi_align_op_xpu.cc | 1 + paddle/fluid/operators/roi_pool_op.cc | 1 + paddle/fluid/operators/row_conv_op.cc | 2 + paddle/fluid/operators/row_conv_op.cu | 29 +- paddle/fluid/operators/rrelu_op.cc | 1 + paddle/fluid/operators/run_program_op.h | 13 +- paddle/fluid/operators/sample_logits_op.cc | 2 + paddle/fluid/operators/sample_logits_op.cu | 29 +- paddle/fluid/operators/sample_logits_op.h | 1 + paddle/fluid/operators/save_combine_op.cc | 4 +- paddle/fluid/operators/save_combine_op.h | 1 + .../operators/save_load_combine_op_test.cc | 1 + paddle/fluid/operators/save_op.cc | 5 +- paddle/fluid/operators/save_op.h | 1 + paddle/fluid/operators/scale_op.cc | 1 + paddle/fluid/operators/scale_op_xpu.cc | 1 + paddle/fluid/operators/scatter_nd_add_op.cc | 1 + paddle/fluid/operators/scatter_op.cc | 1 + paddle/fluid/operators/scatter_op_xpu.cc | 11 +- paddle/fluid/operators/seed_op.cc | 17 +- paddle/fluid/operators/segment_pool_op.cc | 1 + .../sequence_ops/sequence_concat_op.cc | 1 + .../sequence_ops/sequence_concat_op.cu.cc | 1 + .../sequence_ops/sequence_concat_op.h | 3 +- .../operators/sequence_ops/sequence_conv_op.h | 1 + .../sequence_ops/sequence_conv_op_xpu.cc | 30 +- .../sequence_ops/sequence_enumerate_op.cu | 1 + .../sequence_ops/sequence_erase_op.cc | 1 + .../sequence_ops/sequence_erase_op.cu | 1 + .../sequence_ops/sequence_erase_op.h | 1 + .../sequence_ops/sequence_expand_as_op.cc | 1 + .../sequence_ops/sequence_expand_as_op.cu | 1 + .../sequence_ops/sequence_expand_as_op.h | 1 + .../sequence_ops/sequence_expand_op.cc | 10 +- .../sequence_ops/sequence_expand_op.cu | 1 + .../sequence_ops/sequence_mask_op.cc | 1 + .../operators/sequence_ops/sequence_pad_op.cc | 1 + .../operators/sequence_ops/sequence_pad_op.h | 1 + .../sequence_ops/sequence_pool_op.cc | 12 +- .../operators/sequence_ops/sequence_pool_op.h | 8 +- .../sequence_ops/sequence_reshape_op.cc | 2 + .../sequence_ops/sequence_reverse_op.h | 1 + .../sequence_ops/sequence_scatter_op.cc | 2 + .../sequence_ops/sequence_slice_op.cc | 1 + .../sequence_ops/sequence_softmax_op.cc | 1 + .../sequence_ops/sequence_softmax_op.cu | 18 +- .../sequence_topk_avg_pooling_op.cc | 6 +- .../sequence_topk_avg_pooling_op.h | 1 + .../sequence_ops/sequence_unpad_op.cc | 1 + .../sequence_ops/sequence_unpad_op.h | 1 + paddle/fluid/operators/set_value_op.cc | 1 - paddle/fluid/operators/set_value_op_npu.cc | 1 - paddle/fluid/operators/shape_op.cc | 1 + paddle/fluid/operators/shape_op_xpu.cc | 1 + paddle/fluid/operators/share_buffer_op.h | 5 +- paddle/fluid/operators/share_data_op.cc | 6 +- .../fluid/operators/shrink_rnn_memory_op.cc | 3 +- paddle/fluid/operators/shuffle_batch_op.cc | 2 + paddle/fluid/operators/shuffle_batch_op.h | 1 + paddle/fluid/operators/shuffle_channel_op.cc | 6 +- paddle/fluid/operators/shuffle_channel_op.cu | 16 +- paddle/fluid/operators/shuffle_channel_op.h | 1 + .../sigmoid_cross_entropy_with_logits_op.cc | 1 + paddle/fluid/operators/similarity_focus_op.h | 19 +- paddle/fluid/operators/slice_op.cc | 7 +- paddle/fluid/operators/slice_op.h | 1 + paddle/fluid/operators/slice_op_mlu.cc | 3 +- paddle/fluid/operators/slice_op_npu.cc | 1 - paddle/fluid/operators/slice_op_xpu.cc | 8 +- paddle/fluid/operators/smooth_l1_loss_op.cc | 1 + .../fluid/operators/smooth_l1_loss_op_npu.cc | 2 +- paddle/fluid/operators/softmax_op.cc | 5 +- .../softmax_with_cross_entropy_op_xpu.cc | 10 +- paddle/fluid/operators/solve_op.cc | 2 + paddle/fluid/operators/solve_op.h | 7 +- 
paddle/fluid/operators/space_to_depth_op.cc | 5 +- paddle/fluid/operators/sparse_attention_op.cc | 1 + paddle/fluid/operators/sparse_attention_op.cu | 60 +- paddle/fluid/operators/spectral_norm_op.h | 1 + paddle/fluid/operators/spectral_op.cc | 1 + paddle/fluid/operators/spectral_op.h | 1 + paddle/fluid/operators/split_op.cc | 1 + paddle/fluid/operators/split_op.h | 1 + paddle/fluid/operators/split_op_mlu.cc | 2 +- paddle/fluid/operators/split_op_xpu.cc | 3 +- paddle/fluid/operators/spp_op.cc | 1 + paddle/fluid/operators/spp_op.h | 1 + paddle/fluid/operators/stack_op.cc | 1 + paddle/fluid/operators/stack_op_npu.cc | 10 +- paddle/fluid/operators/stack_op_xpu.cc | 1 + paddle/fluid/operators/stft_op.cc | 1 + paddle/fluid/operators/stft_op.h | 1 - .../fluid/operators/strided_slice_op_npu.cc | 20 +- .../operators/string/faster_tokenizer_op.cc | 11 +- .../operators/string/faster_tokenizer_op.h | 5 +- paddle/fluid/operators/sum_op.cc | 21 +- paddle/fluid/operators/sum_op.cu | 5 +- paddle/fluid/operators/sum_op.h | 1 + paddle/fluid/operators/sum_op_mlu.cc | 2 +- paddle/fluid/operators/sum_op_xpu.cc | 3 +- paddle/fluid/operators/svd_helper.h | 25 +- paddle/fluid/operators/svd_op.cc | 2 + paddle/fluid/operators/svd_op.cu | 2 + paddle/fluid/operators/svd_op.h | 1 + .../fluid/operators/sync_batch_norm_op.cu.h | 102 +-- .../fluid/operators/sync_batch_norm_op_npu.cc | 5 +- paddle/fluid/operators/tdm_child_op.cc | 2 + paddle/fluid/operators/tdm_child_op.h | 1 + paddle/fluid/operators/tdm_sampler_op.cc | 2 + paddle/fluid/operators/tdm_sampler_op.h | 1 + paddle/fluid/operators/temporal_shift_op.cc | 3 +- paddle/fluid/operators/temporal_shift_op.cu | 26 +- .../operators/tensor_array_to_tensor_op.cc | 18 +- paddle/fluid/operators/tensor_formatter.cc | 1 + paddle/fluid/operators/tensor_to_string.h | 3 +- .../operators/tensorrt/tensorrt_engine_op.h | 10 +- .../tensorrt/tensorrt_engine_op_test.cc | 2 + paddle/fluid/operators/tile_op_npu.cc | 9 +- paddle/fluid/operators/tile_op_xpu.cc | 9 +- paddle/fluid/operators/top_k_function_cuda.h | 1 + paddle/fluid/operators/top_k_op.cc | 6 +- paddle/fluid/operators/top_k_op.cu | 8 +- paddle/fluid/operators/top_k_op.h | 1 + paddle/fluid/operators/top_k_op_mlu.cc | 2 +- paddle/fluid/operators/top_k_v2_op_npu.cc | 1 + paddle/fluid/operators/trace_op.cc | 29 +- paddle/fluid/operators/transfer_layout_op.cc | 8 +- paddle/fluid/operators/transpose_op.cc | 1 + paddle/fluid/operators/transpose_op.cu.h | 57 +- paddle/fluid/operators/transpose_op.h | 1 + paddle/fluid/operators/transpose_op_mlu.cc | 2 +- paddle/fluid/operators/transpose_op_xpu.cc | 3 +- paddle/fluid/operators/tree_conv_op.h | 1 + paddle/fluid/operators/tril_indices_op.cc | 1 + paddle/fluid/operators/tril_triu_op.cc | 2 +- .../operators/truncated_gaussian_random_op.cc | 3 +- .../truncated_gaussian_random_op_npu.cc | 3 +- .../truncated_gaussian_random_op_xpu.cc | 3 +- paddle/fluid/operators/unbind_op.cc | 2 + paddle/fluid/operators/unbind_op.h | 1 + paddle/fluid/operators/uniform_random_op.h | 7 +- .../fluid/operators/uniform_random_op_mlu.cc | 2 +- .../fluid/operators/uniform_random_op_xpu.cc | 3 +- .../fluid/operators/unique_consecutive_op.cc | 1 + .../fluid/operators/unique_consecutive_op.cu | 2 + .../fluid/operators/unique_consecutive_op.h | 1 + paddle/fluid/operators/unique_op.cc | 2 + paddle/fluid/operators/unique_op.h | 1 + .../fluid/operators/unique_with_counts_op.h | 1 + paddle/fluid/operators/unpool_op.cc | 1 + paddle/fluid/operators/unpool_op.h | 1 + paddle/fluid/operators/unsqueeze_op.cc | 7 +- 
paddle/fluid/operators/unsqueeze_op.h | 8 +- paddle/fluid/operators/unstack_op.cc | 1 + paddle/fluid/operators/utils.h | 1 + paddle/fluid/operators/var_conv_2d_op.cc | 2 + paddle/fluid/platform/aligned_vector.h | 20 +- paddle/fluid/platform/bfloat16_test.cc | 1 + paddle/fluid/platform/bfloat16_test.cu | 2 + paddle/fluid/platform/collective_helper.cc | 1 + paddle/fluid/platform/complex_test.cc | 2 + paddle/fluid/platform/complex_test.cu | 1 + paddle/fluid/platform/cpu_info.cc | 1 + .../platform/cuda_graph_with_memory_pool.cc | 1 + .../platform/cuda_graph_with_memory_pool.h | 24 +- paddle/fluid/platform/denormal.cc | 1 + .../platform/device/gpu/cuda/cuda_graph.cc | 1 + .../platform/device/gpu/cuda/cuda_graph.h | 2 +- .../platform/device/gpu/cuda/cuda_helper.h | 2 +- .../device/gpu/cuda/cudnn_helper_test.cc | 10 +- .../platform/device/gpu/cuda_helper_test.cu | 6 +- .../platform/device/gpu/cudnn_desc_test.cc | 4 +- paddle/fluid/platform/device/gpu/gpu_info.cc | 6 +- paddle/fluid/platform/device/gpu/gpu_info.h | 1 + .../platform/device/gpu/gpu_launch_config.h | 26 +- .../platform/device/gpu/gpu_primitives.h | 1 + .../platform/device/gpu/gpu_resource_pool.cc | 1 + paddle/fluid/platform/device/gpu/gpu_types.h | 2 + .../fluid/platform/device/gpu/nccl_helper.h | 3 +- .../device/gpu/rocm/miopen_helper_test.cc | 6 +- .../platform/device/gpu/rocm/rocm_helper.h | 2 +- .../fluid/platform/device/ipu/ipu_device.cc | 7 +- .../fluid/platform/device/ipu/ipu_executor.cc | 6 +- paddle/fluid/platform/device/ipu/ipu_info.h | 1 + .../fluid/platform/device/ipu/ipu_strategy.h | 10 +- .../popart_canonicalization/activation_ops.cc | 16 +- .../ipu/popart_canonicalization/math_ops.cc | 15 +- .../ipu/popart_canonicalization/nn_ops.cc | 45 +- .../ipu/popart_canonicalization/op_builder.cc | 5 +- .../ipu/popart_canonicalization/search_ops.cc | 8 +- .../ipu/popart_canonicalization/tensor_ops.cc | 113 +-- .../fluid/platform/device/mlu/cncl_helper.h | 2 +- .../platform/device/mlu/device_context.h | 1 + .../device/mlu/device_context_test.cc | 10 +- .../device/mlu/mlu_collective_helper.cc | 1 + paddle/fluid/platform/device/mlu/mlu_info.cc | 7 +- .../fluid/platform/device/mlu/mlu_stream.cc | 1 + .../platform/device/npu/ascend_npu_info.cc | 2 + .../fluid/platform/device/npu/dynload/hccl.h | 1 + .../fluid/platform/device/npu/enforce_npu.h | 3 +- .../fluid/platform/device/npu/hccl_helper.h | 6 +- .../device/npu/npu_collective_helper.cc | 1 + paddle/fluid/platform/device/npu/npu_info.cc | 7 +- .../platform/device/npu/npu_op_runner.cc | 1 - .../platform/device/npu/npu_resource_pool.cc | 1 + .../fluid/platform/device/npu/npu_stream.cc | 1 + .../fluid/platform/device/xpu/bkcl_helper.h | 3 +- .../fluid/platform/device/xpu/enforce_xpu.h | 1 - .../device/xpu/tests/enforce_xpu_test.cc | 1 + paddle/fluid/platform/device/xpu/xpu_info.cc | 3 +- paddle/fluid/platform/device/xpu/xpu_info.h | 1 + .../fluid/platform/device/xpu/xpu_op_list.cc | 3 +- paddle/fluid/platform/device_code.cc | 4 +- paddle/fluid/platform/device_code_test.cc | 2 + paddle/fluid/platform/device_context.cc | 2 + paddle/fluid/platform/device_context.h | 3 +- paddle/fluid/platform/device_context_test.cu | 11 +- .../fluid/platform/device_context_xpu_test.cc | 7 +- paddle/fluid/platform/device_event.h | 2 +- paddle/fluid/platform/device_event_base.cc | 1 + paddle/fluid/platform/device_event_base.h | 1 + paddle/fluid/platform/device_event_cpu.h | 1 + paddle/fluid/platform/device_event_gpu.cc | 2 +- paddle/fluid/platform/device_event_test.cc | 5 +- 
paddle/fluid/platform/device_tracer.cc | 7 +- paddle/fluid/platform/dynload/cublas.h | 1 + paddle/fluid/platform/dynload/cublasLt.h | 1 + paddle/fluid/platform/dynload/cuda_driver.cc | 1 + paddle/fluid/platform/dynload/cuda_driver.h | 1 + paddle/fluid/platform/dynload/cudnn.cc | 1 + paddle/fluid/platform/dynload/cudnn.h | 1 + paddle/fluid/platform/dynload/cufft.cc | 1 + paddle/fluid/platform/dynload/cufft.h | 1 + paddle/fluid/platform/dynload/cupti.h | 1 + paddle/fluid/platform/dynload/curand.h | 1 + paddle/fluid/platform/dynload/cusolver.h | 1 + paddle/fluid/platform/dynload/cusparse.h | 1 + .../fluid/platform/dynload/dynamic_loader.cc | 1 + paddle/fluid/platform/dynload/hiprtc.cc | 1 + paddle/fluid/platform/dynload/hiprtc.h | 2 + paddle/fluid/platform/dynload/miopen.cc | 1 + paddle/fluid/platform/dynload/miopen.h | 3 +- paddle/fluid/platform/dynload/mklml.h | 1 + paddle/fluid/platform/dynload/mklrt.h | 1 + paddle/fluid/platform/dynload/nccl.h | 1 + paddle/fluid/platform/dynload/nvjpeg.h | 1 + paddle/fluid/platform/dynload/nvrtc.cc | 1 + paddle/fluid/platform/dynload/nvrtc.h | 1 + paddle/fluid/platform/dynload/nvtx.h | 1 + paddle/fluid/platform/dynload/rccl.h | 1 + paddle/fluid/platform/dynload/rocblas.h | 1 + paddle/fluid/platform/dynload/rocm_driver.cc | 1 + paddle/fluid/platform/dynload/rocm_driver.h | 1 + paddle/fluid/platform/dynload/tensorrt.cc | 1 + paddle/fluid/platform/enforce.h | 3 + paddle/fluid/platform/enforce_test.cc | 125 ++- paddle/fluid/platform/errors.h | 4 +- paddle/fluid/platform/errors_test.cc | 3 +- paddle/fluid/platform/fast_divmod.h | 1 + paddle/fluid/platform/flags.h | 1 + paddle/fluid/platform/float16_test.cu | 1 + paddle/fluid/platform/gen_comm_id_helper.cc | 1 + paddle/fluid/platform/init_test.cc | 1 + paddle/fluid/platform/lock_guard_ptr.h | 1 + paddle/fluid/platform/mkldnn_reuse.h | 18 +- paddle/fluid/platform/monitor.h | 1 + paddle/fluid/platform/os_info.cc | 1 + paddle/fluid/platform/os_info_test.cc | 10 +- paddle/fluid/platform/profiler.cc | 3 +- .../platform/profiler/chrometracing_logger.cc | 11 +- .../platform/profiler/chrometracing_logger.h | 1 + paddle/fluid/platform/profiler/common_event.h | 1 + .../platform/profiler/cpu_utilization.cc | 26 +- .../fluid/platform/profiler/cpu_utilization.h | 2 + paddle/fluid/platform/profiler/cuda_tracer.cc | 2 + paddle/fluid/platform/profiler/cuda_tracer.h | 1 + .../platform/profiler/cupti_data_process.cc | 2 + .../platform/profiler/cupti_data_process.h | 1 + .../profiler/dump/deserialization_reader.cc | 2 + .../profiler/dump/serialization_logger.cc | 4 +- .../dump/test_serialization_logger.cc | 15 +- paddle/fluid/platform/profiler/event_node.cc | 1 + .../fluid/platform/profiler/event_python.cc | 1 + .../fluid/platform/profiler/event_tracing.h | 10 +- .../platform/profiler/host_event_recorder.h | 11 +- paddle/fluid/platform/profiler/host_tracer.cc | 1 + .../profiler/mlu/cnpapi_data_process.cc | 2 + .../fluid/platform/profiler/mlu/mlu_tracer.cc | 2 + paddle/fluid/platform/profiler/profiler.cc | 1 + paddle/fluid/platform/profiler/profiler.h | 1 + .../fluid/platform/profiler/profiler_test.cc | 7 +- .../platform/profiler/test_event_node.cc | 17 +- .../platform/profiler/trace_event_collector.h | 1 + paddle/fluid/platform/profiler/utils.h | 6 +- paddle/fluid/platform/profiler_helper.h | 4 +- paddle/fluid/platform/profiler_test.cc | 20 +- paddle/fluid/platform/resource_pool.h | 1 + paddle/fluid/platform/stream/cuda_stream.cc | 1 + .../fluid/platform/stream_callback_manager.cc | 1 + paddle/fluid/platform/transform.h | 1 + 
paddle/fluid/platform/transform_test.cu | 5 +- paddle/fluid/pybind/ascend_wrapper_py.cc | 44 +- paddle/fluid/pybind/bind_cost_model.cc | 1 + paddle/fluid/pybind/bind_fleet_executor.cc | 66 +- paddle/fluid/pybind/communication.cc | 38 +- paddle/fluid/pybind/communicator_py.cc | 5 +- paddle/fluid/pybind/compatible.cc | 14 +- paddle/fluid/pybind/const_value.cc | 1 + paddle/fluid/pybind/crypto.cc | 11 +- paddle/fluid/pybind/cuda_streams_py.cc | 215 ++--- paddle/fluid/pybind/data_set_py.cc | 1 + paddle/fluid/pybind/distributed_py.cc | 299 +++--- paddle/fluid/pybind/eager.cc | 148 +-- paddle/fluid/pybind/eager.h | 4 +- paddle/fluid/pybind/eager_custom_python_api.h | 1 + paddle/fluid/pybind/eager_functions.cc | 5 +- paddle/fluid/pybind/eager_method.cc | 29 +- .../pybind/eager_op_function_generator.cc | 3 +- paddle/fluid/pybind/eager_py_layer.cc | 35 +- paddle/fluid/pybind/eager_utils.cc | 4 + paddle/fluid/pybind/eager_utils.h | 7 +- paddle/fluid/pybind/exception.cc | 1 + paddle/fluid/pybind/fleet_py.cc | 17 +- paddle/fluid/pybind/fleet_wrapper_py.cc | 6 +- paddle/fluid/pybind/generator_py.cc | 3 +- paddle/fluid/pybind/gloo_context_py.cc | 60 +- paddle/fluid/pybind/gloo_context_py.h | 1 + paddle/fluid/pybind/imperative.cc | 854 +++++++++--------- paddle/fluid/pybind/imperative.h | 1 + paddle/fluid/pybind/inference_api.cc | 38 +- paddle/fluid/pybind/io.cc | 1 + paddle/fluid/pybind/io.h | 1 + paddle/fluid/pybind/ir.cc | 40 +- paddle/fluid/pybind/ir.h | 1 + paddle/fluid/pybind/op_function_common.cc | 3 +- paddle/fluid/pybind/protobuf.cc | 86 +- paddle/fluid/pybind/pybind.cc | 817 +++++++++-------- paddle/fluid/pybind/reader_py.cc | 165 ++-- paddle/fluid/pybind/slice_utils.h | 1 + paddle/fluid/pybind/tensor_py.h | 2 + paddle/fluid/pybind/uva_utils.h | 1 + paddle/fluid/string/pretty_log.h | 2 +- paddle/infrt/api/infrt_api.cc | 2 + paddle/infrt/backends/host/phi_context.h | 4 +- .../tensorrt/plugin/pool_op_plugin.cu | 4 +- .../backends/tensorrt/plugin/pool_op_plugin.h | 8 +- .../backends/tensorrt/test_trt_engine.cc | 4 +- paddle/infrt/backends/tensorrt/trt_engine.cc | 1 + paddle/infrt/backends/tensorrt/trt_engine.h | 1 + paddle/infrt/backends/tensorrt/trt_options.h | 4 +- paddle/infrt/common/global.h | 1 + paddle/infrt/common/memory.h | 2 +- paddle/infrt/dialect/dense_tensor.h | 3 + paddle/infrt/dialect/diagnostic_utils.cc | 1 + .../infrt/dialect/infrt/ir/infrt_dialect.cc | 2 + paddle/infrt/dialect/infrt/ir/infrt_dialect.h | 2 +- .../dialect/infrt/pass/infrt_op_fuse_pass.cc | 1 + paddle/infrt/dialect/init_dialects.cc | 2 - paddle/infrt/dialect/mlir_loader.cc | 2 +- paddle/infrt/dialect/mlir_loader.h | 2 +- paddle/infrt/dialect/opt.cc | 1 + .../infrt/dialect/pd/pass/pd_op_fuse_pass.cc | 1 + .../infrt/dialect/phi/ir/infrt_phi_tensor.h | 2 + paddle/infrt/dialect/phi/ir/phi_base.cc | 1 + paddle/infrt/dialect/phi/ir/phi_base.h | 2 +- paddle/infrt/dialect/phi/ir/phi_kernels.cc | 2 +- paddle/infrt/dialect/phi/ir/phi_kernels.h | 2 - .../infrt/dialect/phi/pass/kernel_op_desc.cc | 2 + .../infrt/dialect/phi/pass/kernel_op_desc.h | 1 + .../dialect/phi/pass/kernel_op_desc_test.cc | 2 + .../dialect/phi/pass/phi_op_convert_pass.cc | 1 + .../dialect/phi/pass/phi_op_convert_pass.h | 1 + .../dialect/phi/pass/proto_arg_map_context.h | 2 + paddle/infrt/dialect/phi/phi_exec.cc | 4 +- paddle/infrt/dialect/print_ir.cc | 5 +- paddle/infrt/dialect/tensor_shape.cc | 1 - paddle/infrt/dialect/tensorrt/convert.h | 1 + paddle/infrt/dialect/tensorrt/trt_exec.cc | 5 + .../dialect/tensorrt/trt_graph_fuse_pass.cc | 1 + 
.../dialect/tensorrt/trt_graph_split_pass.cc | 1 + .../dialect/tensorrt/trt_op_teller_pass.cc | 1 + paddle/infrt/dialect/tensorrt/trt_ops.cc | 3 + paddle/infrt/dialect/tensorrt/trt_ops.h | 1 + .../dialect/tensorrt/trt_type_convert_pass.cc | 1 + paddle/infrt/host_context/core_runtime.cc | 3 +- paddle/infrt/host_context/core_runtime.h | 2 +- paddle/infrt/host_context/kernel_registry.cc | 8 +- paddle/infrt/host_context/mlir_exec.cc | 1 + .../host_context/mlir_program_executor.h | 2 +- .../host_context/mlir_to_runtime_translate.cc | 6 +- paddle/infrt/host_context/op_executable.cc | 1 + paddle/infrt/host_context/op_executable.h | 1 + paddle/infrt/host_context/paddle_mlir.h | 1 + .../host_context/paddle_mlir_converter.cc | 4 +- paddle/infrt/host_context/symbol_table.h | 3 +- paddle/infrt/host_context/value.h | 8 +- .../infrt/kernel/phi/dense_tensor_kernels.cc | 2 + .../infershaped_kernel_launcher.cc | 1 + .../phi/infershaped/infershaped_utils.h | 1 + .../phi/infershaped/phi_kernel_launcher.h | 1 + paddle/infrt/kernel/tensorrt/trt_kernels.cc | 2 + paddle/infrt/kernel/tensorrt/trt_kernels.h | 1 - paddle/infrt/kernel/test_kernels.cc | 10 +- paddle/infrt/paddle/scope.h | 3 +- paddle/infrt/support/type_traits.h | 3 +- paddle/infrt/tests/models/test_abs.cc | 2 + paddle/phi/api/ext/op_meta_info.h | 70 +- paddle/phi/api/lib/api_custom_impl.cc | 3 +- paddle/phi/api/lib/backend_set.h | 5 +- paddle/phi/api/lib/data_transform.cc | 2 + paddle/phi/api/lib/sparse_api_custom_impl.cc | 1 + paddle/phi/api/lib/tensor.cc | 2 + paddle/phi/api/lib/tensor_copy.cc | 1 + paddle/phi/api/lib/tensor_method.cc | 2 + paddle/phi/api/lib/utils/tensor_utils.h | 1 - paddle/phi/backends/callback_manager.cc | 5 +- paddle/phi/backends/custom/custom_context.h | 1 + .../phi/backends/custom/custom_device_test.cc | 1 + paddle/phi/backends/device_base.cc | 6 +- paddle/phi/backends/device_ext.h | 4 +- paddle/phi/backends/device_manager.h | 3 +- paddle/phi/backends/dynload/cublas.h | 1 + paddle/phi/backends/dynload/cublasLt.h | 1 + paddle/phi/backends/dynload/cuda_driver.h | 1 + paddle/phi/backends/dynload/cudnn.cc | 1 + paddle/phi/backends/dynload/cudnn.h | 1 + paddle/phi/backends/dynload/cufft.cc | 1 + paddle/phi/backends/dynload/cufft.h | 1 + paddle/phi/backends/dynload/cupti.h | 1 + paddle/phi/backends/dynload/curand.h | 1 + paddle/phi/backends/dynload/cusolver.h | 1 + paddle/phi/backends/dynload/cusparse.h | 1 + paddle/phi/backends/dynload/hiprand.h | 2 +- paddle/phi/backends/dynload/hiprtc.h | 2 + paddle/phi/backends/dynload/lapack.cc | 1 + paddle/phi/backends/dynload/lapack.h | 1 + paddle/phi/backends/dynload/miopen.cc | 1 + paddle/phi/backends/dynload/miopen.h | 3 +- paddle/phi/backends/dynload/mklml.h | 1 + paddle/phi/backends/dynload/mklrt.h | 1 + paddle/phi/backends/dynload/nccl.h | 1 + paddle/phi/backends/dynload/nvjpeg.h | 1 + paddle/phi/backends/dynload/nvrtc.h | 1 + paddle/phi/backends/dynload/nvtx.h | 1 + paddle/phi/backends/dynload/port.h | 2 + paddle/phi/backends/dynload/rccl.h | 1 + paddle/phi/backends/dynload/rocblas.h | 1 + paddle/phi/backends/dynload/rocm_driver.h | 1 + paddle/phi/backends/dynload/tensorrt.cc | 1 + paddle/phi/backends/event.cc | 1 + paddle/phi/backends/gpu/cuda/cuda_helper.h | 2 +- paddle/phi/backends/gpu/gpu_context.cc | 1 - paddle/phi/backends/gpu/gpu_context.h | 1 + paddle/phi/backends/gpu/gpu_info.h | 1 + paddle/phi/backends/gpu/gpu_launch_config.h | 8 +- paddle/phi/backends/gpu/gpu_resources.h | 1 + paddle/phi/backends/gpu/rocm/rocm_helper.h | 2 +- paddle/phi/backends/gpu/rocm/rocm_info.cc | 1 + 
paddle/phi/backends/stream.cc | 1 + paddle/phi/backends/xpu/enforce_xpu.h | 3 +- paddle/phi/backends/xpu/xpu_context.cc | 5 +- paddle/phi/backends/xpu/xpu_context.h | 6 +- paddle/phi/backends/xpu/xpu_header.h | 1 - paddle/phi/backends/xpu/xpu_info.h | 1 + paddle/phi/common/data_type.h | 3 +- paddle/phi/common/int_array.cc | 3 +- paddle/phi/common/place.cc | 1 - paddle/phi/common/scalar.cc | 5 +- paddle/phi/core/compat/op_utils.h | 1 - paddle/phi/core/ddim.h | 8 +- paddle/phi/core/dense_tensor.h | 2 +- paddle/phi/core/dense_tensor_impl.cc | 7 +- paddle/phi/core/device_context.cc | 1 + paddle/phi/core/device_context.h | 32 +- paddle/phi/core/enforce.cc | 3 +- paddle/phi/core/hostdevice.h | 1 + paddle/phi/core/kernel_factory.cc | 1 - paddle/phi/core/kernel_registry.h | 222 +++-- paddle/phi/core/kernel_utils.h | 5 +- paddle/phi/core/meta_tensor.h | 3 +- paddle/phi/core/string_tensor.cc | 1 + paddle/phi/core/tensor_base.cc | 1 + paddle/phi/core/utils/intrusive_ptr.h | 1 + paddle/phi/infermeta/binary.cc | 1 + paddle/phi/infermeta/multiary.cc | 2 + paddle/phi/infermeta/ternary.cc | 1 + paddle/phi/kernels/assign_kernel.cc | 3 +- paddle/phi/kernels/auc_kernel.h | 1 + paddle/phi/kernels/autotune/auto_tune_base.h | 1 + paddle/phi/kernels/autotune/auto_tune_test.cu | 5 +- paddle/phi/kernels/autotune/cache.cc | 2 + paddle/phi/kernels/autotune/cache.h | 1 + paddle/phi/kernels/autotune/cache_test.cc | 3 + paddle/phi/kernels/autotune/gpu_timer_test.cu | 2 + paddle/phi/kernels/autotune/switch_autotune.h | 1 + paddle/phi/kernels/batch_norm_grad_kernel.h | 1 + .../kernels/broadcast_tensors_grad_kernel.h | 1 + paddle/phi/kernels/broadcast_tensors_kernel.h | 1 + .../phi/kernels/channel_shuffle_grad_kernel.h | 1 + paddle/phi/kernels/channel_shuffle_kernel.h | 1 + paddle/phi/kernels/conv_kernel.cc | 7 +- .../phi/kernels/conv_transpose_grad_kernel.h | 1 + paddle/phi/kernels/conv_transpose_kernel.h | 1 + paddle/phi/kernels/cpu/abs_kernel.cc | 1 + paddle/phi/kernels/cpu/accuracy_kernel.cc | 1 + paddle/phi/kernels/cpu/activation_kernel.cc | 1 + paddle/phi/kernels/cpu/adagrad_kernel.cc | 1 + paddle/phi/kernels/cpu/allclose_kernel.cc | 1 + paddle/phi/kernels/cpu/arange_kernel.cc | 1 + paddle/phi/kernels/cpu/atan2_grad_kernel.cc | 3 +- paddle/phi/kernels/cpu/atan2_kernel.cc | 3 +- .../phi/kernels/cpu/batch_norm_grad_kernel.cc | 3 +- paddle/phi/kernels/cpu/batch_norm_kernel.cc | 4 +- .../phi/kernels/cpu/bce_loss_grad_kernel.cc | 1 + paddle/phi/kernels/cpu/bce_loss_kernel.cc | 1 + paddle/phi/kernels/cpu/bernoulli_kernel.cc | 2 + .../bilinear_tensor_product_grad_kernel.cc | 2 +- .../cpu/bilinear_tensor_product_kernel.cc | 2 +- .../cpu/broadcast_tensors_grad_kernel.cc | 1 + .../kernels/cpu/broadcast_tensors_kernel.cc | 2 +- paddle/phi/kernels/cpu/cast_kernel.cc | 2 +- .../cpu/channel_shuffle_grad_kernel.cc | 2 +- .../phi/kernels/cpu/channel_shuffle_kernel.cc | 2 +- .../kernels/cpu/cholesky_solve_grad_kernel.cc | 3 +- .../phi/kernels/cpu/cholesky_solve_kernel.cc | 3 +- paddle/phi/kernels/cpu/clip_grad_kernel.cc | 1 + paddle/phi/kernels/cpu/clip_kernel.cc | 1 + paddle/phi/kernels/cpu/compare_kernel.cc | 2 +- paddle/phi/kernels/cpu/complex_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/complex_kernel.cc | 2 +- .../phi/kernels/cpu/conv_grad_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/conv_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/conv_kernel.cc | 2 +- .../kernels/cpu/conv_transpose_grad_kernel.cc | 2 +- .../phi/kernels/cpu/conv_transpose_kernel.cc | 2 +- .../phi/kernels/cpu/cross_entropy_kernel.cc | 3 +- 
paddle/phi/kernels/cpu/cross_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/cross_kernel.cc | 2 +- paddle/phi/kernels/cpu/cumprod_kernel.cc | 1 + .../cpu/deformable_conv_grad_kernel.cc | 40 +- paddle/phi/kernels/cpu/diag_grad_kernel.cc | 1 + .../phi/kernels/cpu/diagonal_grad_kernel.cc | 1 + paddle/phi/kernels/cpu/diagonal_kernel.cc | 1 + paddle/phi/kernels/cpu/digamma_grad_kernel.cc | 1 + paddle/phi/kernels/cpu/digamma_kernel.cc | 1 + paddle/phi/kernels/cpu/dist_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/dist_kernel.cc | 2 +- paddle/phi/kernels/cpu/dot_grad_kernel.cc | 5 +- paddle/phi/kernels/cpu/dropout_grad_kernel.cc | 1 + paddle/phi/kernels/cpu/dropout_kernel.cc | 1 + paddle/phi/kernels/cpu/eigh_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/eigh_kernel.cc | 2 +- paddle/phi/kernels/cpu/einsum_kernel.cc | 1 + paddle/phi/kernels/cpu/elementwise.h | 3 +- .../phi/kernels/cpu/elementwise_add_kernel.cc | 2 +- .../kernels/cpu/elementwise_divide_kernel.cc | 2 +- paddle/phi/kernels/cpu/elementwise_kernel.cc | 2 +- .../cpu/elementwise_multiply_kernel.cc | 2 +- .../cpu/elementwise_subtract_kernel.cc | 2 +- .../phi/kernels/cpu/embedding_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/embedding_kernel.cc | 2 +- paddle/phi/kernels/cpu/erf_grad_kernel.cc | 1 + paddle/phi/kernels/cpu/erf_kernel.cc | 1 + paddle/phi/kernels/cpu/erfinv_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/erfinv_kernel.cc | 2 +- .../phi/kernels/cpu/expand_as_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/expand_as_kernel.cc | 2 +- paddle/phi/kernels/cpu/expand_grad_kernel.cc | 1 + paddle/phi/kernels/cpu/expand_kernel.cc | 1 + paddle/phi/kernels/cpu/eye_kernel.cc | 2 +- .../kernels/cpu/frobenius_norm_grad_kernel.cc | 2 +- .../phi/kernels/cpu/frobenius_norm_kernel.cc | 2 +- paddle/phi/kernels/cpu/full_kernel.cc | 1 - .../phi/kernels/cpu/gather_nd_grad_kernel.cc | 1 + paddle/phi/kernels/cpu/gather_nd_kernel.cc | 1 + paddle/phi/kernels/cpu/gather_tree_kernel.cc | 1 + .../phi/kernels/cpu/gaussian_random_kernel.cc | 3 +- paddle/phi/kernels/cpu/gelu_kernel.cc | 2 + .../phi/kernels/cpu/graph_reindex_kernel.cc | 4 +- .../cpu/graph_sample_neighbors_kernel.cc | 4 +- .../cpu/graph_send_recv_grad_kernel.cc | 2 +- .../phi/kernels/cpu/graph_send_recv_kernel.cc | 2 +- .../kernels/cpu/grid_sample_grad_kernel.cc | 5 +- .../kernels/cpu/gumbel_softmax_grad_kernel.cc | 2 +- .../phi/kernels/cpu/gumbel_softmax_kernel.cc | 2 +- paddle/phi/kernels/cpu/histogram_kernel.cc | 1 + .../phi/kernels/cpu/huber_loss_grad_kernel.cc | 1 + paddle/phi/kernels/cpu/huber_loss_kernel.cc | 1 + .../kernels/cpu/index_sample_grad_kernel.cc | 1 + paddle/phi/kernels/cpu/index_sample_kernel.cc | 2 + .../kernels/cpu/instance_norm_grad_kernel.cc | 12 +- .../phi/kernels/cpu/instance_norm_kernel.cc | 1 + .../kernels/cpu/interpolate_grad_kernel.cc | 1 + paddle/phi/kernels/cpu/isclose_kernel.cc | 1 + .../phi/kernels/cpu/kldiv_loss_grad_kernel.cc | 1 + paddle/phi/kernels/cpu/kldiv_loss_kernel.cc | 1 + .../kernels/cpu/label_smooth_grad_kernel.cc | 1 + paddle/phi/kernels/cpu/label_smooth_kernel.cc | 1 + .../phi/kernels/cpu/layer_norm_grad_kernel.cc | 1 + paddle/phi/kernels/cpu/layer_norm_kernel.cc | 1 + paddle/phi/kernels/cpu/lerp_grad_kernel.cc | 1 + paddle/phi/kernels/cpu/lerp_kernel.cc | 1 + paddle/phi/kernels/cpu/lgamma_grad_kernel.cc | 1 + paddle/phi/kernels/cpu/lgamma_kernel.cc | 1 + .../kernels/cpu/log_softmax_grad_kernel.cc | 7 +- paddle/phi/kernels/cpu/log_softmax_kernel.cc | 35 +- paddle/phi/kernels/cpu/logsumexp_kernel.cc | 1 - paddle/phi/kernels/cpu/matmul_grad_kernel.cc | 1 - 
paddle/phi/kernels/cpu/matmul_kernel.cc | 3 +- .../kernels/cpu/matrix_power_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/matrix_power_kernel.cc | 2 +- paddle/phi/kernels/cpu/matrix_rank_kernel.cc | 2 +- .../phi/kernels/cpu/matrix_rank_tol_kernel.cc | 1 + paddle/phi/kernels/cpu/maxout_grad_kernel.cc | 3 +- paddle/phi/kernels/cpu/maxout_kernel.cc | 3 +- .../phi/kernels/cpu/meshgrid_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/meshgrid_kernel.cc | 2 +- paddle/phi/kernels/cpu/momentum_kernel.cc | 1 + .../phi/kernels/cpu/multi_dot_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/multi_dot_kernel.cc | 2 +- .../phi/kernels/cpu/multiplex_grad_kernel.cc | 1 - paddle/phi/kernels/cpu/mv_kernel.cc | 1 - .../phi/kernels/cpu/nll_loss_grad_kernel.cc | 1 + paddle/phi/kernels/cpu/nll_loss_kernel.cc | 1 + paddle/phi/kernels/cpu/norm_grad_kernel.cc | 8 +- paddle/phi/kernels/cpu/norm_kernel.cc | 1 + paddle/phi/kernels/cpu/one_hot_kernel.cc | 1 + paddle/phi/kernels/cpu/p_norm_grad_kernel.cc | 1 + paddle/phi/kernels/cpu/p_norm_kernel.cc | 1 + .../kernels/cpu/pixel_shuffle_grad_kernel.cc | 2 +- .../phi/kernels/cpu/pixel_shuffle_kernel.cc | 2 +- .../cpu/pixel_unshuffle_grad_kernel.cc | 2 +- .../phi/kernels/cpu/pixel_unshuffle_kernel.cc | 2 +- paddle/phi/kernels/cpu/poisson_kernel.cc | 3 +- paddle/phi/kernels/cpu/pool_grad_kernel.cc | 3 +- paddle/phi/kernels/cpu/pool_kernel.cc | 3 +- .../phi/kernels/cpu/psroi_pool_grad_kernel.cc | 1 + paddle/phi/kernels/cpu/psroi_pool_kernel.cc | 1 + paddle/phi/kernels/cpu/qr_kernel.cc | 4 +- paddle/phi/kernels/cpu/reduce.h | 3 +- .../phi/kernels/cpu/reduce_sum_grad_kernel.cc | 1 - paddle/phi/kernels/cpu/rmsprop_kernel.cc | 1 + paddle/phi/kernels/cpu/rnn_functor.h | 14 +- paddle/phi/kernels/cpu/rnn_grad_kernel.cc | 7 +- paddle/phi/kernels/cpu/rnn_kernel.cc | 3 +- paddle/phi/kernels/cpu/roi_align_kernel.cc | 12 +- paddle/phi/kernels/cpu/scatter_grad_kernel.cc | 1 + paddle/phi/kernels/cpu/scatter_kernel.cc | 1 + .../kernels/cpu/scatter_nd_add_grad_kernel.cc | 1 + .../phi/kernels/cpu/scatter_nd_add_kernel.cc | 1 + .../kernels/cpu/segment_pool_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/segment_pool_kernel.cc | 2 +- paddle/phi/kernels/cpu/selu_grad_kernel.cc | 1 + paddle/phi/kernels/cpu/sgd_kernel.cc | 1 + paddle/phi/kernels/cpu/sign_kernel.cc | 2 +- paddle/phi/kernels/cpu/size_kernel.cc | 2 +- paddle/phi/kernels/cpu/slice_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/slice_kernel.cc | 2 +- .../sparse_weight_embedding_grad_kernel.cc | 2 +- .../cpu/sparse_weight_embedding_kernel.cc | 5 +- paddle/phi/kernels/cpu/split_kernel.cc | 1 - .../kernels/cpu/temporal_shift_grad_kernel.cc | 1 + .../phi/kernels/cpu/temporal_shift_kernel.cc | 1 + .../phi/kernels/cpu/transpose_grad_kernel.cc | 1 + .../phi/kernels/cpu/tril_triu_grad_kernel.cc | 3 +- paddle/phi/kernels/cpu/tril_triu_kernel.cc | 3 +- paddle/phi/kernels/cpu/trunc_grad_kernel.cc | 1 + paddle/phi/kernels/cpu/trunc_kernel.cc | 3 +- paddle/phi/kernels/cpu/unfold_grad_kernel.cc | 1 + paddle/phi/kernels/cpu/unfold_kernel.cc | 1 + .../phi/kernels/cpu/uniform_random_kernel.cc | 1 + paddle/phi/kernels/cpu/unique_kernel.cc | 1 + paddle/phi/kernels/cpu/unstack_grad_kernel.cc | 1 + paddle/phi/kernels/cpu/unstack_kernel.cc | 1 + .../phi/kernels/cpu/viterbi_decode_kernel.cc | 6 +- paddle/phi/kernels/cpu/warpctc_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/warpctc_kernel.cc | 2 +- paddle/phi/kernels/cpu/yolo_box_kernel.cc | 1 + .../kernels/cpu/yolov3_loss_grad_kernel.cc | 4 +- paddle/phi/kernels/cpu/yolov3_loss_kernel.cc | 4 +- 
paddle/phi/kernels/cumprod_grad_kernel.h | 2 +- paddle/phi/kernels/cumprod_kernel.h | 2 +- paddle/phi/kernels/diagonal_kernel.h | 2 +- paddle/phi/kernels/digamma_grad_kernel.h | 2 +- paddle/phi/kernels/digamma_kernel.h | 2 +- paddle/phi/kernels/empty_kernel.cc | 3 +- paddle/phi/kernels/expand_kernel.h | 2 +- paddle/phi/kernels/flatten_grad_kernel.cc | 1 + paddle/phi/kernels/flatten_kernel.cc | 1 + .../phi/kernels/frobenius_norm_grad_kernel.h | 1 + paddle/phi/kernels/frobenius_norm_kernel.h | 1 + paddle/phi/kernels/full_kernel.h | 1 - paddle/phi/kernels/funcs/activation_functor.h | 37 +- paddle/phi/kernels/funcs/adam_functors.h | 6 +- paddle/phi/kernels/funcs/aligned_vector.h | 21 +- paddle/phi/kernels/funcs/blas/blas_impl.cu.h | 5 +- paddle/phi/kernels/funcs/broadcast_function.h | 32 +- .../kernels/funcs/concat_and_split_functor.cu | 3 +- .../kernels/funcs/deformable_conv_functor.cc | 10 +- .../kernels/funcs/deformable_conv_functor.cu | 57 +- .../funcs/detail/activation_functions.h | 2 + paddle/phi/kernels/funcs/detail/avx_mathfun.h | 10 +- .../phi/kernels/funcs/detail/gru_cpu_kernel.h | 1 + .../phi/kernels/funcs/detail/gru_gpu_kernel.h | 1 + paddle/phi/kernels/funcs/detail/gru_kernel.h | 1 + .../kernels/funcs/detail/lstm_cpu_kernel.h | 1 + .../kernels/funcs/detail/lstm_gpu_kernel.h | 76 +- paddle/phi/kernels/funcs/detail/lstm_kernel.h | 1 + paddle/phi/kernels/funcs/diagonal.h | 1 + .../phi/kernels/funcs/distribution_helper.h | 7 +- paddle/phi/kernels/funcs/eigen/extensions.h | 1 - paddle/phi/kernels/funcs/elementwise_base.h | 21 +- .../phi/kernels/funcs/elementwise_functor.h | 1 + .../phi/kernels/funcs/elementwise_grad_base.h | 227 +++-- paddle/phi/kernels/funcs/fc_functor.cc | 17 +- paddle/phi/kernels/funcs/fc_functor.cu | 13 +- paddle/phi/kernels/funcs/fc_functor.h | 1 + paddle/phi/kernels/funcs/gather.cu.h | 43 +- paddle/phi/kernels/funcs/gather.h | 1 + paddle/phi/kernels/funcs/gru_compute.cu | 73 +- paddle/phi/kernels/funcs/inclusive_scan.h | 29 +- paddle/phi/kernels/funcs/index_impl.cu.h | 12 +- .../kernels/funcs/lapack/lapack_function.cc | 1 + paddle/phi/kernels/funcs/math_function.cc | 1 + paddle/phi/kernels/funcs/math_function.cu | 17 +- paddle/phi/kernels/funcs/math_function_impl.h | 1 + paddle/phi/kernels/funcs/matrix_inverse.cu.cc | 3 +- paddle/phi/kernels/funcs/matrix_inverse.h | 2 +- paddle/phi/kernels/funcs/padding.h | 1 + paddle/phi/kernels/funcs/pooling.cc | 79 +- paddle/phi/kernels/funcs/pooling.cu | 504 +++++------ paddle/phi/kernels/funcs/pooling.h | 1 + paddle/phi/kernels/funcs/reduce_function.h | 171 ++-- paddle/phi/kernels/funcs/scatter.cu.h | 17 +- paddle/phi/kernels/funcs/scatter.h | 2 +- paddle/phi/kernels/funcs/segment_pooling.cu | 125 ++- paddle/phi/kernels/funcs/segment_pooling.h | 1 + paddle/phi/kernels/funcs/select_impl.cu.h | 67 +- paddle/phi/kernels/funcs/sequence2batch.h | 1 + paddle/phi/kernels/funcs/slice_utils.h | 1 + .../kernels/funcs/sparse/flatten_indices.h | 1 + paddle/phi/kernels/funcs/sparse/scatter.cu.h | 2 +- paddle/phi/kernels/gpu/abs_kernel.cu | 1 + paddle/phi/kernels/gpu/accuracy_kernel.cu | 21 +- .../phi/kernels/gpu/activation_grad_kernel.cu | 6 +- paddle/phi/kernels/gpu/activation_kernel.cu | 6 +- paddle/phi/kernels/gpu/adadelta_kernel.cu | 3 +- paddle/phi/kernels/gpu/adagrad_kernel.cu | 28 +- paddle/phi/kernels/gpu/adam_kernel.cu | 4 +- paddle/phi/kernels/gpu/adamax_kernel.cu | 3 +- paddle/phi/kernels/gpu/adamw_kernel.cu | 4 +- paddle/phi/kernels/gpu/add_n_kernel.cu | 8 +- paddle/phi/kernels/gpu/addmm_grad_kernel.cu | 3 +- 
paddle/phi/kernels/gpu/addmm_kernel.cu | 3 +- paddle/phi/kernels/gpu/allclose_kernel.cu | 3 +- paddle/phi/kernels/gpu/arange_kernel.cu | 3 +- paddle/phi/kernels/gpu/arg_min_max_kernel.cu | 46 +- paddle/phi/kernels/gpu/argsort_grad_kernel.cu | 4 +- paddle/phi/kernels/gpu/argsort_kernel.cu | 4 +- paddle/phi/kernels/gpu/atan2_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/atan2_kernel.cu | 3 +- paddle/phi/kernels/gpu/auc_kernel.cu | 3 +- .../phi/kernels/gpu/batch_norm_grad_kernel.cu | 272 +++--- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 127 ++- .../phi/kernels/gpu/bce_loss_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/bce_loss_kernel.cu | 3 +- .../bilinear_tensor_product_grad_kernel.cu | 3 +- .../gpu/bilinear_tensor_product_kernel.cu | 3 +- paddle/phi/kernels/gpu/bincount_kernel.cu | 27 +- .../gpu/broadcast_tensors_grad_kernel.cu | 4 +- .../kernels/gpu/broadcast_tensors_kernel.cu | 5 +- paddle/phi/kernels/gpu/cast_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/cast_kernel.cu | 3 +- .../gpu/channel_shuffle_grad_kernel.cu | 5 +- .../phi/kernels/gpu/channel_shuffle_kernel.cu | 5 +- .../phi/kernels/gpu/cholesky_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/cholesky_kernel.cu | 5 +- .../kernels/gpu/cholesky_solve_grad_kernel.cu | 3 +- .../phi/kernels/gpu/cholesky_solve_kernel.cu | 3 +- paddle/phi/kernels/gpu/clip_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/clip_kernel.cu | 3 +- paddle/phi/kernels/gpu/complex_grad_kernel.cu | 5 +- paddle/phi/kernels/gpu/complex_kernel.cu | 5 +- paddle/phi/kernels/gpu/concat_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/concat_kernel.cu | 3 +- .../phi/kernels/gpu/conv_grad_grad_kernel.cu | 5 +- paddle/phi/kernels/gpu/conv_grad_kernel.cu | 5 +- paddle/phi/kernels/gpu/conv_kernel.cu | 5 +- .../kernels/gpu/conv_transpose_grad_kernel.cu | 5 +- .../phi/kernels/gpu/conv_transpose_kernel.cu | 5 +- paddle/phi/kernels/gpu/copy_kernel.cu | 3 +- .../kernels/gpu/cross_entropy_grad_kernel.cu | 49 +- .../phi/kernels/gpu/cross_entropy_kernel.cu | 121 ++- paddle/phi/kernels/gpu/cross_grad_kernel.cu | 5 +- paddle/phi/kernels/gpu/cross_kernel.cu | 5 +- paddle/phi/kernels/gpu/cumprod_grad_kernel.cu | 4 +- paddle/phi/kernels/gpu/cumprod_kernel.cu | 3 +- paddle/phi/kernels/gpu/cumsum_kernel.cu | 18 +- .../gpu/deformable_conv_grad_kernel.cu | 145 ++- .../phi/kernels/gpu/deformable_conv_kernel.cu | 3 +- paddle/phi/kernels/gpu/depthwise_conv.h | 361 ++++---- .../phi/kernels/gpu/depthwise_conv_kernel.cu | 7 +- .../kernels/gpu/determinant_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/determinant_kernel.cu | 3 +- paddle/phi/kernels/gpu/diag_grad_kernel.cu | 23 +- paddle/phi/kernels/gpu/diag_kernel.cu | 3 +- paddle/phi/kernels/gpu/dist_grad_kernel.cu | 5 +- paddle/phi/kernels/gpu/dist_kernel.cu | 5 +- paddle/phi/kernels/gpu/dot_grad_kernel.cu | 8 +- paddle/phi/kernels/gpu/dot_kernel.cu | 3 +- paddle/phi/kernels/gpu/dropout_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/dropout_kernel.cu | 3 +- paddle/phi/kernels/gpu/eigh_grad_kernel.cu | 5 +- paddle/phi/kernels/gpu/eigh_kernel.cu | 5 +- paddle/phi/kernels/gpu/einsum_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/einsum_kernel.cu | 3 +- .../gpu/elementwise_add_grad_kernel.cu | 3 +- .../gpu/elementwise_divide_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/elementwise_grad.h | 35 +- .../kernels/gpu/elementwise_grad_kernel.cu | 3 +- .../gpu/elementwise_multiply_grad_kernel.cu | 3 +- .../gpu/elementwise_subtract_grad_kernel.cu | 3 +- .../phi/kernels/gpu/embedding_grad_kernel.cu | 10 +- paddle/phi/kernels/gpu/embedding_kernel.cu | 5 +- 
paddle/phi/kernels/gpu/erfinv_grad_kernel.cu | 5 +- paddle/phi/kernels/gpu/erfinv_kernel.cu | 5 +- .../phi/kernels/gpu/expand_as_grad_kernel.cu | 5 +- paddle/phi/kernels/gpu/expand_as_kernel.cu | 5 +- paddle/phi/kernels/gpu/eye_kernel.cu | 5 +- paddle/phi/kernels/gpu/flip_kernel.cu | 21 +- .../kernels/gpu/frobenius_norm_grad_kernel.cu | 3 +- .../phi/kernels/gpu/frobenius_norm_kernel.cu | 3 +- paddle/phi/kernels/gpu/full_kernel.cu | 3 +- paddle/phi/kernels/gpu/gather_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/gather_kernel.cu | 3 +- paddle/phi/kernels/gpu/gather_tree_kernel.cu | 4 +- .../phi/kernels/gpu/gaussian_random_kernel.cu | 7 +- paddle/phi/kernels/gpu/gelu_funcs.h | 16 +- paddle/phi/kernels/gpu/gelu_grad_kernel.cu | 2 + paddle/phi/kernels/gpu/gelu_kernel.cu | 4 + paddle/phi/kernels/gpu/graph_reindex_funcs.h | 3 +- .../phi/kernels/gpu/graph_reindex_kernel.cu | 24 +- .../gpu/graph_sample_neighbors_kernel.cu | 73 +- .../phi/kernels/gpu/graph_send_recv_funcs.h | 4 +- .../gpu/graph_send_recv_grad_kernel.cu | 32 +- .../phi/kernels/gpu/graph_send_recv_kernel.cu | 42 +- .../kernels/gpu/grid_sample_grad_kernel.cu | 42 +- paddle/phi/kernels/gpu/grid_sample_kernel.cu | 33 +- .../kernels/gpu/gumbel_softmax_grad_kernel.cu | 3 +- .../phi/kernels/gpu/gumbel_softmax_kernel.cu | 21 +- paddle/phi/kernels/gpu/histogram_kernel.cu | 11 +- paddle/phi/kernels/gpu/increment_kernel.cu | 3 +- .../kernels/gpu/index_sample_grad_kernel.cu | 36 +- paddle/phi/kernels/gpu/index_sample_kernel.cu | 4 +- .../kernels/gpu/index_select_grad_kernel.cu | 40 +- paddle/phi/kernels/gpu/index_select_kernel.cu | 3 +- .../kernels/gpu/instance_norm_grad_kernel.cu | 79 +- .../phi/kernels/gpu/instance_norm_kernel.cu | 3 +- .../kernels/gpu/interpolate_grad_kernel.cu | 67 +- paddle/phi/kernels/gpu/interpolate_kernel.cu | 74 +- paddle/phi/kernels/gpu/isclose_kernel.cu | 3 +- .../phi/kernels/gpu/kldiv_loss_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/kldiv_loss_kernel.cu | 3 +- paddle/phi/kernels/gpu/kron_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/kron_kernel.cu | 3 +- .../phi/kernels/gpu/kthvalue_grad_kernel.cu | 9 +- paddle/phi/kernels/gpu/kthvalue_kernel.cu | 9 +- paddle/phi/kernels/gpu/label_smooth_kernel.cu | 1 + .../phi/kernels/gpu/layer_norm_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/layer_norm_kernel.cu | 74 +- paddle/phi/kernels/gpu/lgamma_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/lgamma_kernel.cu | 3 +- paddle/phi/kernels/gpu/linspace_kernel.cu | 3 +- .../phi/kernels/gpu/log_loss_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/log_loss_kernel.cu | 3 +- .../kernels/gpu/log_softmax_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/log_softmax_kernel.cu | 3 +- paddle/phi/kernels/gpu/logspace_kernel.cu | 7 +- .../phi/kernels/gpu/logsumexp_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/logsumexp_kernel.cu | 4 +- .../phi/kernels/gpu/masked_select_kernel.cu | 5 +- paddle/phi/kernels/gpu/matmul_grad_kernel.cu | 4 +- paddle/phi/kernels/gpu/matmul_kernel.cu | 6 +- .../kernels/gpu/matrix_power_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/matrix_power_kernel.cu | 3 +- paddle/phi/kernels/gpu/matrix_rank_kernel.cu | 5 +- .../phi/kernels/gpu/matrix_rank_tol_kernel.cu | 4 +- paddle/phi/kernels/gpu/maxout_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/maxout_kernel.cu | 3 +- .../phi/kernels/gpu/mean_all_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/mean_all_kernel.cu | 6 +- .../kernels/gpu/meshgrid_grad_kernel.cu.cc | 2 +- paddle/phi/kernels/gpu/meshgrid_kernel.cu.cc | 2 +- paddle/phi/kernels/gpu/mode_grad_kernel.cu | 3 +- 
paddle/phi/kernels/gpu/mode_kernel.cu | 3 +- paddle/phi/kernels/gpu/momentum_kernel.cu | 3 +- .../phi/kernels/gpu/multi_dot_grad_kernel.cu | 5 +- paddle/phi/kernels/gpu/multi_dot_kernel.cu | 5 +- paddle/phi/kernels/gpu/multinomial_kernel.cu | 30 +- .../phi/kernels/gpu/multiplex_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/multiplex_kernel.cu | 3 +- paddle/phi/kernels/gpu/mv_grad_kernel.cu | 9 +- paddle/phi/kernels/gpu/mv_kernel.cu | 4 +- .../phi/kernels/gpu/nanmedian_grad_kernel.cu | 9 +- paddle/phi/kernels/gpu/nanmedian_kernel.cu | 49 +- paddle/phi/kernels/gpu/nll_loss.h | 2 + .../phi/kernels/gpu/nll_loss_grad_kernel.cu | 83 +- paddle/phi/kernels/gpu/nll_loss_kernel.cu | 85 +- paddle/phi/kernels/gpu/norm_grad_kernel.cu | 9 +- paddle/phi/kernels/gpu/norm_kernel.cu | 9 +- paddle/phi/kernels/gpu/one_hot_kernel.cu | 3 +- paddle/phi/kernels/gpu/p_norm_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/p_norm_kernel.cu | 3 +- paddle/phi/kernels/gpu/pad3d_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/pad3d_kernel.cu | 3 +- paddle/phi/kernels/gpu/pad_grad_kernel.cu | 3 +- .../kernels/gpu/pixel_shuffle_grad_kernel.cu | 5 +- .../phi/kernels/gpu/pixel_shuffle_kernel.cu | 5 +- .../gpu/pixel_unshuffle_grad_kernel.cu | 5 +- .../phi/kernels/gpu/pixel_unshuffle_kernel.cu | 5 +- paddle/phi/kernels/gpu/pool_grad_kernel.cu | 6 +- paddle/phi/kernels/gpu/pool_kernel.cu | 6 +- paddle/phi/kernels/gpu/prelu_funcs.h | 1 + paddle/phi/kernels/gpu/prelu_grad_kernel.cu | 27 +- paddle/phi/kernels/gpu/prelu_kernel.cu | 3 +- .../phi/kernels/gpu/psroi_pool_grad_kernel.cu | 4 +- paddle/phi/kernels/gpu/psroi_pool_kernel.cu | 30 +- .../kernels/gpu/put_along_axis_grad_kernel.cu | 3 +- .../phi/kernels/gpu/put_along_axis_kernel.cu | 3 +- paddle/phi/kernels/gpu/randint_kernel.cu | 3 +- paddle/phi/kernels/gpu/randperm_kernel.cu | 2 + paddle/phi/kernels/gpu/reduce.h | 6 +- paddle/phi/kernels/gpu/reduce_any_kernel.cu | 3 +- .../phi/kernels/gpu/reduce_max_grad_kernel.cu | 3 +- .../kernels/gpu/reduce_mean_grad_kernel.cu | 3 +- .../phi/kernels/gpu/reduce_min_grad_kernel.cu | 3 +- .../kernels/gpu/reduce_prod_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/reduce_prod_kernel.cu | 3 +- .../phi/kernels/gpu/reduce_sum_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/rmsprop_kernel.cu | 3 +- paddle/phi/kernels/gpu/rnn_functor.h | 5 +- paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc | 4 +- paddle/phi/kernels/gpu/rnn_kernel.cu.cc | 6 +- .../phi/kernels/gpu/roi_align_grad_kernel.cu | 38 +- paddle/phi/kernels/gpu/roi_align_kernel.cu | 34 +- .../phi/kernels/gpu/roi_pool_grad_kernel.cu | 36 +- paddle/phi/kernels/gpu/roi_pool_kernel.cu | 56 +- paddle/phi/kernels/gpu/roll_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/roll_kernel.cu | 3 +- paddle/phi/kernels/gpu/roll_kernel_impl.h | 33 +- paddle/phi/kernels/gpu/rrelu_grad_kernel.cu | 9 +- paddle/phi/kernels/gpu/scale_kernel.cu | 3 +- paddle/phi/kernels/gpu/searchsorted_kernel.cu | 3 +- .../kernels/gpu/segment_pool_grad_kernel.cu | 5 +- paddle/phi/kernels/gpu/segment_pool_kernel.cu | 5 +- paddle/phi/kernels/gpu/selu_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/selu_kernel.cu | 3 +- .../phi/kernels/gpu/set_value_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/set_value_kernel.cu | 3 +- paddle/phi/kernels/gpu/sgd_kernel.cu | 6 +- paddle/phi/kernels/gpu/shard_index_kernel.cu | 15 +- .../gpu/sigmoid_cross_entropy_with_logits.h | 1 + ...d_cross_entropy_with_logits_grad_kernel.cu | 3 +- ...igmoid_cross_entropy_with_logits_kernel.cu | 3 +- paddle/phi/kernels/gpu/sign_kernel.cu.cc | 2 +- paddle/phi/kernels/gpu/size_kernel.cu | 
5 +- .../phi/kernels/gpu/slice_grad_kernel.cu.cc | 2 +- paddle/phi/kernels/gpu/slice_kernel.cu.cc | 2 +- paddle/phi/kernels/gpu/softmax_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/softmax_kernel.cu | 3 +- paddle/phi/kernels/gpu/split_kernel.cu | 4 +- paddle/phi/kernels/gpu/squeeze_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/squeeze_kernel.cu | 3 +- paddle/phi/kernels/gpu/stack_grad_kernel.cu | 43 +- paddle/phi/kernels/gpu/stack_kernel.cu | 39 +- .../kernels/gpu/strided_slice_grad_kernel.cu | 3 +- .../phi/kernels/gpu/strided_slice_kernel.cu | 3 +- .../gpu/take_along_axis_grad_kernel.cu | 3 +- .../phi/kernels/gpu/take_along_axis_kernel.cu | 3 +- paddle/phi/kernels/gpu/tile_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/tile_kernel.cu | 3 +- paddle/phi/kernels/gpu/top_k_grad_kernel.cu | 9 +- paddle/phi/kernels/gpu/top_k_kernel.cu | 141 ++- paddle/phi/kernels/gpu/trace_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/trace_kernel.cu | 3 +- .../phi/kernels/gpu/transpose_grad_kernel.cu | 5 +- paddle/phi/kernels/gpu/transpose_kernel.cu | 5 +- .../kernels/gpu/triangular_solve_kernel.cu | 3 +- paddle/phi/kernels/gpu/tril_indices_kernel.cu | 3 +- .../phi/kernels/gpu/tril_triu_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/tril_triu_kernel.cu | 3 +- .../gpu/truncated_gaussian_random_kernel.cu | 4 +- paddle/phi/kernels/gpu/unbind_kernel.cu | 3 +- .../phi/kernels/gpu/uniform_random_kernel.cu | 5 +- paddle/phi/kernels/gpu/unique_kernel.cu | 5 +- .../phi/kernels/gpu/unsqueeze_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/unsqueeze_kernel.cu | 3 +- paddle/phi/kernels/gpu/unstack_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/unstack_kernel.cu | 3 +- .../phi/kernels/gpu/viterbi_decode_kernel.cu | 34 +- paddle/phi/kernels/gpu/warpctc_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/warpctc_kernel.cu | 3 +- paddle/phi/kernels/gpu/where_grad_kernel.cu | 9 +- paddle/phi/kernels/gpu/where_index_kernel.cu | 7 +- paddle/phi/kernels/gpu/where_kernel.cu | 3 +- .../kernels/gpudnn/conv_grad_grad_kernel.cu | 19 +- paddle/phi/kernels/gpudnn/conv_grad_kernel.cu | 18 +- paddle/phi/kernels/gpudnn/conv_kernel.cu | 20 +- .../gpudnn/conv_transpose_grad_kernel.cu | 4 +- .../kernels/gpudnn/conv_transpose_kernel.cu | 4 +- paddle/phi/kernels/gpudnn/pool_gpudnn.h | 1 + paddle/phi/kernels/gpudnn/pool_grad_kernel.cu | 6 +- paddle/phi/kernels/gpudnn/pool_kernel.cu | 6 +- paddle/phi/kernels/gpudnn/softmax_gpudnn.h | 82 +- .../phi/kernels/gpudnn/softmax_grad_kernel.cu | 3 +- paddle/phi/kernels/gpudnn/softmax_kernel.cu | 3 +- .../phi/kernels/graph_send_recv_grad_kernel.h | 1 + paddle/phi/kernels/graph_send_recv_kernel.h | 1 + .../phi/kernels/impl/activation_grad_impl.h | 3 +- paddle/phi/kernels/impl/activation_impl.h | 3 +- paddle/phi/kernels/impl/adagrad_kernel_impl.h | 3 +- .../phi/kernels/impl/addmm_grad_kernel_impl.h | 4 +- paddle/phi/kernels/impl/addmm_kernel_impl.h | 4 +- .../phi/kernels/impl/atan2_grad_kernel_impl.h | 3 +- paddle/phi/kernels/impl/atan2_kernel_impl.h | 3 +- .../impl/broadcast_tensors_kernel_impl.h | 4 +- .../kernels/impl/cholesky_grad_kernel_impl.h | 3 +- .../impl/cholesky_solve_grad_kernel_impl.h | 1 - .../kernels/impl/cholesky_solve_kernel_impl.h | 1 - .../phi/kernels/impl/clip_grad_kernel_impl.h | 8 +- paddle/phi/kernels/impl/clip_kernel_impl.h | 8 +- paddle/phi/kernels/impl/compare_kernel_impl.h | 3 +- .../kernels/impl/concat_grad_kernel_impl.h | 3 +- paddle/phi/kernels/impl/conv_cudnn_impl.h | 11 +- .../impl/conv_transpose_grad_kernel_impl.h | 3 +- .../kernels/impl/conv_transpose_kernel_impl.h | 3 +- 
.../impl/determinant_grad_kernel_impl.h | 3 +- .../kernels/impl/determinant_kernel_impl.h | 6 +- .../kernels/impl/digamma_grad_kernel_impl.h | 1 + paddle/phi/kernels/impl/digamma_kernel_impl.h | 1 + paddle/phi/kernels/impl/dist_kernel_impl.h | 2 + .../phi/kernels/impl/dot_grad_kernel_impl.h | 6 +- paddle/phi/kernels/impl/einsum_impl.h | 1 + .../impl/frobenius_norm_grad_kernel_impl.h | 1 - .../kernels/impl/frobenius_norm_kernel_impl.h | 3 +- .../kernels/impl/gumbel_softmax_kernel_impl.h | 1 + .../impl/kldiv_loss_grad_kernel_impl.h | 1 + .../phi/kernels/impl/kldiv_loss_kernel_impl.h | 1 + .../kernels/impl/lgamma_grad_kernel_impl.h | 1 + .../kernels/impl/matmul_grad_kernel_impl.h | 10 +- paddle/phi/kernels/impl/matmul_kernel_impl.h | 3 +- .../kernels/impl/maxout_grad_kernel_impl.h | 3 +- paddle/phi/kernels/impl/maxout_kernel_impl.h | 3 +- .../kernels/impl/meshgrid_grad_kernel_impl.h | 3 +- .../phi/kernels/impl/meshgrid_kernel_impl.h | 3 +- .../phi/kernels/impl/momentum_kernel_impl.h | 3 +- .../kernels/impl/nanmedian_grad_kernel_impl.h | 3 +- .../phi/kernels/impl/nanmedian_kernel_impl.h | 3 +- paddle/phi/kernels/impl/pad_kernel_impl.h | 1 + .../impl/pixel_shuffle_grad_kernel_impl.h | 1 + .../kernels/impl/pixel_shuffle_kernel_impl.h | 1 + .../phi/kernels/impl/pool_grad_kernel_impl.h | 3 +- paddle/phi/kernels/impl/pool_kernel_impl.h | 4 +- .../impl/reduce_max_grad_kernel_impl.h | 3 +- .../impl/reduce_min_grad_kernel_impl.h | 3 +- .../impl/reduce_prod_grad_kernel_impl.h | 3 +- paddle/phi/kernels/impl/reverse_kernel_impl.h | 3 +- paddle/phi/kernels/impl/rmsprop_kernel_impl.h | 3 +- .../impl/segment_pool_grad_kernel_impl.h | 1 + .../kernels/impl/segment_pool_kernel_impl.h | 1 + .../phi/kernels/impl/selu_grad_kernel_impl.h | 3 +- paddle/phi/kernels/impl/selu_kernel_impl.h | 1 + .../kernels/impl/set_value_grad_kernel_impl.h | 1 - .../phi/kernels/impl/set_value_kernel_impl.h | 1 - .../phi/kernels/impl/slice_grad_kernel_impl.h | 3 +- .../kernels/impl/softmax_grad_kernel_impl.h | 3 +- paddle/phi/kernels/impl/softmax_kernel_impl.h | 3 +- .../impl/strided_slice_grad_kernel_impl.h | 3 +- .../kernels/impl/strided_slice_kernel_impl.h | 3 +- .../impl/triangular_solve_grad_kernel_impl.h | 3 +- .../kernels/impl/tril_triu_grad_kernel_impl.h | 3 +- .../phi/kernels/impl/tril_triu_kernel_impl.h | 3 +- .../kernels/impl/unfold_grad_kernel_impl.h | 1 + paddle/phi/kernels/impl/unfold_kernel_impl.h | 1 + paddle/phi/kernels/kldiv_loss_kernel.h | 1 + paddle/phi/kernels/kps/compare_kernel.cu | 2 + paddle/phi/kernels/kps/reduce_all_kernel.cu | 3 +- paddle/phi/kernels/kps/reduce_max_kernel.cu | 3 +- paddle/phi/kernels/kps/reduce_mean_kernel.cu | 3 +- paddle/phi/kernels/kps/reduce_min_kernel.cu | 3 +- paddle/phi/kernels/kps/reduce_sum_kernel.cu | 3 +- .../phi/kernels/masked_select_grad_kernel.h | 2 +- paddle/phi/kernels/masked_select_kernel.h | 2 +- paddle/phi/kernels/matmul_kernel.h | 1 - paddle/phi/kernels/mv_kernel.h | 2 +- .../phi/kernels/pixel_shuffle_grad_kernel.h | 1 + paddle/phi/kernels/pixel_shuffle_kernel.h | 1 + .../phi/kernels/pixel_unshuffle_grad_kernel.h | 1 + paddle/phi/kernels/pixel_unshuffle_kernel.h | 1 + paddle/phi/kernels/pool_grad_kernel.h | 1 + paddle/phi/kernels/pool_kernel.h | 1 + .../kernels/primitive/compute_primitives.h | 208 ++--- .../primitive/compute_primitives_xpu2.h | 44 +- paddle/phi/kernels/reshape_grad_kernel.cc | 1 + paddle/phi/kernels/reshape_kernel.cc | 1 + .../selected_rows/activation_kernel.cc | 6 +- .../phi/kernels/selected_rows/full_kernel.cc | 5 +- 
.../kernels/selected_rows/gpu/adam_kernel.cu | 47 +- .../kernels/selected_rows/gpu/adamw_kernel.cu | 52 +- .../kernels/selected_rows/gpu/clip_kernel.cu | 3 +- .../selected_rows/impl/clip_kernel_impl.h | 3 +- .../phi/kernels/selected_rows/shape_kernel.cc | 1 + paddle/phi/kernels/selu_kernel.h | 2 +- paddle/phi/kernels/shape_kernel.cc | 1 + .../kernels/sparse/cpu/coalesced_kernel.cc | 1 + .../sparse/cpu/convolution_grad_kernel.cc | 1 + .../kernels/sparse/cpu/convolution_kernel.cc | 4 +- .../kernels/sparse/cpu/sparse_mask_kernel.cc | 6 +- .../sparse/cpu/sparse_pool_grad_kernel.cc | 1 + .../kernels/sparse/cpu/sparse_pool_kernel.cc | 3 +- .../kernels/sparse/cpu/sparse_utils_kernel.cc | 1 + .../kernels/sparse/gpu/coalesced_kernel.cu | 20 +- .../phi/kernels/sparse/gpu/convolution.cu.h | 34 +- .../sparse/gpu/convolution_grad_kernel.cu | 18 +- .../kernels/sparse/gpu/convolution_kernel.cu | 44 +- .../kernels/sparse/gpu/sparse_mask_kernel.cu | 2 +- .../sparse/gpu/sparse_pool_grad_kernel.cu | 27 +- .../kernels/sparse/gpu/sparse_pool_kernel.cu | 25 +- .../kernels/sparse/gpu/sparse_utils_kernel.cu | 25 +- .../sparse/sparse_utils_grad_kernel.cc | 1 + paddle/phi/kernels/split_kernel.h | 3 +- paddle/phi/kernels/strings/case_utils.h | 1 + .../strings/cpu/strings_copy_kernel.cc | 2 +- .../strings/gpu/strings_copy_kernel.cu | 8 +- .../strings/gpu/strings_lower_upper_kernel.cu | 8 +- .../strings/strings_lower_upper_kernel.h | 3 +- paddle/phi/kernels/strings/unicode.cc | 2 + paddle/phi/kernels/strings/unicode.h | 1 + paddle/phi/kernels/transpose_grad_kernel.h | 1 + paddle/phi/ops/compat/matrix_rank_sig.cc | 4 +- paddle/phi/ops/compat/segment_pool_sig.cc | 6 +- paddle/phi/tests/api/scale_api.h | 1 - paddle/phi/tests/api/test_cast_api.cc | 2 +- paddle/phi/tests/api/test_concat_api.cc | 2 +- paddle/phi/tests/api/test_conj_api.cc | 2 +- paddle/phi/tests/api/test_data_transform.cc | 1 + paddle/phi/tests/api/test_dot_api.cc | 2 +- paddle/phi/tests/api/test_elementwise_api.cc | 2 +- paddle/phi/tests/api/test_embedding_api.cc | 2 +- paddle/phi/tests/api/test_empty_api.cc | 2 +- paddle/phi/tests/api/test_fill_api.cc | 2 +- paddle/phi/tests/api/test_matmul_api.cc | 2 +- paddle/phi/tests/api/test_mean_api.cc | 2 +- paddle/phi/tests/api/test_pten_exception.cc | 1 + paddle/phi/tests/api/test_reshape_api.cc | 2 +- paddle/phi/tests/api/test_scale_api.cc | 2 +- paddle/phi/tests/api/test_scale_benchmark.cc | 2 +- paddle/phi/tests/api/test_slice_api.cc | 1 + paddle/phi/tests/api/test_sparse_conv_api.cc | 3 +- paddle/phi/tests/api/test_sparse_utils_api.cc | 3 +- paddle/phi/tests/api/test_split_api.cc | 2 +- .../phi/tests/api/test_strings_empty_api.cc | 1 + .../tests/api/test_strings_lower_upper_api.cc | 1 + paddle/phi/tests/api/test_sum_api.cc | 2 +- paddle/phi/tests/api/test_to_api.cc | 2 +- paddle/phi/tests/common/test_backend.cc | 1 + paddle/phi/tests/common/test_data_layout.cc | 1 + paddle/phi/tests/common/test_data_type.cc | 1 + paddle/phi/tests/common/test_int_array.cc | 7 +- paddle/phi/tests/common/test_place.cc | 4 +- paddle/phi/tests/common/test_scalar.cu | 1 + paddle/phi/tests/core/test_dense_tensor.cc | 3 +- paddle/phi/tests/core/test_dim.cu | 1 + paddle/phi/tests/core/test_intrusive_ptr.cc | 1 - paddle/phi/tests/core/test_kernel_factory.cc | 3 +- paddle/phi/tests/core/test_rw_lock.cc | 7 +- paddle/phi/tests/core/test_selected_rows.cc | 1 + .../phi/tests/core/test_sparse_coo_tensor.cc | 3 +- .../phi/tests/core/test_sparse_csr_tensor.cc | 1 - paddle/phi/tests/core/test_string_tensor.cc | 2 +- 
paddle/phi/tests/core/test_type_info.cc | 1 - .../phi/tests/core/unroll_array_ops_test.cc | 1 + paddle/phi/tests/kernels/test_cast_dev_api.cc | 6 +- .../phi/tests/kernels/test_concat_dev_api.cc | 4 +- paddle/phi/tests/kernels/test_conj_dev_api.cc | 6 +- paddle/phi/tests/kernels/test_copy_dev_api.cc | 8 +- .../tests/kernels/test_creation_dev_api.cc | 8 +- paddle/phi/tests/kernels/test_dot_dev_api.cc | 6 +- .../tests/kernels/test_elementwise_dev_api.cc | 10 +- .../phi/tests/kernels/test_flatten_dev_api.cc | 6 +- .../phi/tests/kernels/test_matmul_dev_api.cc | 4 +- paddle/phi/tests/kernels/test_mean_dev_api.cc | 4 +- .../phi/tests/kernels/test_reshape_dev_api.cc | 4 +- .../phi/tests/kernels/test_scale_dev_api.cc | 4 +- .../kernels/test_sparse_activation_dev_api.cc | 6 +- .../kernels/test_sparse_conv3d_dev_api.cc | 8 +- .../tests/kernels/test_sparse_pool_dev_api.cc | 59 +- .../kernels/test_sparse_utils_dev_api.cc | 11 +- .../phi/tests/kernels/test_split_dev_api.cc | 4 +- .../kernels/test_strings_copy_dev_api.cc | 2 + .../kernels/test_strings_copy_dev_api.cu | 2 + .../test_strings_lower_upper_dev_api.cc | 4 +- .../test_strings_lower_upper_dev_api.cu | 3 +- paddle/phi/tests/kernels/test_sum_dev_api.cc | 4 +- paddle/phi/tests/ops/test_op_signature.cc | 1 + paddle/phi/tests/ops/test_op_signature.h | 1 + paddle/utils/flat_hash_map.h | 20 +- paddle/utils/none.h | 2 +- paddle/utils/optional.h | 2 +- paddle/utils/small_vector.h | 21 +- paddle/utils/string/piece.cc | 6 +- paddle/utils/string/pretty_log.cc | 1 + paddle/utils/string/pretty_log.h | 14 +- paddle/utils/string/string_helper.cc | 1 + paddle/utils/string/string_helper.h | 1 + paddle/utils/string/tinyformat/tinyformat.h | 12 +- paddle/utils/string/to_string_test.cc | 1 + paddle/utils/variant.h | 231 ++--- paddle/utils/variant_test.cc | 1 + tools/codestyle/clang_format.hook | 7 +- 2757 files changed, 12664 insertions(+), 10779 deletions(-) mode change 100755 => 100644 paddle/fluid/distributed/ps/service/brpc_ps_client.cc mode change 100755 => 100644 paddle/fluid/distributed/ps/service/brpc_ps_server.cc mode change 100755 => 100644 paddle/fluid/distributed/ps/wrapper/ps_wrapper.h mode change 100755 => 100644 paddle/fluid/framework/data_feed.cc mode change 100755 => 100644 paddle/fluid/framework/string_array.cc mode change 100755 => 100644 paddle/fluid/operators/expand_as_v2_op.cc mode change 100755 => 100644 paddle/fluid/operators/fused/fused_dropout_act_bias.h mode change 100755 => 100644 paddle/fluid/operators/interpolate_op_npu.cc mode change 100755 => 100644 paddle/fluid/operators/pscore/heter_listen_and_serv_op.h mode change 100755 => 100644 paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 42181c8f959..4b588cbeb91 100755 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -33,6 +33,10 @@ repos: entry: bash ./tools/codestyle/clang_format.hook -i language: system files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|xpu|kps)$ + exclude: | + (?x)^( + paddle/fluid/distributed/ps/thirdparty/round_robin.h + )$ - repo: local hooks: - id: cpplint-cpp-source diff --git a/paddle/fluid/distributed/collective/HCCLTools.cc b/paddle/fluid/distributed/collective/HCCLTools.cc index 526a683e057..676a71cb30d 100644 --- a/paddle/fluid/distributed/collective/HCCLTools.cc +++ b/paddle/fluid/distributed/collective/HCCLTools.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/distributed/collective/HCCLTools.h" + #include "paddle/fluid/distributed/collective/Types.h" namespace paddle { diff --git a/paddle/fluid/distributed/collective/HCCLTools.h b/paddle/fluid/distributed/collective/HCCLTools.h index a1dcf7cd9b6..4955e24eadb 100644 --- a/paddle/fluid/distributed/collective/HCCLTools.h +++ b/paddle/fluid/distributed/collective/HCCLTools.h @@ -15,6 +15,7 @@ #pragma once #include + #include #include "boost/variant.hpp" diff --git a/paddle/fluid/distributed/collective/NCCLTools.cc b/paddle/fluid/distributed/collective/NCCLTools.cc index 7e842ebf921..2cecaf0734d 100644 --- a/paddle/fluid/distributed/collective/NCCLTools.cc +++ b/paddle/fluid/distributed/collective/NCCLTools.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/distributed/collective/NCCLTools.h" + #include "paddle/fluid/distributed/collective/Types.h" namespace paddle { diff --git a/paddle/fluid/distributed/collective/NCCLTools.h b/paddle/fluid/distributed/collective/NCCLTools.h index 0454518b183..f38ce8faa7f 100644 --- a/paddle/fluid/distributed/collective/NCCLTools.h +++ b/paddle/fluid/distributed/collective/NCCLTools.h @@ -16,9 +16,11 @@ #include #include + #include #include "boost/variant.hpp" +#include "paddle/fluid/distributed/collective/Types.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/cuda_device_guard.h" @@ -26,8 +28,6 @@ #include "paddle/fluid/platform/dynload/nccl.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/distributed/collective/Types.h" - namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h index 52e09792d5d..7ed6b188fd2 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.h +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -21,7 +21,6 @@ #include "paddle/fluid/distributed/collective/Types.h" #include "paddle/fluid/eager/api/utils/tensor_utils.h" - #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc index 824341c3cd9..1a390e38755 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc @@ -27,6 +27,7 @@ #include #include #include + #include "paddle/fluid/distributed/collective/Common.h" #include "paddle/fluid/distributed/collective/ProcessGroupGloo.h" #include "paddle/fluid/framework/fleet/gloo_wrapper.h" @@ -485,8 +486,9 @@ std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo::createDefaultDevice() { std::array hostname{}; auto ret = ::gethostname(hostname.data(), HOST_NAME_MAX); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::Fatal( - "Get hostname error for createDefaultDevice.")); + PADDLE_ENFORCE_EQ( + ret, 0, + platform::errors::Fatal("Get hostname error for createDefaultDevice.")); ::addrinfo* result; result = tcputils::get_addr_info(hostname.data(), "", 0, AF_UNSPEC); ::addrinfo* cur; diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc index 9ed6c2198df..50249b03967 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/distributed/collective/ProcessGroupHCCL.h" + #include "paddle/fluid/distributed/collective/Common.h" #include "paddle/fluid/distributed/collective/HCCLTools.h" #include "paddle/fluid/memory/malloc.h" @@ -216,15 +217,16 @@ std::shared_ptr ProcessGroupHCCL::AllReduce( std::vector& in_tensors, // NOLINT std::vector& out_tensors, // NOLINT const AllreduceOptions& opts) { - return Collective(in_tensors, out_tensors, - [&](phi::DenseTensor& input, phi::DenseTensor& output, - HcclComm comm, const aclrtStream& stream) { - return platform::dynload::HcclAllReduce( - input.data(), output.data(), input.numel(), - platform::ToHCCLDataType(input.dtype()), - ToHCCLRedType(opts.reduce_op), comm, stream); - }, - CommType::ALLREDUCE); + return Collective( + in_tensors, out_tensors, + [&](phi::DenseTensor& input, phi::DenseTensor& output, HcclComm comm, + const aclrtStream& stream) { + return platform::dynload::HcclAllReduce( + input.data(), output.data(), input.numel(), + platform::ToHCCLDataType(input.dtype()), + ToHCCLRedType(opts.reduce_op), comm, stream); + }, + CommType::ALLREDUCE); } std::shared_ptr ProcessGroupHCCL::Broadcast( diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.h b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h index 2f0ff6b9565..a32984798fe 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h @@ -21,12 +21,11 @@ #include #include +#include "paddle/fluid/distributed/collective/HCCLTools.h" #include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/distributed/store/store.h" #include "paddle/fluid/platform/device/npu/npu_stream.h" #include "paddle/fluid/platform/device_context.h" - -#include "paddle/fluid/distributed/collective/HCCLTools.h" -#include "paddle/fluid/distributed/store/store.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/gen_comm_id_helper.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/distributed/collective/ProcessGroupHeter.cc b/paddle/fluid/distributed/collective/ProcessGroupHeter.cc index 0911a4a3e3e..0b388a6a848 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHeter.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupHeter.cc @@ -13,7 +13,9 @@ // limitations under the License. 
#include "paddle/fluid/distributed/collective/ProcessGroupHeter.h" + #include + #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/api/include/api.h" @@ -129,8 +131,9 @@ std::shared_ptr ProcessGroupHeter::AllReduce( gid_, {dense_cpu_tensor.name()}, send_size, dense_cpu_tensor.data(), dense_cpu_tensor.numel() * framework::DataTypeSize(dense_cpu_tensor.dtype())); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "Send to the switch module error.")); + PADDLE_ENFORCE_EQ(ret, 0, + platform::errors::PreconditionNotMet( + "Send to the switch module error.")); phi::DenseTensor cpu_tensor2; cpu_tensor2.AllocateFrom( std::make_unique( @@ -140,8 +143,9 @@ std::shared_ptr ProcessGroupHeter::AllReduce( ret = client_->Recv( gid_, {dense_cpu_tensor.name()}, cpu_tensor2.data(), cpu_tensor2.numel() * framework::DataTypeSize(cpu_tensor2.dtype())); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "Recv from the switch module error.")); + PADDLE_ENFORCE_EQ(ret, 0, + platform::errors::PreconditionNotMet( + "Recv from the switch module error.")); switch (dense_cpu_tensor.dtype()) { case DataType::FLOAT32: @@ -226,8 +230,9 @@ std::shared_ptr ProcessGroupHeter::Broadcast( dense_cpu_tensor.data(), dense_cpu_tensor.numel() * framework::DataTypeSize(dense_cpu_tensor.dtype())); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "Send to the switch module error.")); + PADDLE_ENFORCE_EQ(ret, 0, + platform::errors::PreconditionNotMet( + "Send to the switch module error.")); } else { int ret = client_->Recv( gid_, {dense_cpu_tensor.name()}, dense_cpu_tensor.data(), @@ -286,8 +291,9 @@ std::shared_ptr ProcessGroupHeter::Send( VLOG(2) << "tensor_name:" << tensor_name; int ret = client_->Send(gid_, {tensor_name}, send_size, cpu_tensor.data(), tensor_size); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "Send to the switch module error.")); + PADDLE_ENFORCE_EQ( + ret, 0, + platform::errors::PreconditionNotMet("Send to the switch module error.")); return CreateTask(rank_, CommType::SEND, in_tensors); } @@ -319,8 +325,9 @@ std::shared_ptr ProcessGroupHeter::Recv( int ret = client_->Recv( gid_, {tensor_name}, cpu_tensor.data(), cpu_tensor.numel() * framework::DataTypeSize(cpu_tensor.dtype())); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "receive to the switch module error.")); + PADDLE_ENFORCE_EQ(ret, 0, + platform::errors::PreconditionNotMet( + "receive to the switch module error.")); auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration diff = end - start; double goodput = cpu_tensor.numel() * diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index f1b66864b29..dc67205c78f 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" + #include "paddle/fluid/distributed/collective/Common.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -320,15 +321,16 @@ std::shared_ptr ProcessGroupNCCL::AllReduce( PADDLE_ENFORCE_EQ( CheckTensorsInCudaPlace(in_tensors), true, platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); - return Collective(in_tensors, out_tensors, - [&](const phi::DenseTensor& input, phi::DenseTensor& output, - ncclComm_t comm, const gpuStream_t& stream) { - return platform::dynload::ncclAllReduce( - input.data(), output.data(), input.numel(), - platform::ToNCCLDataType(input.type()), - ToNCCLRedType(opts.reduce_op), comm, stream); - }, - CommType::ALLREDUCE); + return Collective( + in_tensors, out_tensors, + [&](const phi::DenseTensor& input, phi::DenseTensor& output, + ncclComm_t comm, const gpuStream_t& stream) { + return platform::dynload::ncclAllReduce( + input.data(), output.data(), input.numel(), + platform::ToNCCLDataType(input.type()), + ToNCCLRedType(opts.reduce_op), comm, stream); + }, + CommType::ALLREDUCE); } std::shared_ptr ProcessGroupNCCL::Broadcast( @@ -338,17 +340,17 @@ std::shared_ptr ProcessGroupNCCL::Broadcast( CheckTensorsInCudaPlace(in_tensors), true, platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); - return Collective(in_tensors, out_tensors, - [&](phi::DenseTensor& input, phi::DenseTensor& output, - ncclComm_t comm, const gpuStream_t& stream) { - const auto root = opts.source_rank * in_tensors.size() + - opts.source_root; - return platform::dynload::ncclBroadcast( - input.data(), output.data(), input.numel(), - platform::ToNCCLDataType(input.type()), root, comm, - stream); - }, - CommType::BROADCAST); + return Collective( + in_tensors, out_tensors, + [&](phi::DenseTensor& input, phi::DenseTensor& output, ncclComm_t comm, + const gpuStream_t& stream) { + const auto root = + opts.source_rank * in_tensors.size() + opts.source_root; + return platform::dynload::ncclBroadcast( + input.data(), output.data(), input.numel(), + platform::ToNCCLDataType(input.type()), root, comm, stream); + }, + CommType::BROADCAST); } std::shared_ptr ProcessGroupNCCL::Barrier( @@ -400,15 +402,15 @@ std::shared_ptr ProcessGroupNCCL::Send( std::vector& tensors, int dst_rank) { CheckTensorsInDifferentDevices(tensors, static_cast(GetSize())); - auto task = PointToPoint(tensors, - [&](phi::DenseTensor& input, ncclComm_t comm, - const gpuStream_t& stream, int dst_rank) { - return platform::dynload::ncclSend( - input.data(), input.numel(), - platform::ToNCCLDataType(input.dtype()), - dst_rank, comm, stream); - }, - dst_rank, CommType::SEND); + auto task = PointToPoint( + tensors, + [&](phi::DenseTensor& input, ncclComm_t comm, const gpuStream_t& stream, + int dst_rank) { + return platform::dynload::ncclSend( + input.data(), input.numel(), + platform::ToNCCLDataType(input.dtype()), dst_rank, comm, stream); + }, + dst_rank, CommType::SEND); return task; } @@ -416,15 +418,15 @@ std::shared_ptr ProcessGroupNCCL::Recv( std::vector& tensors, int src_rank) { CheckTensorsInDifferentDevices(tensors, static_cast(GetSize())); - auto task = PointToPoint(tensors, - [&](phi::DenseTensor& output, ncclComm_t comm, - const gpuStream_t& stream, int src_rank) { - return platform::dynload::ncclRecv( - output.data(), output.numel(), - platform::ToNCCLDataType(output.dtype()), - src_rank, comm, stream); - }, - src_rank, CommType::RECV); + auto task = 
PointToPoint( + tensors, + [&](phi::DenseTensor& output, ncclComm_t comm, const gpuStream_t& stream, + int src_rank) { + return platform::dynload::ncclRecv( + output.data(), output.numel(), + platform::ToNCCLDataType(output.dtype()), src_rank, comm, stream); + }, + src_rank, CommType::RECV); return task; } @@ -440,15 +442,15 @@ std::shared_ptr ProcessGroupNCCL::Send_Partial( std::vector shared_tensors; shared_tensors.push_back(shared_input); - auto task = PointToPoint(shared_tensors, - [&](phi::DenseTensor& input, ncclComm_t comm, - const gpuStream_t& stream, int dst_rank) { - return platform::dynload::ncclSend( - input.data(), input.numel(), - platform::ToNCCLDataType(input.dtype()), - dst_rank, comm, stream); - }, - dst_rank, CommType::SEND); + auto task = PointToPoint( + shared_tensors, + [&](phi::DenseTensor& input, ncclComm_t comm, const gpuStream_t& stream, + int dst_rank) { + return platform::dynload::ncclSend( + input.data(), input.numel(), + platform::ToNCCLDataType(input.dtype()), dst_rank, comm, stream); + }, + dst_rank, CommType::SEND); return task; } @@ -463,15 +465,15 @@ std::shared_ptr ProcessGroupNCCL::Recv_Partial( std::vector shared_tensors; shared_tensors.push_back(shared_input); - auto task = PointToPoint(shared_tensors, - [&](phi::DenseTensor& output, ncclComm_t comm, - const gpuStream_t& stream, int src_rank) { - return platform::dynload::ncclRecv( - output.data(), output.numel(), - platform::ToNCCLDataType(output.dtype()), - src_rank, comm, stream); - }, - src_rank, CommType::RECV); + auto task = PointToPoint( + shared_tensors, + [&](phi::DenseTensor& output, ncclComm_t comm, const gpuStream_t& stream, + int src_rank) { + return platform::dynload::ncclRecv( + output.data(), output.numel(), + platform::ToNCCLDataType(output.dtype()), src_rank, comm, stream); + }, + src_rank, CommType::RECV); return task; } @@ -484,15 +486,15 @@ std::shared_ptr ProcessGroupNCCL::AllGather( PADDLE_ENFORCE_EQ( CheckTensorsInCudaPlace(out_tensors), true, platform::errors::InvalidArgument("All outputs should be in CudaPlace.")); - return Collective(in_tensors, out_tensors, - [&](const phi::DenseTensor& input, phi::DenseTensor& output, - ncclComm_t comm, const gpuStream_t& stream) { - return platform::dynload::ncclAllGather( - input.data(), output.data(), input.numel(), - platform::ToNCCLDataType(input.dtype()), comm, - stream); - }, - CommType::ALLGATHER); + return Collective( + in_tensors, out_tensors, + [&](const phi::DenseTensor& input, phi::DenseTensor& output, + ncclComm_t comm, const gpuStream_t& stream) { + return platform::dynload::ncclAllGather( + input.data(), output.data(), input.numel(), + platform::ToNCCLDataType(input.dtype()), comm, stream); + }, + CommType::ALLGATHER); } void* GetPointerByOffset(void* raw_pointer, size_t offset, diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index 82ced6e135a..2325e645b4c 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -22,10 +22,9 @@ #include #include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/distributed/store/store.h" #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/device_context.h" - -#include "paddle/fluid/distributed/store/store.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/gen_comm_id_helper.h" #include "paddle/fluid/platform/place.h" diff --git 
a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index 96009ce7229..9c04b95a732 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -403,8 +403,9 @@ void EagerReducer::InitializeDenseGroups( "Tensor %s is not initialized.", tensor_name)); const auto size = tensor.numel(); PADDLE_ENFORCE_GT( - size, 0, platform::errors::PreconditionNotMet( - "The number of tensor %s's elements is 0.", tensor_name)); + size, 0, + platform::errors::PreconditionNotMet( + "The number of tensor %s's elements is 0.", tensor_name)); all_length += size; p_group->length_.push_back(size); diff --git a/paddle/fluid/distributed/collective/reducer.h b/paddle/fluid/distributed/collective/reducer.h index 424bae0e5ac..0527ceb9b51 100644 --- a/paddle/fluid/distributed/collective/reducer.h +++ b/paddle/fluid/distributed/collective/reducer.h @@ -16,6 +16,7 @@ #include #include + #include "paddle/fluid/distributed/collective/ProcessGroup.h" #include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/api/utils/hook_utils.h" diff --git a/paddle/fluid/distributed/common/afs_warpper.cc b/paddle/fluid/distributed/common/afs_warpper.cc index d539ec60804..3a37c6be7c2 100644 --- a/paddle/fluid/distributed/common/afs_warpper.cc +++ b/paddle/fluid/distributed/common/afs_warpper.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/distributed/common/afs_warpper.h" + #include "paddle/fluid/framework/io/fs.h" namespace paddle { @@ -27,9 +28,10 @@ int AfsClient::initialize(const FsClientParameter& fs_client_param) { int AfsClient::initialize(const std::string& hadoop_bin, const std::string& uri, const std::string& user, const std::string& passwd, int buffer_size_param) { - return initialize(hadoop_bin, uri, paddle::string::format_string( - "%s,%s", user.c_str(), passwd.c_str()), - buffer_size_param); + return initialize( + hadoop_bin, uri, + paddle::string::format_string("%s,%s", user.c_str(), passwd.c_str()), + buffer_size_param); } int AfsClient::initialize(const std::string& hadoop_bin, const std::string& uri, const std::string& ugi, int buffer_size_param) { diff --git a/paddle/fluid/distributed/common/afs_warpper.h b/paddle/fluid/distributed/common/afs_warpper.h index d10668046c0..cef3e5ae35c 100644 --- a/paddle/fluid/distributed/common/afs_warpper.h +++ b/paddle/fluid/distributed/common/afs_warpper.h @@ -19,6 +19,7 @@ #include #include #include + #include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/string/string_helper.h" diff --git a/paddle/fluid/distributed/common/cost_timer.h b/paddle/fluid/distributed/common/cost_timer.h index 5073dc9cf50..1651121ee0c 100644 --- a/paddle/fluid/distributed/common/cost_timer.h +++ b/paddle/fluid/distributed/common/cost_timer.h @@ -15,6 +15,7 @@ #pragma once #include #include + #include "butil/time.h" #include "bvar/latency_recorder.h" #include "glog/logging.h" diff --git a/paddle/fluid/distributed/common/local_random.h b/paddle/fluid/distributed/common/local_random.h index 96b8d2d21a5..5a9a3b595d0 100644 --- a/paddle/fluid/distributed/common/local_random.h +++ b/paddle/fluid/distributed/common/local_random.h @@ -15,6 +15,7 @@ #pragma once #include #include + #include #include diff --git a/paddle/fluid/distributed/common/registerer.h b/paddle/fluid/distributed/common/registerer.h index 630be930c14..f4938c0f93f 100644 --- a/paddle/fluid/distributed/common/registerer.h +++ b/paddle/fluid/distributed/common/registerer.h @@ 
-15,6 +15,7 @@ #pragma once #include + #include #include #include diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index 53bae87c002..754a3f5d2b2 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/distributed/fleet_executor/carrier.h" + #include -#include "paddle/fluid/distributed/fleet_executor/carrier.h" #include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" #include "paddle/fluid/distributed/fleet_executor/message_bus.h" @@ -148,8 +149,9 @@ void Carrier::WakeUp() { } void Carrier::Start() { - PADDLE_ENFORCE_EQ(is_init_, true, platform::errors::PreconditionNotMet( - "Using carrier before initialized.")); + PADDLE_ENFORCE_EQ(is_init_, true, + platform::errors::PreconditionNotMet( + "Using carrier before initialized.")); for (int64_t id : source_interceptor_ids_) { VLOG(3) << "Carrier Start is sending start to source interceptor " << id << "."; diff --git a/paddle/fluid/distributed/fleet_executor/carrier.h b/paddle/fluid/distributed/fleet_executor/carrier.h index d35a3260915..2846af97716 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.h +++ b/paddle/fluid/distributed/fleet_executor/carrier.h @@ -35,7 +35,7 @@ namespace paddle { namespace framework { class Scope; class ProgramDesc; -} +} // namespace framework namespace distributed { diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc index fb907e3b5c2..4ba11fa7e32 100644 --- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc @@ -13,8 +13,8 @@ // limitations under the License. #include "paddle/fluid/distributed/fleet_executor/compute_interceptor.h" -#include "paddle/fluid/distributed/fleet_executor/carrier.h" +#include "paddle/fluid/distributed/fleet_executor/carrier.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" #include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.cc b/paddle/fluid/distributed/fleet_executor/dist_model.cc index d8f937e218b..8fe73d77494 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.cc +++ b/paddle/fluid/distributed/fleet_executor/dist_model.cc @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "paddle/fluid/distributed/fleet_executor/dist_model.h" + #include + #include // NOLINT -#include "paddle/fluid/distributed/fleet_executor/dist_model.h" #include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" #include "paddle/fluid/framework/block_desc.h" @@ -294,8 +296,9 @@ bool DistModel::PrepareProgram() { bool DistModel::LoadProgram() { VLOG(3) << "Loading program from " << config_.model_dir; - PADDLE_ENFORCE_NE(config_.model_dir, "", platform::errors::InvalidArgument( - "Model dir must be provided.")); + PADDLE_ENFORCE_NE( + config_.model_dir, "", + platform::errors::InvalidArgument("Model dir must be provided.")); std::string model_path = config_.model_dir + ".pdmodel"; framework::proto::ProgramDesc program_proto; std::string pb_content; diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.h b/paddle/fluid/distributed/fleet_executor/dist_model.h index d0203c13135..f5c1d47afb1 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.h +++ b/paddle/fluid/distributed/fleet_executor/dist_model.h @@ -31,7 +31,7 @@ namespace framework { class ProgramDesc; class Scope; class BlockDesc; -} +} // namespace framework namespace distributed { diff --git a/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.cc b/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.cc index b440d39c73a..b7f590e7a8c 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.cc +++ b/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h" + #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h b/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h index dc8b2596803..459e609762d 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h +++ b/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h @@ -15,6 +15,7 @@ #pragma once #include #include + #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/macros.h" diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc index e946d78550f..c4d7f3c7a69 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc @@ -11,9 +11,10 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
+#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" + #include -#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" #include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/message_bus.h" #include "paddle/fluid/distributed/fleet_executor/runtime_graph.h" diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.h b/paddle/fluid/distributed/fleet_executor/fleet_executor.h index ccdb3dcc459..176e5dab0da 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.h +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.h @@ -25,7 +25,7 @@ namespace paddle { namespace framework { class ProgramDesc; class Scope; -} +} // namespace framework namespace distributed { class RuntimeGraph; diff --git a/paddle/fluid/distributed/fleet_executor/interceptor.cc b/paddle/fluid/distributed/fleet_executor/interceptor.cc index 710ebda4124..2ff2bc04ff8 100644 --- a/paddle/fluid/distributed/fleet_executor/interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/interceptor.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/distributed/fleet_executor/interceptor.h" + #include "paddle/fluid/distributed/fleet_executor/carrier.h" #include "paddle/fluid/distributed/fleet_executor/task_loop.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" diff --git a/paddle/fluid/distributed/fleet_executor/interceptor.h b/paddle/fluid/distributed/fleet_executor/interceptor.h index 86ca7be7f44..00fe2154d28 100644 --- a/paddle/fluid/distributed/fleet_executor/interceptor.h +++ b/paddle/fluid/distributed/fleet_executor/interceptor.h @@ -33,7 +33,7 @@ namespace paddle { namespace framework { class Scope; class GarbageCollector; -} +} // namespace framework namespace distributed { class TaskNode; diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.cc b/paddle/fluid/distributed/fleet_executor/message_bus.cc index 80a6b4667aa..76762af9e7e 100644 --- a/paddle/fluid/distributed/fleet_executor/message_bus.cc +++ b/paddle/fluid/distributed/fleet_executor/message_bus.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/distributed/fleet_executor/message_bus.h" + #include #include #include @@ -19,7 +21,6 @@ #include "paddle/fluid/distributed/fleet_executor/carrier.h" #include "paddle/fluid/distributed/fleet_executor/global.h" -#include "paddle/fluid/distributed/fleet_executor/message_bus.h" #include "paddle/fluid/platform/gen_comm_id_helper.h" namespace paddle { @@ -28,8 +29,9 @@ namespace distributed { void MessageBus::Init( int64_t rank, const std::unordered_map& rank_to_addr, const std::string& addr) { - PADDLE_ENFORCE_EQ(is_init_, false, platform::errors::AlreadyExists( - "MessageBus is already init.")); + PADDLE_ENFORCE_EQ( + is_init_, false, + platform::errors::AlreadyExists("MessageBus is already init.")); rank_ = rank; is_init_ = true; rank_to_addr_ = rank_to_addr; diff --git a/paddle/fluid/distributed/fleet_executor/message_service.cc b/paddle/fluid/distributed/fleet_executor/message_service.cc index 1c66d83ea34..9d42b0d73db 100644 --- a/paddle/fluid/distributed/fleet_executor/message_service.cc +++ b/paddle/fluid/distributed/fleet_executor/message_service.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) #include "paddle/fluid/distributed/fleet_executor/message_service.h" + #include "brpc/server.h" #include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/message_bus.h" diff --git a/paddle/fluid/distributed/fleet_executor/runtime_graph.cc b/paddle/fluid/distributed/fleet_executor/runtime_graph.cc index 614b4c37e82..a5f90062dcf 100644 --- a/paddle/fluid/distributed/fleet_executor/runtime_graph.cc +++ b/paddle/fluid/distributed/fleet_executor/runtime_graph.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/distributed/fleet_executor/runtime_graph.h" + #include "paddle/fluid/distributed/fleet_executor/task_node.h" namespace paddle { diff --git a/paddle/fluid/distributed/fleet_executor/runtime_graph.h b/paddle/fluid/distributed/fleet_executor/runtime_graph.h index 1ca9f0174ed..a59a43cc200 100644 --- a/paddle/fluid/distributed/fleet_executor/runtime_graph.h +++ b/paddle/fluid/distributed/fleet_executor/runtime_graph.h @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/distributed/fleet_executor/fleet_executor_desc.pb.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/platform/macros.h" diff --git a/paddle/fluid/distributed/fleet_executor/sink_interceptor.cc b/paddle/fluid/distributed/fleet_executor/sink_interceptor.cc index 77fbb23a6c7..9d9e6c03565 100644 --- a/paddle/fluid/distributed/fleet_executor/sink_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/sink_interceptor.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/distributed/fleet_executor/sink_interceptor.h" + #include "paddle/fluid/distributed/fleet_executor/task_node.h" namespace paddle { diff --git a/paddle/fluid/distributed/fleet_executor/source_interceptor.cc b/paddle/fluid/distributed/fleet_executor/source_interceptor.cc index 78b2bed66dd..6b2fd5565ea 100644 --- a/paddle/fluid/distributed/fleet_executor/source_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/source_interceptor.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/distributed/fleet_executor/source_interceptor.h" + #include "paddle/fluid/distributed/fleet_executor/task_node.h" namespace paddle { diff --git a/paddle/fluid/distributed/fleet_executor/task_loop_thread.cc b/paddle/fluid/distributed/fleet_executor/task_loop_thread.cc index bb313ad3789..90765dbdd2d 100644 --- a/paddle/fluid/distributed/fleet_executor/task_loop_thread.cc +++ b/paddle/fluid/distributed/fleet_executor/task_loop_thread.cc @@ -31,8 +31,9 @@ TaskLoopThread::~TaskLoopThread() { } TaskLoop* TaskLoopThread::StartLoop() { - PADDLE_ENFORCE_EQ(start_, false, platform::errors::PreconditionNotMet( - "thread is already running.")); + PADDLE_ENFORCE_EQ( + start_, false, + platform::errors::PreconditionNotMet("thread is already running.")); start_ = true; thread_ = std::thread([this]() { Loop(); }); diff --git a/paddle/fluid/distributed/fleet_executor/task_loop_thread_pool.cc b/paddle/fluid/distributed/fleet_executor/task_loop_thread_pool.cc index ed34bbb87fc..e962a29b4a1 100644 --- a/paddle/fluid/distributed/fleet_executor/task_loop_thread_pool.cc +++ b/paddle/fluid/distributed/fleet_executor/task_loop_thread_pool.cc @@ -30,8 +30,9 @@ TaskLoopThreadPool::TaskLoopThreadPool(int thread_num) TaskLoopThreadPool::~TaskLoopThreadPool() = default; void TaskLoopThreadPool::Start() { - PADDLE_ENFORCE_EQ(start_, false, platform::errors::PreconditionNotMet( - "thread pool is already start.")); + PADDLE_ENFORCE_EQ( + start_, false, + platform::errors::PreconditionNotMet("thread pool is already start.")); PADDLE_ENFORCE_GT( thread_num_, 0, platform::errors::InvalidArgument( @@ -45,10 +46,12 @@ void TaskLoopThreadPool::Start() { } TaskLoop* TaskLoopThreadPool::GetLoop(int tid) { - PADDLE_ENFORCE_EQ(start_, true, platform::errors::PreconditionNotMet( - "thread pool must start first.")); - PADDLE_ENFORCE_GE(tid, 0, platform::errors::OutOfRange( - "tid must >= 0, but now is %d", tid)); + PADDLE_ENFORCE_EQ( + start_, true, + platform::errors::PreconditionNotMet("thread pool must start first.")); + PADDLE_ENFORCE_GE( + tid, 0, + platform::errors::OutOfRange("tid must >= 0, but now is %d", tid)); PADDLE_ENFORCE_LT(tid, thread_num_, platform::errors::OutOfRange( "tid must < thread_num, but now tid=%d thread_num=%d", @@ -57,8 +60,9 @@ TaskLoop* TaskLoopThreadPool::GetLoop(int tid) { } std::vector TaskLoopThreadPool::GetAllLoops() { - PADDLE_ENFORCE_EQ(start_, true, platform::errors::PreconditionNotMet( - "thread pool must start first.")); + PADDLE_ENFORCE_EQ( + start_, true, + platform::errors::PreconditionNotMet("thread pool must start first.")); return loops_; } diff --git a/paddle/fluid/distributed/fleet_executor/task_node.cc b/paddle/fluid/distributed/fleet_executor/task_node.cc index 232317333ea..00ae30d281e 100644 --- a/paddle/fluid/distributed/fleet_executor/task_node.cc +++ b/paddle/fluid/distributed/fleet_executor/task_node.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/distributed/fleet_executor/task_node.h" + #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -153,15 +154,17 @@ void TaskNode::SetRunAtOffset(int64_t value) { void TaskNode::SetReplyUpPerSteps(int64_t value) { PADDLE_ENFORCE_GE( - value, 1, platform::errors::InvalidArgument( - "reply_up_per_steps must >= 1, but received %ld", value)); + value, 1, + platform::errors::InvalidArgument( + "reply_up_per_steps must >= 1, but received %ld", value)); reply_up_per_steps_ = value; } void TaskNode::SetSendDownPerSteps(int64_t value) { PADDLE_ENFORCE_GE( - value, 1, platform::errors::InvalidArgument( - "send_down_per_steps must >= 1, but received %ld", value)); + value, 1, + platform::errors::InvalidArgument( + "send_down_per_steps must >= 1, but received %ld", value)); send_down_per_steps_ = value; } diff --git a/paddle/fluid/distributed/fleet_executor/task_node.h b/paddle/fluid/distributed/fleet_executor/task_node.h index 7dd4b545456..16e686a4401 100644 --- a/paddle/fluid/distributed/fleet_executor/task_node.h +++ b/paddle/fluid/distributed/fleet_executor/task_node.h @@ -26,7 +26,7 @@ namespace paddle { namespace framework { class OperatorBase; class OpDesc; -} +} // namespace framework namespace distributed { class TaskNode final { diff --git a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc index 35857fc86b5..bd81d3644f4 100644 --- a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include #include "gtest/gtest.h" - #include "paddle/fluid/distributed/fleet_executor/carrier.h" #include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" diff --git a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc index 954b52693f4..4992a8b34c9 100644 --- a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include #include "gtest/gtest.h" - #include "paddle/fluid/distributed/fleet_executor/carrier.h" #include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_test.cc index 19c1d0a0d7a..54adf06fb67 100644 --- a/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_test.cc @@ -16,7 +16,6 @@ limitations under the License. 
*/ #include #include "gtest/gtest.h" - #include "paddle/fluid/distributed/fleet_executor/carrier.h" #include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_with_brpc_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_with_brpc_test.cc index 78cff2606f6..3828c4478cb 100644 --- a/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_with_brpc_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_with_brpc_test.cc @@ -14,11 +14,11 @@ limitations under the License. */ #include #include + #include #include #include "gtest/gtest.h" - #include "paddle/fluid/distributed/fleet_executor/carrier.h" #include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc index e909744a4b5..a78cd6955f2 100644 --- a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include #include "gtest/gtest.h" - #include "paddle/fluid/distributed/fleet_executor/carrier.h" #include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc index 0e57596bacb..53755bf1a40 100644 --- a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc @@ -16,7 +16,6 @@ limitations under the License. 
 */
 #include
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/distributed/fleet_executor/carrier.h"
 #include "paddle/fluid/distributed/fleet_executor/global.h"
 #include "paddle/fluid/distributed/fleet_executor/interceptor.h"
diff --git a/paddle/fluid/distributed/fleet_executor/test/sink_interceptor_test.cc b/paddle/fluid/distributed/fleet_executor/test/sink_interceptor_test.cc
index 8ff908f90ec..879d7e9b029 100644
--- a/paddle/fluid/distributed/fleet_executor/test/sink_interceptor_test.cc
+++ b/paddle/fluid/distributed/fleet_executor/test/sink_interceptor_test.cc
@@ -16,7 +16,6 @@
 #include
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/distributed/fleet_executor/carrier.h"
 #include "paddle/fluid/distributed/fleet_executor/global.h"
 #include "paddle/fluid/distributed/fleet_executor/interceptor.h"
diff --git a/paddle/fluid/distributed/fleet_executor/test/source_interceptor_test.cc b/paddle/fluid/distributed/fleet_executor/test/source_interceptor_test.cc
index e9c0437c829..21a1b4accc9 100644
--- a/paddle/fluid/distributed/fleet_executor/test/source_interceptor_test.cc
+++ b/paddle/fluid/distributed/fleet_executor/test/source_interceptor_test.cc
@@ -16,7 +16,6 @@
 #include
 #include "gtest/gtest.h"
-
 #include "paddle/fluid/distributed/fleet_executor/carrier.h"
 #include "paddle/fluid/distributed/fleet_executor/global.h"
 #include "paddle/fluid/distributed/fleet_executor/interceptor.h"
diff --git a/paddle/fluid/distributed/index_dataset/index_sampler.cc b/paddle/fluid/distributed/index_dataset/index_sampler.cc
index 306d11d333d..b8219322051 100644
--- a/paddle/fluid/distributed/index_dataset/index_sampler.cc
+++ b/paddle/fluid/distributed/index_dataset/index_sampler.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include "paddle/fluid/distributed/index_dataset/index_sampler.h"
+
 #include "paddle/fluid/framework/data_feed.h"
 namespace paddle {
diff --git a/paddle/fluid/distributed/index_dataset/index_sampler.h b/paddle/fluid/distributed/index_dataset/index_sampler.h
index 02806b814c2..a82348c9ec5 100644
--- a/paddle/fluid/distributed/index_dataset/index_sampler.h
+++ b/paddle/fluid/distributed/index_dataset/index_sampler.h
@@ -14,6 +14,7 @@
 #pragma once
 #include
+
 #include "paddle/fluid/distributed/index_dataset/index_wrapper.h"
 #include "paddle/fluid/framework/data_feed.h"
 #include "paddle/fluid/framework/program_desc.h"
diff --git a/paddle/fluid/distributed/index_dataset/index_wrapper.cc b/paddle/fluid/distributed/index_dataset/index_wrapper.cc
index 27aa890f760..61941ef5133 100644
--- a/paddle/fluid/distributed/index_dataset/index_wrapper.cc
+++ b/paddle/fluid/distributed/index_dataset/index_wrapper.cc
@@ -9,15 +9,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include "paddle/fluid/distributed/index_dataset/index_wrapper.h"
+
 #include
 #include
 #include
 #include
 #include
 #include
-#include "paddle/fluid/framework/io/fs.h"
-#include "paddle/fluid/distributed/index_dataset/index_wrapper.h"
+#include "paddle/fluid/framework/io/fs.h"
 namespace paddle {
 namespace distributed {
diff --git a/paddle/fluid/distributed/index_dataset/index_wrapper.h b/paddle/fluid/distributed/index_dataset/index_wrapper.h
index 8fb8faf6c84..1c652e60bbb 100644
--- a/paddle/fluid/distributed/index_dataset/index_wrapper.h
+++ b/paddle/fluid/distributed/index_dataset/index_wrapper.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include
 #include
 #include
+
 #include "paddle/fluid/distributed/index_dataset/index_dataset.pb.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -90,10 +91,11 @@ class IndexWrapper {
     }
     TreePtr tree = std::make_shared();
     int ret = tree->Load(tree_path);
-    PADDLE_ENFORCE_EQ(ret, 0, paddle::platform::errors::InvalidArgument(
-                                  "Load tree[%s] from path[%s] failed. Please "
-                                  "check whether the file exists.",
-                                  name, tree_path));
+    PADDLE_ENFORCE_EQ(ret, 0,
+                      paddle::platform::errors::InvalidArgument(
+                          "Load tree[%s] from path[%s] failed. Please "
+                          "check whether the file exists.",
+                          name, tree_path));
     tree_map.insert(std::pair{name, tree});
   }
diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc
old mode 100755
new mode 100644
index 0959b651bb5..89466076b23
--- a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc
+++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc
@@ -12,11 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#include "paddle/fluid/distributed/ps/service/brpc_ps_client.h"
+
 #include
 #include
 #include
-#include "paddle/fluid/distributed/ps/service/brpc_ps_client.h"
 #include "paddle/fluid/framework/archive.h"
 static const int max_port = 65535;
@@ -245,8 +246,9 @@ int32_t BrpcPsClient::Initialize() {
 int DownpourBrpcClosure::check_response(size_t request_idx, int cmd_id) {
   if (_cntls[request_idx]->Failed()) {
-    LOG(ERROR) << "resquest cmd_id:" << cmd_id << " failed, " "err:"
+    LOG(ERROR) << "resquest cmd_id:" << cmd_id
+               << " failed, "
+                  "err:"
                << _cntls[request_idx]->ErrorText();
     return -1;
   }
@@ -263,8 +265,9 @@ int DownpourBrpcClosure::check_response(size_t request_idx, int cmd_id) {
 int DownpourBrpcClosure::check_save_response(size_t request_idx, int cmd_id) {
   int32_t feasign_size = 0;
   if (_cntls[request_idx]->Failed()) {
-    LOG(ERROR) << "resquest cmd_id:" << cmd_id << " failed, " "err:"
+    LOG(ERROR) << "resquest cmd_id:" << cmd_id
+               << " failed, "
+                  "err:"
                << _cntls[request_idx]->ErrorText();
     return -1;
   }
diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_client.h b/paddle/fluid/distributed/ps/service/brpc_ps_client.h
index e2c16d496c4..17b6bbe22ce 100644
--- a/paddle/fluid/distributed/ps/service/brpc_ps_client.h
+++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.h
@@ -15,6 +15,7 @@
 #pragma once
 #include
+
 #include
 #include
 #include
diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_server.cc b/paddle/fluid/distributed/ps/service/brpc_ps_server.cc
old mode 100755
new mode 100644
index 8167c37b599..d859acbb42e
--- a/paddle/fluid/distributed/ps/service/brpc_ps_server.cc
+++ b/paddle/fluid/distributed/ps/service/brpc_ps_server.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 #include "paddle/fluid/distributed/ps/service/brpc_ps_server.h"
+
 #include // NOLINT
+
 #include "butil/object_pool.h"
 #include "paddle/fluid/distributed/common/cost_timer.h"
 #include "paddle/fluid/distributed/ps/table/depends/sparse_utils.h"
diff --git a/paddle/fluid/distributed/ps/service/brpc_utils.h b/paddle/fluid/distributed/ps/service/brpc_utils.h
index e68e15058f7..d4332744ceb 100644
--- a/paddle/fluid/distributed/ps/service/brpc_utils.h
+++ b/paddle/fluid/distributed/ps/service/brpc_utils.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include
+
 #include
 #include
 #include
diff --git a/paddle/fluid/distributed/ps/service/communicator/communicator.cc b/paddle/fluid/distributed/ps/service/communicator/communicator.cc
index c4b833f294e..c50f1d909cd 100644
--- a/paddle/fluid/distributed/ps/service/communicator/communicator.cc
+++ b/paddle/fluid/distributed/ps/service/communicator/communicator.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/distributed/ps/service/communicator/communicator.h"
+
 #include
+
 #include "gflags/gflags.h"
 #include "paddle/fluid/distributed/ps/service/brpc_ps_client.h"
 #include "paddle/fluid/distributed/ps/wrapper/fleet.h"
diff --git a/paddle/fluid/distributed/ps/service/communicator/communicator.h b/paddle/fluid/distributed/ps/service/communicator/communicator.h
index 75676c39243..5f2a0cbb909 100644
--- a/paddle/fluid/distributed/ps/service/communicator/communicator.h
+++ b/paddle/fluid/distributed/ps/service/communicator/communicator.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include
 #include
+
 #include
 #include
 #include
@@ -30,6 +31,7 @@ limitations under the License. */
 #include "gflags/gflags.h"
 #include "paddle/fluid/distributed/ps/service/communicator/communicator_common.h"
+#include "paddle/fluid/distributed/ps/service/ps_client.h"
 #include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable.h"
@@ -42,8 +44,6 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
-#include "paddle/fluid/distributed/ps/service/ps_client.h"
-
 namespace paddle {
 namespace distributed {
 class PSClient;
@@ -157,8 +157,9 @@ template
 inline void MergeVars(const std::string &var_name,
                       const std::vector> &vars,
                       Scope *scope, bool merge_add = true) {
-  PADDLE_ENFORCE_NE(vars.empty(), true, platform::errors::InvalidArgument(
-                                            "vector vars are empty."));
+  PADDLE_ENFORCE_NE(
+      vars.empty(), true,
+      platform::errors::InvalidArgument("vector vars are empty."));
   auto cpu_place = platform::CPUPlace();
   auto &var0 = vars[0];
   auto *out_var = scope->Var(var_name);
diff --git a/paddle/fluid/distributed/ps/service/env.h b/paddle/fluid/distributed/ps/service/env.h
index 162ee6f0984..0fddb17da7c 100644
--- a/paddle/fluid/distributed/ps/service/env.h
+++ b/paddle/fluid/distributed/ps/service/env.h
@@ -18,11 +18,13 @@
 #include
 #include
 #include
+
 #include
 #include
 #include
 #include
 #include
+
 #include "gflags/gflags.h"
 namespace paddle {
diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_client.cc b/paddle/fluid/distributed/ps/service/graph_brpc_client.cc
index c1df490669d..ff9680044dd 100644
--- a/paddle/fluid/distributed/ps/service/graph_brpc_client.cc
+++ b/paddle/fluid/distributed/ps/service/graph_brpc_client.cc
@@ -13,12 +13,14 @@
 // limitations under the License.
#include "paddle/fluid/distributed/ps/service/graph_brpc_client.h" + #include #include #include #include #include #include + #include "Eigen/Dense" #include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" #include "paddle/fluid/distributed/ps/table/table.h" @@ -149,7 +151,7 @@ std::future GraphBrpcClient::get_node_feat( std::future GraphBrpcClient::clear_nodes(uint32_t table_id, int type_id, int idx_) { DownpourBrpcClosure *closure = new DownpourBrpcClosure( - server_size, [&, server_size = this->server_size ](void *done) { + server_size, [&, server_size = this->server_size](void *done) { int ret = 0; auto *closure = (DownpourBrpcClosure *)done; size_t fail_num = 0; @@ -665,5 +667,5 @@ int32_t GraphBrpcClient::Initialize() { local_channel = NULL; return 0; } -} -} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_client.h b/paddle/fluid/distributed/ps/service/graph_brpc_client.h index 51f14bc57cd..c038c840df9 100644 --- a/paddle/fluid/distributed/ps/service/graph_brpc_client.h +++ b/paddle/fluid/distributed/ps/service/graph_brpc_client.h @@ -15,11 +15,12 @@ #pragma once #include + #include #include +#include #include -#include #include "ThreadPool.h" #include "brpc/channel.h" #include "brpc/controller.h" diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc index 8ff12265269..5ce26b45250 100644 --- a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc +++ b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc @@ -13,13 +13,14 @@ // limitations under the License. #include "paddle/fluid/distributed/ps/service/graph_brpc_server.h" -#include "paddle/fluid/distributed/ps/service/brpc_ps_server.h" #include // NOLINT #include + #include "butil/endpoint.h" #include "iomanip" #include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/ps/service/brpc_ps_server.h" #include "paddle/fluid/framework/archive.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_server.h b/paddle/fluid/distributed/ps/service/graph_brpc_server.h index caf728701b2..726876bef16 100644 --- a/paddle/fluid/distributed/ps/service/graph_brpc_server.h +++ b/paddle/fluid/distributed/ps/service/graph_brpc_server.h @@ -14,12 +14,12 @@ #pragma once +#include +#include + #include "brpc/channel.h" #include "brpc/controller.h" #include "brpc/server.h" - -#include -#include #include "paddle/fluid/distributed/ps/service/brpc_ps_server.h" #include "paddle/fluid/distributed/ps/service/server.h" #include "paddle/fluid/distributed/ps/table/common_graph_table.h" diff --git a/paddle/fluid/distributed/ps/service/ps_client.cc b/paddle/fluid/distributed/ps/service/ps_client.cc index f7df99ec13c..a0216f2a795 100644 --- a/paddle/fluid/distributed/ps/service/ps_client.cc +++ b/paddle/fluid/distributed/ps/service/ps_client.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/distributed/ps/service/ps_client.h" + #include "glog/logging.h" #include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" #include "paddle/fluid/distributed/ps/service/graph_brpc_client.h" diff --git a/paddle/fluid/distributed/ps/service/ps_client.h b/paddle/fluid/distributed/ps/service/ps_client.h index 926bb7e7c9f..adf096c8469 100644 --- a/paddle/fluid/distributed/ps/service/ps_client.h +++ b/paddle/fluid/distributed/ps/service/ps_client.h @@ -20,6 +20,7 @@ #include #include #include + #include "paddle/fluid/distributed/common/cost_timer.h" #include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/service/env.h" diff --git a/paddle/fluid/distributed/ps/service/ps_local_client.cc b/paddle/fluid/distributed/ps/service/ps_local_client.cc index bc024ed3175..b6407ccebe5 100644 --- a/paddle/fluid/distributed/ps/service/ps_local_client.cc +++ b/paddle/fluid/distributed/ps/service/ps_local_client.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/distributed/ps/service/ps_local_client.h" + #include "paddle/fluid/distributed/ps/table/table.h" //#define pslib_debug_dense_compress @@ -316,5 +317,5 @@ int32_t PsLocalClient::Initialize() { table_ptr->Push(table_context); return done(); } -} -} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/ps/service/ps_local_client.h b/paddle/fluid/distributed/ps/service/ps_local_client.h index 439ecf79f2f..89c2f7446ac 100644 --- a/paddle/fluid/distributed/ps/service/ps_local_client.h +++ b/paddle/fluid/distributed/ps/service/ps_local_client.h @@ -223,5 +223,5 @@ class PsLocalClient : public PSClient { float _mse = 0; uint16_t _push_times = 0; }; -} -} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/ps/service/ps_local_server.h b/paddle/fluid/distributed/ps/service/ps_local_server.h index c09f8585b65..2075e9dd2be 100644 --- a/paddle/fluid/distributed/ps/service/ps_local_server.h +++ b/paddle/fluid/distributed/ps/service/ps_local_server.h @@ -16,6 +16,7 @@ #include #include + #include "paddle/fluid/distributed/ps/service/server.h" namespace paddle { @@ -37,5 +38,5 @@ class PsLocalServer : public PSServer { private: virtual int32_t Initialize() { return 0; } }; -} -} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc index ced51b8cbe3..255c0d3d655 100644 --- a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc +++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc @@ -13,7 +13,9 @@ // limitations under the License. 
#include "paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h" + #include // NOLINT + #include "butil/endpoint.h" #include "iomanip" #include "paddle/fluid/distributed/ps/table/table.h" @@ -501,5 +503,5 @@ void GraphPyClient::StopServer() { if (status.get() == 0) stoped_ = true; } void GraphPyClient::FinalizeWorker() { this->worker_ptr->FinalizeWorker(); } -} -} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h index 55beb9b3932..7dd03401256 100644 --- a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h +++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h @@ -14,6 +14,7 @@ #pragma once #include + #include // NOLINT #include #include @@ -23,21 +24,20 @@ #include // NOLINT #include #include -#include "google/protobuf/text_format.h" +#include "google/protobuf/text_format.h" #include "gtest/gtest.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/variable.h" - #include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/service/env.h" #include "paddle/fluid/distributed/ps/service/graph_brpc_client.h" #include "paddle/fluid/distributed/ps/service/graph_brpc_server.h" #include "paddle/fluid/distributed/ps/service/ps_service/service.h" #include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -198,5 +198,5 @@ class GraphPyClient : public GraphPyService { std::thread* client_thread; bool stoped_ = false; }; -} -} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/ps/service/ps_service/service.cc b/paddle/fluid/distributed/ps/service/ps_service/service.cc index 9c3a06c2212..9eb5d49a405 100644 --- a/paddle/fluid/distributed/ps/service/ps_service/service.cc +++ b/paddle/fluid/distributed/ps/service/ps_service/service.cc @@ -17,7 +17,9 @@ #include #include #include + #include + #include "paddle/fluid/distributed/ps/service/communicator/communicator.h" #include "paddle/fluid/string/string_helper.h" diff --git a/paddle/fluid/distributed/ps/service/server.h b/paddle/fluid/distributed/ps/service/server.h index c044e828846..55bbbc06d87 100644 --- a/paddle/fluid/distributed/ps/service/server.h +++ b/paddle/fluid/distributed/ps/service/server.h @@ -20,6 +20,7 @@ #include #include #include + #include "butil/endpoint.h" #include "google/protobuf/service.h" #include "paddle/fluid/distributed/common/registerer.h" diff --git a/paddle/fluid/distributed/ps/table/accessor.h b/paddle/fluid/distributed/ps/table/accessor.h index 7713c2bda29..4db8ad0a55a 100644 --- a/paddle/fluid/distributed/ps/table/accessor.h +++ b/paddle/fluid/distributed/ps/table/accessor.h @@ -15,8 +15,10 @@ #pragma once #include #include + #include #include + #include "paddle/fluid/distributed/common/afs_warpper.h" #include "paddle/fluid/distributed/common/registerer.h" #include "paddle/fluid/distributed/ps.pb.h" diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.cc 
b/paddle/fluid/distributed/ps/table/common_graph_table.cc index 43dee275a3d..55a9c794e8e 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.cc +++ b/paddle/fluid/distributed/ps/table/common_graph_table.cc @@ -13,11 +13,14 @@ // limitations under the License. #include "paddle/fluid/distributed/ps/table/common_graph_table.h" + #include + #include #include #include #include + #include "paddle/fluid/distributed/common/utils.h" #include "paddle/fluid/distributed/ps/table/graph/graph_node.h" #include "paddle/fluid/framework/generator.h" @@ -212,7 +215,6 @@ int64_t GraphTable::load_graph_to_memory_from_ssd(int idx, for (size_t i = 0; i < bags.size(); i++) { if (bags[i].size() > 0) { tasks.push_back(_shards_task_pool[i]->enqueue([&, i, idx, this]() -> int { - char ch[sizeof(int) * 2 + sizeof(int64_t)]; memset(ch, 0, sizeof(int)); memcpy(ch + sizeof(int), &idx, sizeof(int)); @@ -353,7 +355,6 @@ void GraphTable::export_partition_files(int idx, std::string file_path) { for (int i = 0; i < part_len; i++) { tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( [&, i, idx, this]() -> int { - std::string output_path = file_path + "partition_" + std::to_string(i); diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.h b/paddle/fluid/distributed/ps/table/common_graph_table.h index 25bec5276e7..6dd24df921d 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.h +++ b/paddle/fluid/distributed/ps/table/common_graph_table.h @@ -17,6 +17,7 @@ #include #include #include + #include #include #include @@ -36,6 +37,7 @@ #include #include #include + #include "paddle/fluid/distributed/ps/table/accessor.h" #include "paddle/fluid/distributed/ps/table/common_table.h" #include "paddle/fluid/distributed/ps/table/graph/class_macro.h" @@ -670,4 +672,4 @@ struct hash { return s.idx ^ s.node_key ^ s.sample_size; } }; -} +} // namespace std diff --git a/paddle/fluid/distributed/ps/table/common_table.h b/paddle/fluid/distributed/ps/table/common_table.h index f69d9ccbf14..280573f7194 100644 --- a/paddle/fluid/distributed/ps/table/common_table.h +++ b/paddle/fluid/distributed/ps/table/common_table.h @@ -19,9 +19,8 @@ #include // NOLINT #include -#include "paddle/fluid/distributed/ps/table/table.h" - #include "paddle/fluid/distributed/common/utils.h" +#include "paddle/fluid/distributed/ps/table/table.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/table/ctr_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_accessor.cc index ef7311824fa..254bbb96cad 100644 --- a/paddle/fluid/distributed/ps/table/ctr_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_accessor.cc @@ -13,7 +13,9 @@ // limitations under the License. 
#include "paddle/fluid/distributed/ps/table/ctr_accessor.h" + #include + #include "glog/logging.h" #include "paddle/fluid/string/string_helper.h" diff --git a/paddle/fluid/distributed/ps/table/ctr_accessor.h b/paddle/fluid/distributed/ps/table/ctr_accessor.h index 327c4cea760..96ec5b8398d 100644 --- a/paddle/fluid/distributed/ps/table/ctr_accessor.h +++ b/paddle/fluid/distributed/ps/table/ctr_accessor.h @@ -15,7 +15,9 @@ #pragma once #include #include + #include + #include "paddle/fluid/distributed/common/registerer.h" #include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/accessor.h" diff --git a/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc index 4b84b7e8c36..2bde5271a0c 100644 --- a/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/distributed/ps/table/ctr_double_accessor.h" + #include + #include "glog/logging.h" #include "paddle/fluid/string/string_helper.h" diff --git a/paddle/fluid/distributed/ps/table/ctr_double_accessor.h b/paddle/fluid/distributed/ps/table/ctr_double_accessor.h index 5b781b2621c..3134b469604 100644 --- a/paddle/fluid/distributed/ps/table/ctr_double_accessor.h +++ b/paddle/fluid/distributed/ps/table/ctr_double_accessor.h @@ -15,7 +15,9 @@ #pragma once #include #include + #include + #include "paddle/fluid/distributed/common/registerer.h" #include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/accessor.h" diff --git a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc index 68f28640fc6..6fb6675edde 100644 --- a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc @@ -13,7 +13,9 @@ // limitations under the License. 
#include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h" + #include + #include "glog/logging.h" #include "paddle/fluid/string/string_helper.h" diff --git a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h index 6a9f5d28f5e..c4bcd2bb3c9 100644 --- a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h +++ b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h @@ -15,7 +15,9 @@ #pragma once #include #include + #include + #include "paddle/fluid/distributed/common/registerer.h" #include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/accessor.h" diff --git a/paddle/fluid/distributed/ps/table/depends/dense.h b/paddle/fluid/distributed/ps/table/depends/dense.h index aea757e8d59..5e7c1cd438d 100644 --- a/paddle/fluid/distributed/ps/table/depends/dense.h +++ b/paddle/fluid/distributed/ps/table/depends/dense.h @@ -15,13 +15,14 @@ #pragma once #include // for sqrt in CPU and CUDA + #include #include #include #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/distributed/common/utils.h" namespace paddle { diff --git a/paddle/fluid/distributed/ps/table/depends/feature_value.h b/paddle/fluid/distributed/ps/table/depends/feature_value.h index 36dc34808bd..e6ab278787d 100644 --- a/paddle/fluid/distributed/ps/table/depends/feature_value.h +++ b/paddle/fluid/distributed/ps/table/depends/feature_value.h @@ -14,10 +14,10 @@ #pragma once +#include #include -#include "gflags/gflags.h" -#include +#include "gflags/gflags.h" #include "paddle/fluid/distributed/common/chunk_allocator.h" namespace paddle { diff --git a/paddle/fluid/distributed/ps/table/depends/geo_recorder.h b/paddle/fluid/distributed/ps/table/depends/geo_recorder.h index adab0ee344b..99530f72b1f 100644 --- a/paddle/fluid/distributed/ps/table/depends/geo_recorder.h +++ b/paddle/fluid/distributed/ps/table/depends/geo_recorder.h @@ -15,6 +15,7 @@ #pragma once #include + #include // NOLINT #include #include diff --git a/paddle/fluid/distributed/ps/table/depends/initializers.h b/paddle/fluid/distributed/ps/table/depends/initializers.h index f46e659a88b..7c707feacec 100644 --- a/paddle/fluid/distributed/ps/table/depends/initializers.h +++ b/paddle/fluid/distributed/ps/table/depends/initializers.h @@ -20,10 +20,9 @@ #include #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/framework/generator.h" - #include "paddle/fluid/operators/truncated_gaussian_random_op.h" namespace paddle { diff --git a/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h b/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h index 223c8fafd26..4ae3aa7459a 100644 --- a/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h +++ b/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h @@ -20,6 +20,7 @@ #include #include #include + #include #include @@ -153,5 +154,5 @@ class RocksDBHandler { std::vector _handles; rocksdb::DB* _db; }; -} // distributed -} // paddle +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/graph/graph_edge.cc b/paddle/fluid/distributed/ps/table/graph/graph_edge.cc index 004a536e8e5..f2f346232d3 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_edge.cc +++ b/paddle/fluid/distributed/ps/table/graph/graph_edge.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/distributed/ps/table/graph/graph_edge.h" + #include namespace paddle { namespace distributed { @@ -25,5 +26,5 @@ void WeightedGraphEdgeBlob::add_edge(int64_t id, float weight = 1) { id_arr.push_back(id); weight_arr.push_back(weight); } -} -} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/graph/graph_edge.h b/paddle/fluid/distributed/ps/table/graph/graph_edge.h index 5fc785fe256..6b929af679e 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_edge.h +++ b/paddle/fluid/distributed/ps/table/graph/graph_edge.h @@ -43,5 +43,5 @@ class WeightedGraphEdgeBlob : public GraphEdgeBlob { protected: std::vector weight_arr; }; -} -} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/graph/graph_node.cc b/paddle/fluid/distributed/ps/table/graph/graph_node.cc index 366e607261f..d966bd69653 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_node.cc +++ b/paddle/fluid/distributed/ps/table/graph/graph_node.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/distributed/ps/table/graph/graph_node.h" + #include namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/table/graph/graph_node.h b/paddle/fluid/distributed/ps/table/graph/graph_node.h index c6c594036d4..13fdcf4c64e 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_node.h +++ b/paddle/fluid/distributed/ps/table/graph/graph_node.h @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.cc b/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.cc index 8186acec1be..4f5c86db314 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.cc +++ b/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.cc @@ -13,9 +13,11 @@ // limitations under the License. 
#include "paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h" + #include #include #include + #include "paddle/fluid/framework/generator.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h b/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h index c10617022de..cf83d27d7a2 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h +++ b/paddle/fluid/distributed/ps/table/graph/graph_weighted_sampler.h @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/distributed/ps/table/graph/graph_edge.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/table/memory_dense_table.h b/paddle/fluid/distributed/ps/table/memory_dense_table.h index 73653fbc2eb..87a3f8661ae 100644 --- a/paddle/fluid/distributed/ps/table/memory_dense_table.h +++ b/paddle/fluid/distributed/ps/table/memory_dense_table.h @@ -17,7 +17,9 @@ #include #include #include + #include + #include "Eigen/Dense" #include "paddle/fluid/distributed/ps/table/accessor.h" #include "paddle/fluid/distributed/ps/table/common_table.h" diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h b/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h index 60ba5d9602e..bce9c774f12 100644 --- a/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h +++ b/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h @@ -17,6 +17,7 @@ #include // #include #include + #include #include // NOLINT #include diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc index ee6a801fa91..464f788b454 100644 --- a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc @@ -12,15 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "paddle/fluid/distributed/ps/table/memory_sparse_table.h" + #include -#include -#include "paddle/fluid/distributed/common/cost_timer.h" -#include "paddle/fluid/distributed/ps/table/memory_sparse_table.h" -#include "paddle/fluid/framework/io/fs.h" +#include #include "boost/lexical_cast.hpp" #include "glog/logging.h" +#include "paddle/fluid/distributed/common/cost_timer.h" +#include "paddle/fluid/framework/io/fs.h" #include "paddle/fluid/platform/enforce.h" DEFINE_bool(pserver_print_missed_key_num_every_push, false, @@ -272,9 +273,8 @@ int32_t MemorySparseTable::Save(const std::string& dirname, if (_value_accesor->Save(it.value().data(), save_param)) { std::string format_value = _value_accesor->ParseToString( it.value().data(), it.value().size()); - if (0 != - write_channel->write_line(paddle::string::format_string( - "%lu %s", it.key(), format_value.c_str()))) { + if (0 != write_channel->write_line(paddle::string::format_string( + "%lu %s", it.key(), format_value.c_str()))) { ++retry_num; is_write_failed = true; LOG(ERROR) diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_table.h b/paddle/fluid/distributed/ps/table/memory_sparse_table.h index 6516c75a5d6..7b7a47ff998 100644 --- a/paddle/fluid/distributed/ps/table/memory_sparse_table.h +++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.h @@ -17,12 +17,14 @@ #include #include #include + #include #include // NOLINT #include #include #include #include + #include "Eigen/Dense" #include "paddle/fluid/distributed/ps/table/accessor.h" #include "paddle/fluid/distributed/ps/table/common_table.h" diff --git a/paddle/fluid/distributed/ps/table/sparse_accessor.cc b/paddle/fluid/distributed/ps/table/sparse_accessor.cc index bc537880f1c..772ff5d1fc5 100644 --- a/paddle/fluid/distributed/ps/table/sparse_accessor.cc +++ b/paddle/fluid/distributed/ps/table/sparse_accessor.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/distributed/ps/table/sparse_accessor.h" + #include + #include "glog/logging.h" #include "paddle/fluid/string/string_helper.h" diff --git a/paddle/fluid/distributed/ps/table/sparse_accessor.h b/paddle/fluid/distributed/ps/table/sparse_accessor.h index 875904847b2..5e76365901c 100644 --- a/paddle/fluid/distributed/ps/table/sparse_accessor.h +++ b/paddle/fluid/distributed/ps/table/sparse_accessor.h @@ -15,7 +15,9 @@ #pragma once #include #include + #include + #include "paddle/fluid/distributed/common/registerer.h" #include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/accessor.h" diff --git a/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc index 8471b936128..a9a4c9beae2 100644 --- a/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc +++ b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc @@ -13,7 +13,9 @@ // limitations under the License. 
#include "paddle/fluid/distributed/ps/table/sparse_sgd_rule.h" + #include + #include "glog/logging.h" DEFINE_bool(enable_show_scale_gradient, true, "enable show scale gradient"); diff --git a/paddle/fluid/distributed/ps/table/sparse_sgd_rule.h b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.h index 55a37b59419..0f7766e20a3 100644 --- a/paddle/fluid/distributed/ps/table/sparse_sgd_rule.h +++ b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.h @@ -14,8 +14,10 @@ #pragma once #include + #include #include + #include "glog/logging.h" // for CHECK #include "paddle/fluid/distributed/common/local_random.h" // for local_uniform_real_distribution #include "paddle/fluid/distributed/common/registerer.h" diff --git a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc index b1359d1323d..7e1128baa0c 100644 --- a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/distributed/ps/table/ssd_sparse_table.h" + #include "paddle/fluid/distributed/common/cost_timer.h" #include "paddle/fluid/distributed/common/local_random.h" #include "paddle/fluid/distributed/common/topk_calculator.h" @@ -362,9 +363,8 @@ int32_t SSDSparseTable::Save(const std::string& path, if (_value_accesor->Save(it.value().data(), save_param)) { std::string format_value = _value_accesor->ParseToString( it.value().data(), it.value().size()); - if (0 != - write_channel->write_line(paddle::string::format_string( - "%lu %s", it.key(), format_value.c_str()))) { + if (0 != write_channel->write_line(paddle::string::format_string( + "%lu %s", it.key(), format_value.c_str()))) { ++retry_num; is_write_failed = true; LOG(ERROR) << "SSDSparseTable save failed, retry it! 
path:" @@ -597,9 +597,8 @@ int32_t SSDSparseTable::SaveCache( while (shuffled_channel->Read(data)) { for (auto& t : data) { ++feasign_size; - if (0 != - write_channel->write_line(paddle::string::format_string( - "%lu %s", t.first, t.second.c_str()))) { + if (0 != write_channel->write_line(paddle::string::format_string( + "%lu %s", t.first, t.second.c_str()))) { LOG(ERROR) << "Cache Table save failed, " "path:" << channel_config.path << ", retry it!"; diff --git a/paddle/fluid/distributed/ps/table/table.cc b/paddle/fluid/distributed/ps/table/table.cc index ef2eb3a746f..cfa286f1c3f 100644 --- a/paddle/fluid/distributed/ps/table/table.cc +++ b/paddle/fluid/distributed/ps/table/table.cc @@ -16,13 +16,11 @@ #include "glog/logging.h" #include "paddle/fluid/distributed/common/registerer.h" - #include "paddle/fluid/distributed/ps/table/common_graph_table.h" -#include "paddle/fluid/distributed/ps/table/memory_dense_table.h" - #include "paddle/fluid/distributed/ps/table/ctr_accessor.h" #include "paddle/fluid/distributed/ps/table/ctr_double_accessor.h" #include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h" +#include "paddle/fluid/distributed/ps/table/memory_dense_table.h" #include "paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h" #include "paddle/fluid/distributed/ps/table/memory_sparse_table.h" #include "paddle/fluid/distributed/ps/table/sparse_accessor.h" diff --git a/paddle/fluid/distributed/ps/table/table.h b/paddle/fluid/distributed/ps/table/table.h index 48fda782d48..0c56b48a246 100644 --- a/paddle/fluid/distributed/ps/table/table.h +++ b/paddle/fluid/distributed/ps/table/table.h @@ -15,11 +15,13 @@ #pragma once #include + #include #include // NOLINT #include #include #include + #include "paddle/fluid/distributed/common/afs_warpper.h" #include "paddle/fluid/distributed/ps/table/accessor.h" #include "paddle/fluid/distributed/ps/table/depends/sparse_utils.h" diff --git a/paddle/fluid/distributed/ps/table/tensor_accessor.cc b/paddle/fluid/distributed/ps/table/tensor_accessor.cc index 5d1f69b7463..880583f3684 100644 --- a/paddle/fluid/distributed/ps/table/tensor_accessor.cc +++ b/paddle/fluid/distributed/ps/table/tensor_accessor.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/distributed/ps/table/tensor_accessor.h" + #include "Eigen/Dense" namespace paddle { diff --git a/paddle/fluid/distributed/ps/table/tensor_accessor.h b/paddle/fluid/distributed/ps/table/tensor_accessor.h index fad31d5df7f..a5225127534 100644 --- a/paddle/fluid/distributed/ps/table/tensor_accessor.h +++ b/paddle/fluid/distributed/ps/table/tensor_accessor.h @@ -15,6 +15,7 @@ #pragma once #include #include + #include #include diff --git a/paddle/fluid/distributed/ps/wrapper/fleet.cc b/paddle/fluid/distributed/ps/wrapper/fleet.cc index 955ba75e672..b9754d7b9de 100644 --- a/paddle/fluid/distributed/ps/wrapper/fleet.cc +++ b/paddle/fluid/distributed/ps/wrapper/fleet.cc @@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/distributed/ps/wrapper/fleet.h" + #include #include "paddle/fluid/distributed/ps/service/communicator/communicator.h" #include "paddle/fluid/distributed/ps/table/table.h" -#include "paddle/fluid/distributed/ps/wrapper/fleet.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/wrapper/fleet.h b/paddle/fluid/distributed/ps/wrapper/fleet.h index ce109b63cce..f88c478724b 100644 --- a/paddle/fluid/distributed/ps/wrapper/fleet.h +++ b/paddle/fluid/distributed/ps/wrapper/fleet.h @@ -49,8 +49,8 @@ class PSCore; using framework::LoDTensor; using framework::Scope; -using phi::SelectedRows; using framework::Variable; +using phi::SelectedRows; using RpcCtxMap = std::unordered_map; diff --git a/paddle/fluid/distributed/ps/wrapper/ps_wrapper.h b/paddle/fluid/distributed/ps/wrapper/ps_wrapper.h old mode 100755 new mode 100644 index ca02ad31195..0156c0b42db --- a/paddle/fluid/distributed/ps/wrapper/ps_wrapper.h +++ b/paddle/fluid/distributed/ps/wrapper/ps_wrapper.h @@ -49,8 +49,8 @@ class PSCore; using framework::LoDTensor; using framework::Scope; -using phi::SelectedRows; using framework::Variable; +using phi::SelectedRows; using RpcCtxMap = std::unordered_map; diff --git a/paddle/fluid/distributed/store/tcp_store.cc b/paddle/fluid/distributed/store/tcp_store.cc index ec6f0e26a08..a46b4b32c9f 100644 --- a/paddle/fluid/distributed/store/tcp_store.cc +++ b/paddle/fluid/distributed/store/tcp_store.cc @@ -12,11 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/distributed/store/tcp_store.h" + #include #include #include -#include "paddle/fluid/distributed/store/tcp_store.h" #include "paddle/fluid/distributed/store/tcp_utils.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/flags.h" diff --git a/paddle/fluid/distributed/store/tcp_utils.cc b/paddle/fluid/distributed/store/tcp_utils.cc index a28cba28833..466cd11fa5d 100644 --- a/paddle/fluid/distributed/store/tcp_utils.cc +++ b/paddle/fluid/distributed/store/tcp_utils.cc @@ -13,9 +13,11 @@ // limitations under the License. #include "paddle/fluid/distributed/store/tcp_utils.h" + #include #include #include + #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -51,12 +53,13 @@ void close_socket(SocketType socket) { int n; n = ::getaddrinfo(node, port_cstr, &hints, &res); const char* gai_err = ::gai_strerror(n); - const char* proto = - (family == AF_INET ? "IPv4" : family == AF_INET6 ? "IPv6" : ""); - PADDLE_ENFORCE_EQ( - n, 0, platform::errors::InvalidArgument( - "%s network %s:%s cannot be obtained. Details: %s.", proto, - host, port, gai_err)); + const char* proto = (family == AF_INET ? "IPv4" + : family == AF_INET6 ? "IPv6" + : ""); + PADDLE_ENFORCE_EQ(n, 0, + platform::errors::InvalidArgument( + "%s network %s:%s cannot be obtained. Details: %s.", + proto, host, port, gai_err)); return res; } @@ -79,10 +82,11 @@ SocketType tcp_connect(const std::string host, const std::string port, do { for (::addrinfo* cur = res; cur != nullptr; cur = cur->ai_next) { sockfd = ::socket(cur->ai_family, cur->ai_socktype, cur->ai_protocol); - PADDLE_ENFORCE_GT(sockfd, 0, platform::errors::InvalidArgument( - "Create socket to connect %s:%s failed. " - "Details: %s. ", - host, port, socket_error().message())); + PADDLE_ENFORCE_GT(sockfd, 0, + platform::errors::InvalidArgument( + "Create socket to connect %s:%s failed. " + "Details: %s. 
", + host, port, socket_error().message())); if (::connect(sockfd, cur->ai_addr, cur->ai_addrlen) == 0) { retry = false; diff --git a/paddle/fluid/distributed/store/tcp_utils.h b/paddle/fluid/distributed/store/tcp_utils.h index 60cb3de124d..ec9f610a18c 100644 --- a/paddle/fluid/distributed/store/tcp_utils.h +++ b/paddle/fluid/distributed/store/tcp_utils.h @@ -29,6 +29,7 @@ #include #include #include + #include "paddle/fluid/platform/enforce.h" // Utility functions for TCP socket. @@ -73,9 +74,10 @@ void send_bytes(SocketType socket, const T* buffer, size_t len) { while (to_send > 0) { auto byte_sent = ::send(socket, ptr, to_send, 0); - PADDLE_ENFORCE_GT(byte_sent, 0, platform::errors::InvalidArgument( - "TCP send error. Details: %s.", - socket_error().message())); + PADDLE_ENFORCE_GT( + byte_sent, 0, + platform::errors::InvalidArgument("TCP send error. Details: %s.", + socket_error().message())); to_send -= byte_sent; ptr += byte_sent; } @@ -91,9 +93,10 @@ void receive_bytes(SocketType socket, T* buffer, size_t len) { while (to_recv > 0) { auto byte_received = ::recv(socket, ptr, to_recv, 0); - PADDLE_ENFORCE_GT(byte_received, 0, platform::errors::InvalidArgument( - "TCP receive error. Details: %s.", - socket_error().message())); + PADDLE_ENFORCE_GT( + byte_received, 0, + platform::errors::InvalidArgument("TCP receive error. Details: %s.", + socket_error().message())); to_recv -= byte_received; ptr += byte_received; diff --git a/paddle/fluid/distributed/test/barrier_table_test.cc b/paddle/fluid/distributed/test/barrier_table_test.cc index c4c5b229928..f540939c6fd 100644 --- a/paddle/fluid/distributed/test/barrier_table_test.cc +++ b/paddle/fluid/distributed/test/barrier_table_test.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include #include + #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/common_table.h" diff --git a/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc b/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc index f9d57be95af..c1467dae9a7 100644 --- a/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc +++ b/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include #include // NOLINT diff --git a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc index 29195d99857..bade56f239f 100644 --- a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc +++ b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include #include // NOLINT diff --git a/paddle/fluid/distributed/test/brpc_utils_test.cc b/paddle/fluid/distributed/test/brpc_utils_test.cc index 16ff9bd7584..33367bf16b7 100644 --- a/paddle/fluid/distributed/test/brpc_utils_test.cc +++ b/paddle/fluid/distributed/test/brpc_utils_test.cc @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/distributed/ps/service/brpc_utils.h" + #include #include "gtest/gtest.h" - -#include "paddle/fluid/distributed/ps/service/brpc_utils.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { diff --git a/paddle/fluid/distributed/test/ctr_accessor_test.cc b/paddle/fluid/distributed/test/ctr_accessor_test.cc index 27b6ddf722b..51254391a42 100644 --- a/paddle/fluid/distributed/test/ctr_accessor_test.cc +++ b/paddle/fluid/distributed/test/ctr_accessor_test.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/distributed/ps/table/ctr_accessor.h" + #include #include + #include "gtest/gtest.h" #include "paddle/fluid/distributed/common/registerer.h" #include "paddle/fluid/distributed/ps.pb.h" diff --git a/paddle/fluid/distributed/test/ctr_dymf_accessor_test.cc b/paddle/fluid/distributed/test/ctr_dymf_accessor_test.cc index f6e773a414c..fbf179dbeee 100644 --- a/paddle/fluid/distributed/test/ctr_dymf_accessor_test.cc +++ b/paddle/fluid/distributed/test/ctr_dymf_accessor_test.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h" + #include #include + #include "gtest/gtest.h" #include "paddle/fluid/distributed/common/registerer.h" #include "paddle/fluid/distributed/ps.pb.h" diff --git a/paddle/fluid/distributed/test/dense_table_test.cc b/paddle/fluid/distributed/test/dense_table_test.cc index 9529c776c12..185d9d3aed1 100644 --- a/paddle/fluid/distributed/test/dense_table_test.cc +++ b/paddle/fluid/distributed/test/dense_table_test.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include + #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/memory_dense_table.h" diff --git a/paddle/fluid/distributed/test/feature_value_test.cc b/paddle/fluid/distributed/test/feature_value_test.cc index 32e3944d35a..6e848c3e2f4 100644 --- a/paddle/fluid/distributed/test/feature_value_test.cc +++ b/paddle/fluid/distributed/test/feature_value_test.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/distributed/ps/table/depends/feature_value.h" + #include + #include "gtest/gtest.h" namespace paddle { diff --git a/paddle/fluid/distributed/test/graph_node_split_test.cc b/paddle/fluid/distributed/test/graph_node_split_test.cc index 395d7c1eace..fa9b89d75c8 100644 --- a/paddle/fluid/distributed/test/graph_node_split_test.cc +++ b/paddle/fluid/distributed/test/graph_node_split_test.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include // NOLINT #include #include @@ -17,8 +18,8 @@ limitations under the License. 
*/ #include // NOLINT #include #include -#include "google/protobuf/text_format.h" +#include "google/protobuf/text_format.h" #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc index 3b43c2779ee..9cb244a9ec4 100644 --- a/paddle/fluid/distributed/test/graph_node_test.cc +++ b/paddle/fluid/distributed/test/graph_node_test.cc @@ -9,7 +9,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/distributed/ps/table/graph/graph_node.h" + #include + #include // NOLINT #include #include @@ -17,8 +20,8 @@ limitations under the License. */ #include // NOLINT #include #include -#include "google/protobuf/text_format.h" +#include "google/protobuf/text_format.h" #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" @@ -30,7 +33,6 @@ limitations under the License. */ #include "paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h" #include "paddle/fluid/distributed/ps/service/ps_service/service.h" #include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" -#include "paddle/fluid/distributed/ps/table/graph/graph_node.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" diff --git a/paddle/fluid/distributed/test/graph_table_sample_test.cc b/paddle/fluid/distributed/test/graph_table_sample_test.cc index d7f6f2f34d7..a3463162d27 100644 --- a/paddle/fluid/distributed/test/graph_table_sample_test.cc +++ b/paddle/fluid/distributed/test/graph_table_sample_test.cc @@ -13,6 +13,8 @@ // limitations under the License. #include + +#include #include // NOLINT #include #include @@ -20,9 +22,8 @@ #include // NOLINT #include #include -#include "google/protobuf/text_format.h" -#include +#include "google/protobuf/text_format.h" #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/common_graph_table.h" diff --git a/paddle/fluid/distributed/test/memory_geo_table_test.cc b/paddle/fluid/distributed/test/memory_geo_table_test.cc index ca3b51fade1..507211e69fa 100644 --- a/paddle/fluid/distributed/test/memory_geo_table_test.cc +++ b/paddle/fluid/distributed/test/memory_geo_table_test.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include - #include + #include #include // NOLINT diff --git a/paddle/fluid/distributed/test/memory_sparse_table_test.cc b/paddle/fluid/distributed/test/memory_sparse_table_test.cc index 68bc50373ff..1689b7716bb 100644 --- a/paddle/fluid/distributed/test/memory_sparse_table_test.cc +++ b/paddle/fluid/distributed/test/memory_sparse_table_test.cc @@ -12,16 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include +#include "paddle/fluid/distributed/ps/table/memory_sparse_table.h" +#include #include + #include #include // NOLINT #include "google/protobuf/text_format.h" #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps.pb.h" -#include "paddle/fluid/distributed/ps/table/memory_sparse_table.h" #include "paddle/fluid/distributed/ps/table/table.h" namespace paddle { diff --git a/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc b/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc index 1a4e16b9266..3a9a8d0b39c 100644 --- a/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc +++ b/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/distributed/ps/table/sparse_sgd_rule.h" + #include #include + #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps.pb.h" diff --git a/paddle/fluid/distributed/test/table_test.cc b/paddle/fluid/distributed/test/table_test.cc index 4f73519ef5e..56809abad0c 100644 --- a/paddle/fluid/distributed/test/table_test.cc +++ b/paddle/fluid/distributed/test/table_test.cc @@ -30,4 +30,4 @@ TEST(Table, Initialize) { ASSERT_EQ(ret, -1); } } // namespace distributed -} // // namespace paddle +} // namespace paddle diff --git a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc index 544e7c8fe85..09db68399f3 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.cc +++ b/paddle/fluid/eager/accumulation/accumulation_node.cc @@ -13,17 +13,15 @@ // limitations under the License. #include "paddle/fluid/eager/accumulation/accumulation_node.h" + +#include "glog/logging.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/imperative/gradient_accumulator.h" - -#include "paddle/phi/api/all.h" -#include "paddle/phi/core/dense_tensor.h" - #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" - -#include "glog/logging.h" +#include "paddle/phi/api/all.h" +#include "paddle/phi/core/dense_tensor.h" namespace egr { @@ -72,8 +70,7 @@ paddle::small_vector, GradNodeAccumulation::operator()( paddle::small_vector, kSlotSmallVectorSize>& grads, // NOLINT - bool create_graph, - bool is_new_grad) { + bool create_graph, bool is_new_grad) { VLOG(3) << "Running Eager Backward Node: GradNodeAccumulation"; PADDLE_ENFORCE(grads.size() == 1, paddle::platform::errors::Fatal( diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h index 6374534578c..7694e290bab 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.h +++ b/paddle/fluid/eager/accumulation/accumulation_node.h @@ -41,8 +41,7 @@ class GradNodeAccumulation : public GradNodeBase { kSlotSmallVectorSize> operator()(paddle::small_vector, kSlotSmallVectorSize>& grads, // NOLINT - bool create_graph = false, - bool is_new_grad = false) override; + bool create_graph = false, bool is_new_grad = false) override; void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } diff --git a/paddle/fluid/eager/amp_utils.h b/paddle/fluid/eager/amp_utils.h index 2145f4a1196..2834f7d5dc0 100644 --- a/paddle/fluid/eager/amp_utils.h +++ b/paddle/fluid/eager/amp_utils.h @@ -14,6 +14,7 @@ #pragma once #include + #include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/imperative/amp_auto_cast.h" diff --git 
a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc index 38f67cb5bdf..5adceb7e79a 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc @@ -13,16 +13,14 @@ // limitations under the License. #include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h" + +#include "glog/logging.h" #include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/eager/eager_tensor.h" - -#include "paddle/phi/kernels/scale_kernel.h" - #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" - -#include "glog/logging.h" +#include "paddle/phi/kernels/scale_kernel.h" namespace egr { @@ -147,8 +145,7 @@ paddle::small_vector, GradNodeScale::operator()( paddle::small_vector, kSlotSmallVectorSize>& grads, // NOLINT - bool create_graph, - bool is_new_grad) { + bool create_graph, bool is_new_grad) { // 1. Check Output Size VLOG(6) << "grad size is: " << grads.size(); PADDLE_ENFORCE( diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h index 04ff510944d..45872c97002 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h @@ -42,8 +42,7 @@ class GradNodeScale : public GradNodeBase { kSlotSmallVectorSize> operator()(paddle::small_vector, kSlotSmallVectorSize>& grads, // NOLINT - bool create_graph = false, - bool is_new_grad = false) override; + bool create_graph = false, bool is_new_grad = false) override; void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } diff --git a/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc b/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc index 7a374d567d5..836216d64b0 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc @@ -23,11 +23,11 @@ * **/ #include "paddle/fluid/eager/api/generated/eager_generated/forwards/scale.h" + #include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/utils.h" - #include "paddle/phi/api/all.h" namespace egr { diff --git a/paddle/fluid/eager/api/utils/global_utils.h b/paddle/fluid/eager/api/utils/global_utils.h index 3c18efea203..6a6a443f693 100644 --- a/paddle/fluid/eager/api/utils/global_utils.h +++ b/paddle/fluid/eager/api/utils/global_utils.h @@ -17,6 +17,7 @@ #include #include + #include "paddle/fluid/eager/type_defs.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/phi/api/ext/op_meta_info.h" @@ -73,8 +74,9 @@ class Controller { return op_meta_info_map_; } - void MergeOpMetaInfoMap(const std::unordered_map< - std::string, std::vector>& map) { + void MergeOpMetaInfoMap( + const std::unordered_map>& + map) { op_meta_info_map_.insert(map.begin(), map.end()); } diff --git a/paddle/fluid/eager/api/utils/hook_utils.cc b/paddle/fluid/eager/api/utils/hook_utils.cc index 8ee646b718c..6493135141f 100644 --- a/paddle/fluid/eager/api/utils/hook_utils.cc +++ b/paddle/fluid/eager/api/utils/hook_utils.cc @@ -13,6 +13,7 @@ // limitations 
under the License. #include "paddle/fluid/eager/api/utils/hook_utils.h" + #include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/api/utils/tensor_utils.h" #include "paddle/fluid/eager/autograd_meta.h" diff --git a/paddle/fluid/eager/api/utils/tensor_utils.cc b/paddle/fluid/eager/api/utils/tensor_utils.cc index 81ea92d1c3c..84a9eb6dea6 100644 --- a/paddle/fluid/eager/api/utils/tensor_utils.cc +++ b/paddle/fluid/eager/api/utils/tensor_utils.cc @@ -13,17 +13,16 @@ // limitations under the License. #include "paddle/fluid/eager/api/utils/tensor_utils.h" + #include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/utils.h" - -#include "paddle/phi/api/all.h" - #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/variable.h" +#include "paddle/phi/api/all.h" namespace egr { namespace egr_utils_api { diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 9de647a21ad..36cfb4db113 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -13,28 +13,28 @@ // limitations under the License. #include "paddle/fluid/eager/backward.h" + #include +#include "glog/logging.h" +#include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/grad_tensor_holder.h" #include "paddle/fluid/eager/utils.h" -#include "paddle/fluid/platform/profiler.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" - -#include "glog/logging.h" -#include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/phi/kernels/autotune/switch_autotune.h" namespace egr { /* -* GeneralGrad is Helpper class to implement custom grad operation between -* outputs and inputs. -* -* **/ + * GeneralGrad is Helpper class to implement custom grad operation between + * outputs and inputs. + * + * **/ class GeneralGrad { public: static GeneralGrad& Instance() { return *general_grad_; } diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.cc b/paddle/fluid/eager/custom_operator/custom_operator_node.cc index abdd8cadeed..3efcf3b21a4 100644 --- a/paddle/fluid/eager/custom_operator/custom_operator_node.cc +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/eager/custom_operator/custom_operator_node.h" + #include "paddle/fluid/framework/custom_operator.h" #include "paddle/fluid/framework/op_meta_info_helper.h" #include "paddle/fluid/platform/profiler/event_tracing.h" diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index af387bb3238..71ccb072ce9 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -13,27 +13,24 @@ // limitations under the License. 
#include "paddle/fluid/eager/grad_node_info.h" + +#include "glog/logging.h" #include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/utils.h" - -#include "paddle/phi/common/data_type.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/sparse_coo_tensor.h" - #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/var_type.h" - #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" - -#include "glog/logging.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" /** * Implementation of GradNodeBase, Edge and GradTensorHolder. -**/ + **/ namespace egr { static void CheckTensor(const paddle::experimental::Tensor& pre, diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index 747e98b8466..9070ac9e5b6 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -179,14 +179,13 @@ class GradNodeBase { kSlotSmallVectorSize> operator()(paddle::small_vector, kSlotSmallVectorSize>& grads, // NOLINT - bool create_graph = false, - bool is_new_grad = false) = 0; + bool create_graph = false, bool is_new_grad = false) = 0; virtual void ClearTensorWrappers() = 0; /** - * Self-Copy interface designed for use in DoubleGrad - * **/ + * Self-Copy interface designed for use in DoubleGrad + * **/ virtual std::shared_ptr Copy() const = 0; // adj_edges were moved inside OutputMeta(), so no available direct access @@ -230,8 +229,8 @@ class GradNodeBase { std::shared_ptr&& hook); /** - * Remove GradientHook - * **/ + * Remove GradientHook + * **/ bool RemoveGradientHook(const int64_t& hook_id) { auto remove_cnt = gradient_hooks_.erase(hook_id); if (remove_cnt == 0) { @@ -252,8 +251,8 @@ class GradNodeBase { kSlotSmallVectorSize>& tensors); /** - * Handle Complex - Real Type Promotion - * **/ + * Handle Complex - Real Type Promotion + * **/ void HandleComplexGradToRealGrad( paddle::small_vector, kSlotSmallVectorSize>* out_grads); @@ -262,8 +261,8 @@ class GradNodeBase { virtual std::string name() { return "GradNodeBase"; } /** - * The following interfaces are designed for no_need_buffer - * **/ + * The following interfaces are designed for no_need_buffer + * **/ bool IsTensorWrappersCleared() { return is_tensor_wrappers_cleared_; } void SetIsTensorWrappersCleared(bool is_tensor_wrappers_cleared) { diff --git a/paddle/fluid/eager/grad_tensor_holder.cc b/paddle/fluid/eager/grad_tensor_holder.cc index 64fb8b53b47..6abf759cdba 100644 --- a/paddle/fluid/eager/grad_tensor_holder.cc +++ b/paddle/fluid/eager/grad_tensor_holder.cc @@ -13,11 +13,11 @@ // limitations under the License. 
#include "paddle/fluid/eager/grad_tensor_holder.h" -#include "paddle/fluid/imperative/gradient_accumulator.h" #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/imperative/gradient_accumulator.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace egr { diff --git a/paddle/fluid/eager/hooks.h b/paddle/fluid/eager/hooks.h index 097150cf5ed..a98b3d9f8e4 100644 --- a/paddle/fluid/eager/hooks.h +++ b/paddle/fluid/eager/hooks.h @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/phi/api/include/tensor.h" namespace egr { diff --git a/paddle/fluid/eager/pylayer/py_layer_node.cc b/paddle/fluid/eager/pylayer/py_layer_node.cc index a00b292fe09..ec17a324b1e 100644 --- a/paddle/fluid/eager/pylayer/py_layer_node.cc +++ b/paddle/fluid/eager/pylayer/py_layer_node.cc @@ -13,18 +13,16 @@ // limitations under the License. #include "paddle/fluid/eager/pylayer/py_layer_node.h" -#include "paddle/fluid/eager/eager_tensor.h" - -#include "paddle/phi/api/all.h" -#include "paddle/phi/core/dense_tensor.h" +#include "glog/logging.h" +#include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" #include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/eager_utils.h" - -#include "glog/logging.h" +#include "paddle/phi/api/all.h" +#include "paddle/phi/core/dense_tensor.h" #pragma GCC diagnostic ignored "-Wattributes" #include "pybind11/pytypes.h" @@ -34,8 +32,7 @@ paddle::small_vector, GradNodePyLayer::operator()( paddle::small_vector, kSlotSmallVectorSize>& grads, // NOLINT - bool create_graph, - bool is_new_grad) { + bool create_graph, bool is_new_grad) { VLOG(3) << "Running Eager Backward Node: " << name(); paddle::small_vector, diff --git a/paddle/fluid/eager/pylayer/py_layer_node.h b/paddle/fluid/eager/pylayer/py_layer_node.h index c1a8c6e626b..998480bbfeb 100644 --- a/paddle/fluid/eager/pylayer/py_layer_node.h +++ b/paddle/fluid/eager/pylayer/py_layer_node.h @@ -38,8 +38,7 @@ class GradNodePyLayer : public GradNodeBase { kSlotSmallVectorSize> operator()(paddle::small_vector, kSlotSmallVectorSize>& grads, // NOLINT - bool create_graph = false, - bool is_new_grad = false) override; + bool create_graph = false, bool is_new_grad = false) override; void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } diff --git a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc index c159084d683..c53ffe823ab 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "paddle/fluid/eager/accumulation/accumulation_node.h" + #include #include "gtest/gtest.h" - -#include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/api/utils/hook_utils.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" diff --git a/paddle/fluid/eager/tests/data_structure_tests/autograd_meta_test.cc b/paddle/fluid/eager/tests/data_structure_tests/autograd_meta_test.cc index 48b4b9c5748..f7415dd1f71 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/autograd_meta_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/autograd_meta_test.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/eager/autograd_meta.h" + #include "glog/logging.h" #include "gtest/gtest.h" - -#include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h" diff --git a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc index edbb441f27a..a82965303af 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/eager/eager_tensor.h" + #include "glog/logging.h" #include "gtest/gtest.h" - -#include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/imperative/var_helper.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/common/layout.h" @@ -35,7 +35,7 @@ class AutogradMetaTest : public AbstractAutogradMeta { explicit AutogradMetaTest(int val) : val_(val) {} int val_ = 0; }; -} +} // namespace eager_test TEST(Tensor, Constructor) { paddle::experimental::Tensor et1 = paddle::experimental::Tensor(); paddle::experimental::Tensor et2 = paddle::experimental::Tensor("et2"); diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc index 6687b6621ad..63a4a72b631 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "paddle/fluid/eager/grad_node_info.h" + #include "glog/logging.h" #include "gtest/gtest.h" - #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/eager_tensor.h" -#include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h" #include "paddle/phi/api/lib/utils/allocator.h" @@ -85,8 +85,8 @@ void TestGradNodeBase(bool is_remove_gradient_hook) { CHECK_EQ(grad_test_node2->OutputMeta()[0].size(), size_t(1)); VLOG(6) << "Test Gradient Hook"; - auto gradient_hook = []( - const paddle::experimental::Tensor& et) -> paddle::experimental::Tensor { + auto gradient_hook = [](const paddle::experimental::Tensor& et) + -> paddle::experimental::Tensor { paddle::experimental::Tensor res; phi::DenseTensorMeta meta = phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({1, 1})); diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h index a00e629d102..eb9bd6007bf 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h @@ -14,7 +14,6 @@ #pragma once #include "glog/logging.h" #include "gtest/gtest.h" - #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" @@ -35,8 +34,7 @@ class GradTestNode : public egr::GradNodeBase { egr::kSlotSmallVectorSize> operator()(paddle::small_vector, egr::kSlotSmallVectorSize>& grads, // NOLINT - bool create_graph = false, - bool is_new_grad = false) override { + bool create_graph = false, bool is_new_grad = false) override { val_ = std::dynamic_pointer_cast(grads[0][0].impl()) ->data()[0]; phi::DenseTensorMeta meta = diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc index 0fe349294b4..17f593e2490 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc @@ -12,17 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/eager/grad_tensor_holder.h" + #include #include "gtest/gtest.h" - #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" -#include "paddle/fluid/eager/grad_tensor_holder.h" #include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/core/selected_rows.h" - #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/selected_rows.h" PD_DECLARE_KERNEL(full_like, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); diff --git a/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc b/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc index 28c3472f90d..8813f364840 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "paddle/fluid/eager/tensor_wrapper.h" + #include "glog/logging.h" #include "gtest/gtest.h" - -#include "paddle/fluid/eager/tensor_wrapper.h" #include "paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h" #include "paddle/fluid/eager/utils.h" diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc index 056c7102f66..3b0e6a3fdb6 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc @@ -15,19 +15,17 @@ // Eager Dygraph #include + #include #include "gtest/gtest.h" -#include "paddle/fluid/platform/flags.h" - #include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/backward.h" - -#include "paddle/fluid/imperative/tracer.h" - #include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h" #include "paddle/fluid/eager/tests/test_utils.h" +#include "paddle/fluid/imperative/tracer.h" +#include "paddle/fluid/platform/flags.h" #ifdef WITH_GPERFTOOLS #include "gperftools/profiler.h" diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc index 287d6e770de..5dd5cde548f 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc @@ -14,19 +14,17 @@ // Eager Dygraph #include + #include #include "gtest/gtest.h" -#include "paddle/fluid/platform/flags.h" - #include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/backward.h" - -#include "paddle/fluid/imperative/tracer.h" - #include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h" #include "paddle/fluid/eager/tests/test_utils.h" +#include "paddle/fluid/imperative/tracer.h" +#include "paddle/fluid/platform/flags.h" #ifdef WITH_GPERFTOOLS #include "gperftools/profiler.h" diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc index b4b47a85f66..bf1d955b900 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc @@ -23,7 +23,6 @@ #include "glog/logging.h" #include "gtest/gtest.h" - #include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h" #include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/fluid/imperative/basic_engine.h" diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc index d9afd7cc965..0cd33a72e1a 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc @@ -23,7 +23,6 @@ #include "glog/logging.h" #include "gtest/gtest.h" - #include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h" #include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/fluid/imperative/basic_engine.h" diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h index 86bf13707ed..5b37e973f1d 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h @@ -15,6 +15,7 @@ #pragma once #include + 
#include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/imperative/layer.h" #include "paddle/phi/api/all.h" diff --git a/paddle/fluid/eager/tests/task_tests/backward_test.cc b/paddle/fluid/eager/tests/task_tests/backward_test.cc index 7552ad83fa2..c6d4514fa8e 100644 --- a/paddle/fluid/eager/tests/task_tests/backward_test.cc +++ b/paddle/fluid/eager/tests/task_tests/backward_test.cc @@ -12,25 +12,22 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/eager/backward.h" + #include #include "glog/logging.h" #include "gtest/gtest.h" - #include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h" #include "paddle/fluid/eager/api/utils/tensor_utils.h" #include "paddle/fluid/eager/autograd_meta.h" -#include "paddle/fluid/eager/backward.h" #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/tests/test_utils.h" - -#include "paddle/fluid/eager/api/all.h" - #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/tensor_meta.h" - #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); diff --git a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc index 4337c0d092c..847c082a301 100644 --- a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc +++ b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc @@ -16,22 +16,17 @@ #include "glog/logging.h" #include "gtest/gtest.h" - #include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h" #include "paddle/fluid/eager/api/utils/tensor_utils.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/backward.h" #include "paddle/fluid/eager/grad_node_info.h" - -#include "paddle/fluid/eager/api/all.h" - -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/tensor_meta.h" - #include "paddle/fluid/eager/tests/test_utils.h" - +#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); diff --git a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc index 551262d259e..e4ca8dd164b 100644 --- a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc +++ b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc @@ -15,14 +15,12 @@ #include #include "gtest/gtest.h" - #include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h" #include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/fluid/eager/utils.h" - #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc b/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc index 4cb316380aa..ebf396bebfa 100644 --- a/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc +++ 
b/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc @@ -16,18 +16,15 @@ #include "glog/logging.h" #include "gtest/gtest.h" - #include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h" #include "paddle/fluid/eager/api/utils/tensor_utils.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/tests/test_utils.h" - #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/tensor_meta.h" - #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); diff --git a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc index 1f8fdb7de0c..a4da315f44a 100644 --- a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc +++ b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc @@ -16,21 +16,17 @@ #include "glog/logging.h" #include "gtest/gtest.h" - #include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/backward.h" #include "paddle/fluid/eager/grad_node_info.h" - -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/tensor_meta.h" - #include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/tests/test_utils.h" - +#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); diff --git a/paddle/fluid/eager/tests/task_tests/generated_test.cc b/paddle/fluid/eager/tests/task_tests/generated_test.cc index 3c237b76e64..b53cdf55d43 100644 --- a/paddle/fluid/eager/tests/task_tests/generated_test.cc +++ b/paddle/fluid/eager/tests/task_tests/generated_test.cc @@ -17,17 +17,14 @@ #include #include "gtest/gtest.h" - #include "paddle/fluid/eager/api/all.h" +#include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" #include "paddle/fluid/eager/api/utils/tensor_utils.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/backward.h" -#include "paddle/fluid/eager/utils.h" - #include "paddle/fluid/eager/tests/test_utils.h" +#include "paddle/fluid/eager/utils.h" #include "paddle/fluid/imperative/tracer.h" - -#include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" #include "paddle/phi/core/kernel_registry.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); diff --git a/paddle/fluid/eager/tests/task_tests/grad_test.cc b/paddle/fluid/eager/tests/task_tests/grad_test.cc index 72a94b40ed7..8d6c4d7843f 100644 --- a/paddle/fluid/eager/tests/task_tests/grad_test.cc +++ b/paddle/fluid/eager/tests/task_tests/grad_test.cc @@ -16,17 +16,14 @@ #include "glog/logging.h" #include "gtest/gtest.h" - #include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h" #include "paddle/fluid/eager/api/utils/tensor_utils.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/backward.h" #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/tests/test_utils.h" - -#include "paddle/fluid/eager/api/all.h" - #include "paddle/phi/core/dense_tensor.h" 
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" diff --git a/paddle/fluid/eager/tests/task_tests/hook_test.cc b/paddle/fluid/eager/tests/task_tests/hook_test.cc index d7b887b28bd..badbe871597 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test.cc @@ -16,22 +16,17 @@ #include "glog/logging.h" #include "gtest/gtest.h" - #include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/backward.h" #include "paddle/fluid/eager/grad_node_info.h" - -#include "paddle/fluid/eager/api/all.h" - -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/tensor_meta.h" - #include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/tests/test_utils.h" - +#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); diff --git a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc index c4d4ff91106..dbe2c138945 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc @@ -15,16 +15,14 @@ #include #include "gtest/gtest.h" - #include "paddle/fluid/eager/api/all.h" +#include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" #include "paddle/fluid/eager/backward.h" #include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/phi/core/dense_tensor.h" - -#include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" -#include "paddle/fluid/eager/hooks.h" #include "paddle/phi/core/kernel_registry.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); diff --git a/paddle/fluid/eager/tests/task_tests/nan_inf_utils_test.cc b/paddle/fluid/eager/tests/task_tests/nan_inf_utils_test.cc index be0563fbeed..73d213f7114 100644 --- a/paddle/fluid/eager/tests/task_tests/nan_inf_utils_test.cc +++ b/paddle/fluid/eager/tests/task_tests/nan_inf_utils_test.cc @@ -12,13 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/eager/nan_inf_utils.h" + #include #include #include #include "gtest/gtest.h" - -#include "paddle/fluid/eager/nan_inf_utils.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/phi/api/include/api.h" diff --git a/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc b/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc index 24e5da06011..aeddeb6fae7 100644 --- a/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc +++ b/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc @@ -12,17 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "paddle/fluid/eager/api/utils/tensor_utils.h" + #include #include "gtest/gtest.h" - -#include "paddle/fluid/eager/api/utils/tensor_utils.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/grad_tensor_holder.h" #include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/phi/api/lib/utils/allocator.h" - #include "paddle/phi/core/kernel_registry.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); diff --git a/paddle/fluid/eager/tests/test_utils.h b/paddle/fluid/eager/tests/test_utils.h index 47bfe9a7cab..cb1e531d82d 100644 --- a/paddle/fluid/eager/tests/test_utils.h +++ b/paddle/fluid/eager/tests/test_utils.h @@ -18,14 +18,12 @@ #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/utils.h" - -#include "paddle/phi/api/all.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/tensor_meta.h" - #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/init.h" +#include "paddle/phi/api/all.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/tensor_meta.h" namespace eager_test { diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index 5a730e4dbf1..3254b3bf892 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -17,7 +17,6 @@ #include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/tensor_wrapper.h" - #include "paddle/fluid/operators/run_program_op.h" #include "paddle/fluid/platform/enforce.h" @@ -273,7 +272,7 @@ inline void RunProgramGradAPI( const paddle::framework::AttributeMap &attrs, std::vector &x_grad, // NOLINT std::vector ¶ms_grad // NOLINT - ) { +) { // if all output vars are set to stop_gradient, grad op no need to executed if (x_grad.empty() && params_grad.empty()) return; @@ -368,8 +367,7 @@ class GradNodeRunProgram : public egr::GradNodeBase { egr::kSlotSmallVectorSize> operator()(paddle::small_vector, egr::kSlotSmallVectorSize> &grads, // NOLINT - bool create_graph, - bool is_new_grad) override { + bool create_graph, bool is_new_grad) override { VLOG(3) << "Running Eager Backward Node: GradNodeRunProgram"; paddle::small_vector, egr::kSlotSmallVectorSize> diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h index c6389e99831..783afcc1e2c 100644 --- a/paddle/fluid/eager/utils.h +++ b/paddle/fluid/eager/utils.h @@ -18,7 +18,6 @@ #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" - #include "paddle/phi/api/all.h" namespace egr { @@ -161,10 +160,11 @@ class EagerUtils { if (require_any_grad && autograd_meta) { PADDLE_ENFORCE_EQ(!autograd_meta->StopGradient() && egr::egr_utils_api::IsLeafTensor(target), - false, paddle::platform::errors::InvalidArgument( - "Leaf Var (%s) that doesn't stop gradient " - "can't use inplace strategy.", - target.name())); + false, + paddle::platform::errors::InvalidArgument( + "Leaf Var (%s) that doesn't stop gradient " + "can't use inplace strategy.", + target.name())); } } @@ -234,8 +234,8 @@ class EagerUtils { const paddle::experimental::Tensor& tensor); /** - * Fill Zero - * **/ + * Fill Zero + * **/ static void FillZeroForEmptyOptionalGradInput( std::vector* in_grads, const std::vector& 
grad_in_metas); diff --git a/paddle/fluid/framework/archive.h b/paddle/fluid/framework/archive.h index d0589383863..6a8f4ff47f3 100644 --- a/paddle/fluid/framework/archive.h +++ b/paddle/fluid/framework/archive.h @@ -20,6 +20,7 @@ #endif #include + #include #include #include @@ -31,6 +32,7 @@ #include #include #include + #include "paddle/fluid/framework/expect.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index ae3d8379bdb..d6cc5dc639f 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -13,11 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/async_executor.h" + +#include "gflags/gflags.h" #include "google/protobuf/io/zero_copy_stream_impl.h" #include "google/protobuf/message.h" #include "google/protobuf/text_format.h" - -#include "gflags/gflags.h" #include "paddle/fluid/framework/data_feed_factory.h" #include "paddle/fluid/framework/executor_thread_worker.h" #include "paddle/fluid/framework/feed_fetch_method.h" diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h index b0c6c8a0164..01daf3c1118 100644 --- a/paddle/fluid/framework/async_executor.h +++ b/paddle/fluid/framework/async_executor.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include #include #include // NOLINT @@ -24,6 +25,7 @@ limitations under the License. */ #include // NOLINT #include #include + #include "paddle/fluid/framework/data_feed.pb.h" #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/executor.h" diff --git a/paddle/fluid/framework/attribute.h b/paddle/fluid/framework/attribute.h index 2164a21f3f8..b2c5bfde3aa 100644 --- a/paddle/fluid/framework/attribute.h +++ b/paddle/fluid/framework/attribute.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include #include #include diff --git a/paddle/fluid/framework/attribute_test.cc b/paddle/fluid/framework/attribute_test.cc index 27a6afb49f5..8a47e41d383 100644 --- a/paddle/fluid/framework/attribute_test.cc +++ b/paddle/fluid/framework/attribute_test.cc @@ -12,13 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/attribute.h" + #include #include -#include "paddle/fluid/framework/attribute.h" -#include "paddle/fluid/framework/program_desc.h" - #include "gtest/gtest.h" +#include "paddle/fluid/framework/program_desc.h" #include "paddle/utils/any.h" TEST(Attribute, GetAttrValueToAny) { diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h index 80fee94f1c8..1eb3585fa33 100644 --- a/paddle/fluid/framework/channel.h +++ b/paddle/fluid/framework/channel.h @@ -20,6 +20,7 @@ #endif #include + #include #include // NOLINT #include @@ -28,6 +29,7 @@ #include // NOLINT #include #include + #include "paddle/fluid/framework/expect.h" namespace paddle { diff --git a/paddle/fluid/framework/convert_utils_test.cc b/paddle/fluid/framework/convert_utils_test.cc index 140806dfd7c..e3f5a4a8dcd 100644 --- a/paddle/fluid/framework/convert_utils_test.cc +++ b/paddle/fluid/framework/convert_utils_test.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/framework/convert_utils.h" + #include "gtest/gtest.h" namespace phi { diff --git a/paddle/fluid/framework/copy_same_tensor_test.cc b/paddle/fluid/framework/copy_same_tensor_test.cc index d8c27ad280d..d4f36be5e87 100644 --- a/paddle/fluid/framework/copy_same_tensor_test.cc +++ b/paddle/fluid/framework/copy_same_tensor_test.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include #include "gflags/gflags.h" diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index 65c41e19ac4..0130fd4b57f 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -867,43 +867,43 @@ void RegisterOperatorWithMetaInfo(const std::vector& op_meta_infos, bool is_double_grad = (i == 2); // GradOpDescMaker - info.grad_op_maker_ = [grad_op_name, grad_op_inputs, grad_op_outputs, - is_double_grad]( - const OpDesc& fwd_op, - const std::unordered_set& no_grad_set, - std::unordered_map* grad_to_var, - const std::vector& grad_block) { - CustomGradOpMaker maker( - fwd_op, no_grad_set, grad_to_var, grad_block, grad_op_name, - grad_op_inputs, grad_op_outputs, is_double_grad); - return maker(); - }; + info.grad_op_maker_ = + [grad_op_name, grad_op_inputs, grad_op_outputs, is_double_grad]( + const OpDesc& fwd_op, + const std::unordered_set& no_grad_set, + std::unordered_map* grad_to_var, + const std::vector& grad_block) { + CustomGradOpMaker maker( + fwd_op, no_grad_set, grad_to_var, grad_block, grad_op_name, + grad_op_inputs, grad_op_outputs, is_double_grad); + return maker(); + }; // GradOpBaseMaker - info.dygraph_grad_op_maker_ = [grad_op_name, grad_op_inputs, - grad_op_outputs, is_double_grad]( - const std::string& type, - const imperative::NameVarBaseMap& var_base_map_in, - const imperative::NameVarBaseMap& var_base_map_out, - const framework::AttributeMap& attrs, - const framework::AttributeMap& default_attrs, - const std::map& inplace_map) { - CustomGradOpMaker maker( - type, var_base_map_in, var_base_map_out, attrs, inplace_map, - grad_op_name, grad_op_inputs, grad_op_outputs, is_double_grad); - maker.SetDygraphDefaultAttrsMap(default_attrs); - return maker(); - }; + info.dygraph_grad_op_maker_ = + [grad_op_name, grad_op_inputs, grad_op_outputs, is_double_grad]( + const std::string& type, + const imperative::NameVarBaseMap& var_base_map_in, + const imperative::NameVarBaseMap& var_base_map_out, + const framework::AttributeMap& attrs, + const framework::AttributeMap& default_attrs, + const std::map& inplace_map) { + CustomGradOpMaker maker( + type, var_base_map_in, var_base_map_out, attrs, inplace_map, + grad_op_name, grad_op_inputs, grad_op_outputs, is_double_grad); + maker.SetDygraphDefaultAttrsMap(default_attrs); + return maker(); + }; /* Grad op register */ OpInfo grad_info; // Grad Op - grad_info.creator_ = []( - const std::string& type, const VariableNameMap& inputs, - const VariableNameMap& outputs, const AttributeMap& attrs) { - return new CustomOperator(type, inputs, outputs, attrs); - }; + grad_info.creator_ = + [](const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, const AttributeMap& attrs) { + return new CustomOperator(type, inputs, outputs, attrs); + }; // Grad InferShape if (grad_infer_shape_fn == nullptr) { diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu index 4757eb60f43..d51707970ff 100644 --- a/paddle/fluid/framework/data_device_transform_test.cu +++ 
b/paddle/fluid/framework/data_device_transform_test.cu @@ -13,18 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "gtest/gtest.h" - #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/init.h" #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/fluid/framework/phi_utils.h" - namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc old mode 100755 new mode 100644 index 0801aa0e56a..1808caddabc --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -18,6 +18,7 @@ limitations under the License. */ #endif #include "paddle/fluid/framework/data_feed.h" + #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #ifdef _LINUX #include @@ -231,8 +232,9 @@ bool DataFeed::PickOneFile(std::string* filename) { } void DataFeed::CheckInit() { - PADDLE_ENFORCE_EQ(finish_init_, true, platform::errors::PreconditionNotMet( - "DataFeed initialization failed.")); + PADDLE_ENFORCE_EQ( + finish_init_, true, + platform::errors::PreconditionNotMet("DataFeed initialization failed.")); } void DataFeed::CheckSetFileList() { @@ -1619,9 +1621,10 @@ template class PrivateInstantDataFeed>; bool MultiSlotFileInstantDataFeed::Preprocess(const std::string& filename) { fd_ = open(filename.c_str(), O_RDONLY); PADDLE_ENFORCE_NE( - fd_, -1, platform::errors::Unavailable( - "Fail to open file: %s in MultiSlotFileInstantDataFeed.", - filename.c_str())); + fd_, -1, + platform::errors::Unavailable( + "Fail to open file: %s in MultiSlotFileInstantDataFeed.", + filename.c_str())); struct stat sb; fstat(fd_, &sb); @@ -2182,7 +2185,7 @@ void SlotRecordInMemoryDataFeed::LoadIntoMemoryByLine(void) { SlotRecordPool().get(&record_vec, OBJPOOL_BLOCK_SIZE); // get slotrecord object function auto record_func = [this, &offset, &record_vec, &old_offset]( - std::vector& vec, int num) { + std::vector& vec, int num) { vec.resize(num); if (offset + num > OBJPOOL_BLOCK_SIZE) { input_channel_->WriteMove(offset, &record_vec[0]); @@ -2675,8 +2678,8 @@ void SlotRecordInMemoryDataFeed::BuildSlotBatchGPU(const int ins_num) { size_t* off_start_ptr = &offsets[j * offset_cols_size]; int total_instance = static_cast(off_start_ptr[offset_cols_size - 1]); - CHECK(total_instance >= 0) << "slot idx:" << j - << ", total instance:" << total_instance; + CHECK(total_instance >= 0) + << "slot idx:" << j << ", total instance:" << total_instance; auto& info = used_slots_info_[j]; // fill slot value with default value 0 diff --git a/paddle/fluid/framework/data_feed_factory.cc b/paddle/fluid/framework/data_feed_factory.cc index e46e4aeb012..e058b194690 100644 --- a/paddle/fluid/framework/data_feed_factory.cc +++ b/paddle/fluid/framework/data_feed_factory.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_feed_factory.h" #include + #include #include diff --git a/paddle/fluid/framework/data_feed_test.cc b/paddle/fluid/framework/data_feed_test.cc index 2cc441bbd34..8375ed80e83 100644 --- a/paddle/fluid/framework/data_feed_test.cc +++ b/paddle/fluid/framework/data_feed_test.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/framework/data_feed.h" + #include + #include // NOLINT #include #include @@ -23,6 +25,7 @@ #include // NOLINT #include #include + #include "google/protobuf/io/zero_copy_stream_impl.h" #include "google/protobuf/text_format.h" #include "gtest/gtest.h" diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc index 0c762ab2e77..f89d0f969ab 100644 --- a/paddle/fluid/framework/data_set.cc +++ b/paddle/fluid/framework/data_set.cc @@ -13,6 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/framework/data_set.h" + #include "google/protobuf/text_format.h" #if (defined PADDLE_WITH_DISTRIBUTE) && (defined PADDLE_WITH_PSCORE) #include "paddle/fluid/distributed/index_dataset/index_sampler.h" diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h index 3d096eaebe3..5d961841a25 100644 --- a/paddle/fluid/framework/data_set.h +++ b/paddle/fluid/framework/data_set.h @@ -15,6 +15,7 @@ #pragma once #include + #include #include #include // NOLINT @@ -26,6 +27,7 @@ #include #ifdef PADDLE_WITH_GLOO #include + #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif diff --git a/paddle/fluid/framework/data_type_test.cc b/paddle/fluid/framework/data_type_test.cc index 15cf30c1cf3..01802c11d52 100644 --- a/paddle/fluid/framework/data_type_test.cc +++ b/paddle/fluid/framework/data_type_test.cc @@ -44,8 +44,8 @@ TEST(DataType, float16) { TEST(DataType, bfloat16) { using paddle::framework::Tensor; - using paddle::platform::CPUPlace; using paddle::platform::bfloat16; + using paddle::platform::CPUPlace; namespace f = paddle::framework; f::proto::VarType::Type dtype = f::proto::VarType::BF16; diff --git a/paddle/fluid/framework/data_type_transform_test.cu b/paddle/fluid/framework/data_type_transform_test.cu index 4fab3a78454..3420298297b 100644 --- a/paddle/fluid/framework/data_type_transform_test.cu +++ b/paddle/fluid/framework/data_type_transform_test.cu @@ -12,11 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "gtest/gtest.h" #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/tensor_util.h" -#include "gtest/gtest.h" - TEST(DataTypeTransform, GPUTransform) { auto cpu_place = paddle::platform::CPUPlace(); auto gpu_place = paddle::platform::CUDAPlace(0); diff --git a/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc index 75baf15dc5e..ebdf66cdde1 100644 --- a/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
#include "paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.h" + #include #include #include #include #include #include + #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/fetch_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" diff --git a/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.h index 5e973f13cc6..c907a4b4afc 100644 --- a/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.h @@ -14,12 +14,14 @@ #pragma once #include + #include // NOLINT #include #include // NOLINT #include #include #include + #include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/details/execution_strategy.h" diff --git a/paddle/fluid/framework/details/bkcl_op_handle.h b/paddle/fluid/framework/details/bkcl_op_handle.h index 1a098f06f08..b0c2275b3a5 100644 --- a/paddle/fluid/framework/details/bkcl_op_handle.h +++ b/paddle/fluid/framework/details/bkcl_op_handle.h @@ -14,8 +14,6 @@ #pragma once -#include "xpu/bkcl.h" - #include #include #include @@ -24,6 +22,7 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" +#include "xpu/bkcl.h" DECLARE_bool(sync_bkcl_allreduce); diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index fdf74d2f769..9ed76c87d84 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/fluid/framework/details/build_strategy.h" #include + #include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/ir/graph_printer.h" #include "paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h" diff --git a/paddle/fluid/framework/details/build_strategy_test.cc b/paddle/fluid/framework/details/build_strategy_test.cc index 69af77d23fb..1914c1d33de 100644 --- a/paddle/fluid/framework/details/build_strategy_test.cc +++ b/paddle/fluid/framework/details/build_strategy_test.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/details/build_strategy.h" + #include #include #include @@ -23,8 +25,6 @@ #include "gtest/gtest-test-part.h" #include "gtest/gtest.h" #include "gtest/gtest_pred_impl.h" - -#include "paddle/fluid/framework/details/build_strategy.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/var_type_inference.h" diff --git a/paddle/fluid/framework/details/cow_ptr_test.cc b/paddle/fluid/framework/details/cow_ptr_test.cc index 5b055d7cb4d..b440da9f1df 100644 --- a/paddle/fluid/framework/details/cow_ptr_test.cc +++ b/paddle/fluid/framework/details/cow_ptr_test.cc @@ -13,6 +13,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/details/cow_ptr.h" + #include "gtest/gtest.h" namespace paddle { diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index 7f51de435ba..57440ed9aa2 100644 --- a/paddle/fluid/framework/details/execution_strategy.h +++ b/paddle/fluid/framework/details/execution_strategy.h @@ -14,6 +14,7 @@ #pragma once #include // for size_t + #include "paddle/fluid/platform/device_context.h" namespace paddle { diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h index 4477702900a..19b00615715 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h @@ -14,10 +14,12 @@ #pragma once #include + #include #include #include #include + #include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/details/execution_strategy.h" diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc index f4ca4907d48..7f44e68af6b 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -325,9 +325,10 @@ void FusedAllReduceOpHandle::GetGradLoDTensor( PADDLE_ENFORCE_EQ( platform::is_same_place(lod_tensor.place(), places_.at(scope_idx)), - true, platform::errors::InvalidArgument( - "The variable '%s' at scope %d is not in the right place.", - var_name, scope_idx)); + true, + platform::errors::InvalidArgument( + "The variable '%s' at scope %d is not in the right place.", + var_name, scope_idx)); grad_tensor->emplace_back(std::make_pair(var_name, &lod_tensor)); } } @@ -356,10 +357,11 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel( // Get element number int64_t len = grad_tensor.at(i).second->numel(); PADDLE_ENFORCE_GT( - len, 0, platform::errors::InvalidArgument( - "The size of grad tensors of fused_all_reduce_op_handle " - "must be > 0, but got %d.", - len)); + len, 0, + platform::errors::InvalidArgument( + "The size of grad tensors of fused_all_reduce_op_handle " + "must be > 0, but got %d.", + len)); *numel += platform::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype; } diff --git a/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc b/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc index 44b9ca90fc5..18de9f443a7 100644 --- a/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
#include "paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.h" + #include "paddle/fluid/platform/profiler/event_tracing.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) diff --git a/paddle/fluid/framework/details/graph_test_base.h b/paddle/fluid/framework/details/graph_test_base.h index d139f848830..08d9c999a8a 100644 --- a/paddle/fluid/framework/details/graph_test_base.h +++ b/paddle/fluid/framework/details/graph_test_base.h @@ -18,6 +18,7 @@ #include #include #include + #include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/ir/graph.h" diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index e6790de92d0..7b93baddb4a 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -12,8 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/framework/details/nan_inf_utils_detail.h" + +#include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/scope.h" @@ -261,7 +262,7 @@ void CheckNanInf>( } template <> - void CheckNanInf>> + void CheckNanInf < paddle::platform::complex < double >>> (const paddle::platform::complex* value, const size_t numel, int print_num, const std::string& op_type, const std::string& var_name) { double real_sum = 0.0; @@ -563,8 +564,9 @@ static void NPUCheckOpHasNanOrInf(const framework::OperatorBase& op, if (sum >= 1.0) PrintNPUOpValueInfo(op, scope, place); - PADDLE_ENFORCE_LT(sum, 1.0, platform::errors::PreconditionNotMet( - "Operator %s contains Nan/Inf.", op.Type())); + PADDLE_ENFORCE_LT(sum, 1.0, + platform::errors::PreconditionNotMet( + "Operator %s contains Nan/Inf.", op.Type())); } #endif diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu index 7cf11f7829d..b8b5537c93c 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu @@ -12,15 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/details/nan_inf_utils.h" -#include "paddle/fluid/framework/details/nan_inf_utils_detail.h" - #include #include #include #include #include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/details/nan_inf_utils.h" +#include "paddle/fluid/framework/details/nan_inf_utils_detail.h" #include "paddle/fluid/framework/scope.h" namespace paddle { diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h index 427b981e7cd..213d7033764 100644 --- a/paddle/fluid/framework/details/op_registry.h +++ b/paddle/fluid/framework/details/op_registry.h @@ -213,14 +213,14 @@ struct OpInfoFiller { platform::errors::AlreadyExists( "GradOpDescMaker of %s has been registered", op_type)); - info->grad_op_maker_ = []( - const OpDesc& fwd_op, - const std::unordered_set& no_grad_set, - std::unordered_map* grad_to_var, - const std::vector& grad_block) { - T maker(fwd_op, no_grad_set, grad_to_var, grad_block); - return maker(); - }; + info->grad_op_maker_ = + [](const OpDesc& fwd_op, + const std::unordered_set& no_grad_set, + std::unordered_map* grad_to_var, + const std::vector& grad_block) { + T maker(fwd_op, no_grad_set, grad_to_var, grad_block); + return maker(); + }; info->use_default_grad_op_desc_maker_ = std::is_base_of, T>::value || @@ -244,17 +244,17 @@ struct OpInfoFiller { platform::errors::AlreadyExists( "GradOpBaseMaker of %s has been registered", op_type)); - info->dygraph_grad_op_maker_ = []( - const std::string& type, - const imperative::NameVarBaseMap& var_base_map_in, - const imperative::NameVarBaseMap& var_base_map_out, - const framework::AttributeMap& attrs, - const framework::AttributeMap& default_attrs, - const std::map& inplace_map) { - T maker(type, var_base_map_in, var_base_map_out, attrs, inplace_map); - maker.SetDygraphDefaultAttrsMap(default_attrs); - return maker(); - }; + info->dygraph_grad_op_maker_ = + [](const std::string& type, + const imperative::NameVarBaseMap& var_base_map_in, + const imperative::NameVarBaseMap& var_base_map_out, + const framework::AttributeMap& attrs, + const framework::AttributeMap& default_attrs, + const std::map& inplace_map) { + T maker(type, var_base_map_in, var_base_map_out, attrs, inplace_map); + maker.SetDygraphDefaultAttrsMap(default_attrs); + return maker(); + }; } }; diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index 936e84a6c82..22c27fe86f1 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -90,10 +90,9 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( const std::vector &places, ir::Graph *graph) // TODO(Yancey1989): Copying graphs is not safely since it deleted the // attrs. 
- : ParallelSSAGraphExecutor(strategy, local_scopes, local_exec_scopes, - places, - SeparateMultiDevicesGraph(graph, - places.size())) {} + : ParallelSSAGraphExecutor( + strategy, local_scopes, local_exec_scopes, places, + SeparateMultiDevicesGraph(graph, places.size())) {} ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( const ExecutionStrategy &strategy, const std::vector &local_scopes, diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h index d9d83efcb8e..88c8b1cbfb2 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h @@ -17,6 +17,7 @@ #include #include #include + #include "ThreadPool.h" #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index 2ae3880ab3c..799005e4b09 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -245,14 +245,15 @@ void ReduceOpHandle::RunImpl() { int type = platform::ToBKCLDataType( framework::TransToProtoVarType(lod_tensor.dtype())); size_t numel = static_cast(lod_tensor.numel()); - all_reduce_calls.emplace_back([buffer, recvbuffer, type, numel, root_id, - &bkcl_ctx] { - PADDLE_ENFORCE_EQ(bkcl_reduce(bkcl_ctx.comm(), buffer, recvbuffer, - numel, static_cast(type), - BKCL_ADD, root_id, nullptr), - BKCL_SUCCESS, platform::errors::Unavailable( - "bkcl_all_reduce failed")); - }); + all_reduce_calls.emplace_back( + [buffer, recvbuffer, type, numel, root_id, &bkcl_ctx] { + PADDLE_ENFORCE_EQ( + bkcl_reduce(bkcl_ctx.comm(), buffer, recvbuffer, numel, + static_cast(type), BKCL_ADD, + root_id, nullptr), + BKCL_SUCCESS, + platform::errors::Unavailable("bkcl_all_reduce failed")); + }); } WaitInputVarGenerated(); diff --git a/paddle/fluid/framework/details/rpc_op_handle.cc b/paddle/fluid/framework/details/rpc_op_handle.cc index 39bcf1d0f38..35373e1a709 100644 --- a/paddle/fluid/framework/details/rpc_op_handle.cc +++ b/paddle/fluid/framework/details/rpc_op_handle.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/details/rpc_op_handle.h" + #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/platform/profiler/event_tracing.h" diff --git a/paddle/fluid/framework/details/scope_buffered_monitor.cc b/paddle/fluid/framework/details/scope_buffered_monitor.cc index 57faf0e75ba..bd1a4378f07 100644 --- a/paddle/fluid/framework/details/scope_buffered_monitor.cc +++ b/paddle/fluid/framework/details/scope_buffered_monitor.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/framework/details/scope_buffered_monitor.h" + #include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index ea5a3c07957..091224f1e59 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -14,6 +14,7 @@ #pragma once #include + #include #include #include @@ -21,6 +22,7 @@ #include #include #include + #include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/scope_buffered_monitor.h" diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc index 7e63c5ffb9a..28a5c31f644 100644 --- a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc @@ -41,8 +41,9 @@ SparseAllReduceOpHandle::SparseAllReduceOpHandle( is_encoded_(is_encoded), nranks_(nranks) { // TODO(gongwb) :polish them! - PADDLE_ENFORCE_EQ(is_encoded, true, platform::errors::InvalidArgument( - "The argument is_encoded is false.")); + PADDLE_ENFORCE_EQ( + is_encoded, true, + platform::errors::InvalidArgument("The argument is_encoded is false.")); VLOG(1) << "Use dgc allreduce mode" << ", nranks:" << nranks_; @@ -193,11 +194,12 @@ void SparseAllReduceOpHandle::RunImplEncoded() { sparse_reduce_calls.emplace_back([=] { platform::CUDADeviceGuard guard(dev_id); - PADDLE_ENFORCE_EQ(paddle::communication::dgc::sparseReduce( - gather_buff, k, out_tensor_buf, - static_cast(out_numel), nranks_, stream), - true, platform::errors::Unavailable( - "Calling sparseReduce() failed.")); + PADDLE_ENFORCE_EQ( + paddle::communication::dgc::sparseReduce( + gather_buff, k, out_tensor_buf, static_cast(out_numel), + nranks_, stream), + true, + platform::errors::Unavailable("Calling sparseReduce() failed.")); }); } diff --git a/paddle/fluid/framework/device_worker.cc b/paddle/fluid/framework/device_worker.cc index 88026143683..56cd12f5001 100644 --- a/paddle/fluid/framework/device_worker.cc +++ b/paddle/fluid/framework/device_worker.cc @@ -190,9 +190,10 @@ void DeviceWorker::DumpField(const Scope& scope, int dump_mode, tensor = &cpu_tensor; } if (!CheckValidOutput(tensor, batch_size)) { - VLOG(0) << "Note: field[" << field << "] cannot pass check, so it was " - "skipped. Maybe the dimension is " - "wrong "; + VLOG(0) << "Note: field[" << field + << "] cannot pass check, so it was " + "skipped. Maybe the dimension is " + "wrong "; continue; } for (size_t i = 0; i < batch_size; ++i) { diff --git a/paddle/fluid/framework/device_worker_factory.cc b/paddle/fluid/framework/device_worker_factory.cc index e6635a2f941..c973afd1560 100644 --- a/paddle/fluid/framework/device_worker_factory.cc +++ b/paddle/fluid/framework/device_worker_factory.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/device_worker_factory.h" #include + #include #include diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index 20d08ef18ae..7e1f740bcc2 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
#include "paddle/fluid/framework/dlpack_tensor.h" + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/data_type.h" diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc index 829908bd982..6c19cf3450d 100644 --- a/paddle/fluid/framework/dlpack_tensor_test.cc +++ b/paddle/fluid/framework/dlpack_tensor_test.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/dlpack_tensor.h" + #include #include @@ -39,7 +40,7 @@ constexpr uint8_t GetDLDataTypeCode() { : (std::is_integral::value ? static_cast(kDLInt) : static_cast(-1))); } -} // NOLINT +} // namespace template void TestMain(const platform::Place &place, uint16_t lanes) { diff --git a/paddle/fluid/framework/downpour_lite_worker.cc b/paddle/fluid/framework/downpour_lite_worker.cc index 7344c93ef06..8ceffe58dcf 100644 --- a/paddle/fluid/framework/downpour_lite_worker.cc +++ b/paddle/fluid/framework/downpour_lite_worker.cc @@ -202,15 +202,15 @@ void DownpourLiteWorker::CopyDenseVars() { Variable* src_var = thread_scope_->FindVar(src_var_name); CHECK(src_var != nullptr) << src_var_name << " not found"; // NOLINT LoDTensor* src_tensor = src_var->GetMutable(); - CHECK(src_tensor != nullptr) << src_var_name - << " tensor is null"; // NOLINT + CHECK(src_tensor != nullptr) + << src_var_name << " tensor is null"; // NOLINT float* src_data = src_tensor->data(); Variable* dest_var = thread_scope_->FindVar(dest_var_name); CHECK(dest_var != nullptr) << dest_var_name << " not found"; // NOLINT LoDTensor* dest_tensor = dest_var->GetMutable(); - CHECK(dest_tensor != nullptr) << dest_var_name - << " tensor is null"; // NOLINT + CHECK(dest_tensor != nullptr) + << dest_var_name << " tensor is null"; // NOLINT float* dest_data = dest_tensor->data(); CHECK(src_tensor->numel() == dest_tensor->numel()) diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc index 06c3d18af84..c14b48ef8a7 100644 --- a/paddle/fluid/framework/downpour_worker.cc +++ b/paddle/fluid/framework/downpour_worker.cc @@ -155,8 +155,8 @@ void DownpourWorker::CollectLabelInfo(size_t table_idx) { continue; } LoDTensor* tensor = fea_var->GetMutable(); - CHECK(tensor != nullptr) << "tensor of var " - << sparse_key_names_[table_id][i] << " is null"; + CHECK(tensor != nullptr) + << "tensor of var " << sparse_key_names_[table_id][i] << " is null"; // skip slots which do not have embedding Variable* emb_var = @@ -309,9 +309,9 @@ void DownpourWorker::AdjustInsWeight() { float* ins_weights = ins_weight_tensor->data(); size_t len = ins_weight_tensor->numel(); // len = batch size // here we assume nid_show slot only has one feasign in each instance - CHECK(len == nid_show_.size()) << "ins_weight size should be equal to " - << "nid_show size, " << len << " vs " - << nid_show_.size(); + CHECK(len == nid_show_.size()) + << "ins_weight size should be equal to " + << "nid_show size, " << len << " vs " << nid_show_.size(); float nid_adjw_threshold = adjust_ins_weight_config_.nid_adjw_threshold(); float nid_adjw_ratio = adjust_ins_weight_config_.nid_adjw_ratio(); int64_t nid_adjw_num = 0; @@ -326,9 +326,8 @@ void DownpourWorker::AdjustInsWeight() { } float ins_weight = 1.0; if (nid_show >= 0 && nid_show < nid_adjw_threshold) { - ins_weight = log(M_E + - (nid_adjw_threshold - nid_show) / nid_adjw_threshold * - nid_adjw_ratio); + ins_weight = log(M_E + (nid_adjw_threshold - nid_show) / + nid_adjw_threshold * nid_adjw_ratio); // count nid adjw insnum and 
weight ++nid_adjw_num; nid_adjw_weight += ins_weight; @@ -423,15 +422,15 @@ void DownpourWorker::CopyDenseVars() { Variable* src_var = thread_scope_->FindVar(src_var_name); CHECK(src_var != nullptr) << src_var_name << " not found"; // NOLINT LoDTensor* src_tensor = src_var->GetMutable(); - CHECK(src_tensor != nullptr) << src_var_name - << " tensor is null"; // NOLINT + CHECK(src_tensor != nullptr) + << src_var_name << " tensor is null"; // NOLINT float* src_data = src_tensor->data(); Variable* dest_var = thread_scope_->FindVar(dest_var_name); CHECK(dest_var != nullptr) << dest_var_name << " not found"; // NOLINT LoDTensor* dest_tensor = dest_var->GetMutable(); - CHECK(dest_tensor != nullptr) << dest_var_name - << " tensor is null"; // NOLINT + CHECK(dest_tensor != nullptr) + << dest_var_name << " tensor is null"; // NOLINT float* dest_data = dest_tensor->data(); CHECK(src_tensor->numel() == dest_tensor->numel()) diff --git a/paddle/fluid/framework/eigen_test.cc b/paddle/fluid/framework/eigen_test.cc index 43d5f9ea0e8..4e214bd36f3 100644 --- a/paddle/fluid/framework/eigen_test.cc +++ b/paddle/fluid/framework/eigen_test.cc @@ -13,10 +13,11 @@ // limitations under the License. #include "paddle/fluid/framework/eigen.h" -#include "paddle/phi/core/ddim.h" #include +#include "paddle/phi/core/ddim.h" + namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 06ce9712f5c..830bbacb639 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/executor.h" + #include + #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/trainer_desc.pb.h" #include "paddle/fluid/framework/trainer_factory.h" @@ -585,8 +587,9 @@ void Executor::RunPreparedContext( "Program in ExecutorPrepareContext should has feed_ops.")); PADDLE_ENFORCE_EQ( has_fetch_operators(global_block, *fetch_targets, fetch_holder_name), - true, platform::errors::PreconditionNotMet( - "Program in the prepared context should has fetch_ops.")); + true, + platform::errors::PreconditionNotMet( + "Program in the prepared context should has fetch_ops.")); // map the data of feed_targets to feed_holder for (auto* op : global_block.AllOps()) { diff --git a/paddle/fluid/framework/executor_cache.cc b/paddle/fluid/framework/executor_cache.cc index 50a41cb5611..468b3bc680a 100644 --- a/paddle/fluid/framework/executor_cache.cc +++ b/paddle/fluid/framework/executor_cache.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/executor_cache.h" + #include "paddle/fluid/framework/op_info.h" namespace paddle { diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index 06019372a73..c6ccc2adc65 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -13,13 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. 
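The AdjustInsWeight hunk above reflows a real expression, so its meaning is worth keeping in view: instances whose nid_show falls below a threshold get weight log(e + (threshold - nid_show) / threshold * ratio), which decays smoothly from log(e + ratio) toward 1 as nid_show approaches the threshold. A standalone sketch (config values hypothetical; the source writes the constant as M_E, spelled exp(1) here for portability):

    #include <cmath>
    #include <cstdio>

    int main() {
      const double threshold = 1000.0;  // hypothetical nid_adjw_threshold
      const double ratio = 20.0;        // hypothetical nid_adjw_ratio
      const double e = std::exp(1.0);   // the source uses M_E
      for (double nid_show : {0.0, 500.0, 999.0}) {
        double ins_weight =
            std::log(e + (threshold - nid_show) / threshold * ratio);
        std::printf("nid_show=%6.0f -> ins_weight=%.3f\n", nid_show,
                    ins_weight);
      }
      return 0;
    }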
*/ #include "paddle/fluid/framework/executor_thread_worker.h" + #include #include + +#include "gflags/gflags.h" #include "google/protobuf/io/zero_copy_stream_impl.h" #include "google/protobuf/message.h" #include "google/protobuf/text_format.h" - -#include "gflags/gflags.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/feed_fetch_type.h" @@ -616,8 +617,8 @@ void AsyncExecutorThreadWorker::PushSparse(int table_id) { int len = tensor->numel(); CHECK(slot_dim * len == g_tensor->numel()) << "len:" << len << " g_numel:" << g_tensor->numel(); - CHECK(len == tensor->numel()) << "len:" << len - << "t_numel:" << tensor->numel(); + CHECK(len == tensor->numel()) + << "len:" << len << "t_numel:" << tensor->numel(); int64_t* ids = tensor->data(); for (auto id_idx = 0u; id_idx < len; ++id_idx) { if (ids[id_idx] == 0) { @@ -626,15 +627,15 @@ void AsyncExecutorThreadWorker::PushSparse(int table_id) { } memcpy(push_g[fea_idx].data() + offset, g, sizeof(float) * slot_dim); push_g[fea_idx][0] = 1.0f; - CHECK(fea_idx < fea_info.size()) << "fea_idx:" << fea_idx - << " size:" << fea_info.size(); + CHECK(fea_idx < fea_info.size()) + << "fea_idx:" << fea_idx << " size:" << fea_info.size(); push_g[fea_idx][1] = static_cast(fea_info[fea_idx].label); g += slot_dim; fea_idx++; } } - CHECK(fea_idx == features.size()) << "fea_idx:" << fea_idx - << " features size:" << features.size(); + CHECK(fea_idx == features.size()) + << "fea_idx:" << fea_idx << " features size:" << features.size(); CHECK_GT(features.size(), 0); std::vector push_g_vec; @@ -701,5 +702,5 @@ void AsyncExecutorThreadWorker::check_pull_push_memory( } #endif -} // einit_modelnd namespace framework +} // namespace framework } // end namespace paddle diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h index 524922b0322..f4fa54d2c3a 100644 --- a/paddle/fluid/framework/executor_thread_worker.h +++ b/paddle/fluid/framework/executor_thread_worker.h @@ -21,6 +21,7 @@ limitations under the License. */ #include #include // NOLINT #include + #include "paddle/fluid/framework/data_feed.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/program_desc.h" diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index 096134e8528..ec3fdc49fdf 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -14,9 +14,9 @@ limitations under the License. */ #include "paddle/fluid/framework/feed_fetch_method.h" +#include #include -#include #include "glog/logging.h" namespace phi { diff --git a/paddle/fluid/framework/fleet/ascend_wrapper.h b/paddle/fluid/framework/fleet/ascend_wrapper.h index d5586212011..a4bd208959e 100644 --- a/paddle/fluid/framework/fleet/ascend_wrapper.h +++ b/paddle/fluid/framework/fleet/ascend_wrapper.h @@ -22,6 +22,10 @@ limitations under the License. */ #include #include +#include "ge/ge_api.h" +#include "graph/attr_value.h" +#include "graph/tensor.h" +#include "graph/types.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -29,11 +33,6 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/timer.h" -#include "ge/ge_api.h" -#include "graph/attr_value.h" -#include "graph/tensor.h" -#include "graph/types.h" - namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/box_wrapper.cc b/paddle/fluid/framework/fleet/box_wrapper.cc index 8564a421659..1bb432a791e 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.cc +++ b/paddle/fluid/framework/fleet/box_wrapper.cc @@ -14,10 +14,12 @@ #ifdef PADDLE_WITH_BOX_PS #include "paddle/fluid/framework/fleet/box_wrapper.h" + #include #include #include #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" @@ -186,26 +188,30 @@ void BasicAucCalculator::calculate_bucket_error() { void BoxWrapper::FeedPass(int date, const std::vector& feasgin_to_box) const { int ret = boxps_ptr_->FeedPass(date, feasgin_to_box); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "FeedPass failed in BoxPS.")); + PADDLE_ENFORCE_EQ( + ret, 0, + platform::errors::PreconditionNotMet("FeedPass failed in BoxPS.")); } void BoxWrapper::BeginFeedPass(int date, boxps::PSAgentBase** agent) const { int ret = boxps_ptr_->BeginFeedPass(date, *agent); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "BeginFeedPass failed in BoxPS.")); + PADDLE_ENFORCE_EQ( + ret, 0, + platform::errors::PreconditionNotMet("BeginFeedPass failed in BoxPS.")); } void BoxWrapper::EndFeedPass(boxps::PSAgentBase* agent) const { int ret = boxps_ptr_->EndFeedPass(agent); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "EndFeedPass failed in BoxPS.")); + PADDLE_ENFORCE_EQ( + ret, 0, + platform::errors::PreconditionNotMet("EndFeedPass failed in BoxPS.")); } void BoxWrapper::BeginPass() const { int ret = boxps_ptr_->BeginPass(); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "BeginPass failed in BoxPS.")); + PADDLE_ENFORCE_EQ( + ret, 0, + platform::errors::PreconditionNotMet("BeginPass failed in BoxPS.")); } void BoxWrapper::SetTestMode(bool is_test) const { diff --git a/paddle/fluid/framework/fleet/box_wrapper.cu b/paddle/fluid/framework/fleet/box_wrapper.cu index aea479ed0b2..17e59ac9104 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.cu +++ b/paddle/fluid/framework/fleet/box_wrapper.cu @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/framework/fleet/box_wrapper.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" @@ -175,13 +176,13 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place, #define EXPAND_EMBED_PULL_CASE(i, ...) \ case i: { \ constexpr size_t ExpandDim = i; \ - PullCopy<<<(total_length + 512 - 1) / 512, 512, 0, stream>>>( \ - gpu_values, \ - reinterpret_cast*>( \ - total_values_gpu), \ - gpu_len, hidden_size, expand_embed_dim, slot_num, total_length, \ - gpu_keys); \ + PullCopy \ + <<<(total_length + 512 - 1) / 512, 512, 0, stream>>>( \ + gpu_values, \ + reinterpret_cast*>( \ + total_values_gpu), \ + gpu_len, hidden_size, expand_embed_dim, slot_num, total_length, \ + gpu_keys); \ } break #endif diff --git a/paddle/fluid/framework/fleet/box_wrapper.h b/paddle/fluid/framework/fleet/box_wrapper.h index b043edca138..dc01df221e9 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.h +++ b/paddle/fluid/framework/fleet/box_wrapper.h @@ -24,6 +24,7 @@ limitations under the License. 
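The box_wrapper.cu hunk above also shows how newer clang-format splits a CUDA kernel launch: the templated kernel name keeps its own line and the <<<grid, block, shared-mem, stream>>> configuration drops to the continuation line. The grid expression there, (total_length + 512 - 1) / 512, is ordinary ceiling division; a plain-C++ sketch of that rounding (the helper name is hypothetical):

    #include <cstdio>

    // ceil(n / block) without floating point, as in (total_length + 511) / 512.
    unsigned long blocks_for(unsigned long n, unsigned long block = 512) {
      return (n + block - 1) / block;
    }

    int main() {
      std::printf("%lu\n", blocks_for(1000));  // prints 2
      return 0;
    }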
*/ #include #endif #include + #include #include #include @@ -36,6 +37,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/data_feed.h" #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -65,10 +67,12 @@ class BasicAucCalculator { _local_pred = 0; } void add_data(double pred, int label) { - PADDLE_ENFORCE_GE(pred, 0.0, platform::errors::PreconditionNotMet( - "pred should be greater than 0")); - PADDLE_ENFORCE_LE(pred, 1.0, platform::errors::PreconditionNotMet( - "pred should be lower than 1")); + PADDLE_ENFORCE_GE( + pred, 0.0, + platform::errors::PreconditionNotMet("pred should be greater than 0")); + PADDLE_ENFORCE_LE( + pred, 1.0, + platform::errors::PreconditionNotMet("pred should be lower than 1")); PADDLE_ENFORCE_EQ( label * label, label, platform::errors::PreconditionNotMet( @@ -172,13 +176,15 @@ class AfsManager { pwd.c_str(), conf_path.c_str()); VLOG(0) << "AFSAPI Init: user: " << user << ", pwd: " << pwd; int ret = _afshandler->Init(true, (com_logstatus() == 0)); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "Called AFSAPI Init Interface Failed.")); + PADDLE_ENFORCE_EQ(ret, 0, + platform::errors::PreconditionNotMet( + "Called AFSAPI Init Interface Failed.")); // Too high level will hurt the performance comlog_set_log_level(4); ret = _afshandler->Connect(); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "Called AFSAPI Connect Interface Failed")); + PADDLE_ENFORCE_EQ(ret, 0, + platform::errors::PreconditionNotMet( + "Called AFSAPI Connect Interface Failed")); } virtual ~AfsManager() { if (_afshandler != NULL) { @@ -294,8 +300,9 @@ class AfsManager { int ret = PopenBidirectionalInternal(cmd.c_str(), rfp, wfp, pid, true, true); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "Called PopenBidirectionalInternal Failed")); + PADDLE_ENFORCE_EQ(ret, 0, + platform::errors::PreconditionNotMet( + "Called PopenBidirectionalInternal Failed")); std::string filename(path); if (strncmp(filename.c_str(), "afs:", 4) == 0) { filename = filename.substr(4); @@ -451,8 +458,9 @@ class BoxWrapper { std::string ret_str; int ret = boxps_ptr_->SaveBase(batch_model_path, xbox_model_path, ret_str, seconds_from_1970 / 86400); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "SaveBase failed in BoxPS.")); + PADDLE_ENFORCE_EQ( + ret, 0, + platform::errors::PreconditionNotMet("SaveBase failed in BoxPS.")); return ret_str; } @@ -460,8 +468,9 @@ class BoxWrapper { VLOG(3) << "Begin SaveDelta"; std::string ret_str; int ret = boxps_ptr_->SaveDelta(xbox_model_path, ret_str); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "SaveDelta failed in BoxPS.")); + PADDLE_ENFORCE_EQ( + ret, 0, + platform::errors::PreconditionNotMet("SaveDelta failed in BoxPS.")); return ret_str; } diff --git a/paddle/fluid/framework/fleet/box_wrapper_impl.h b/paddle/fluid/framework/fleet/box_wrapper_impl.h index 6f7009f4d51..f6f1cbfc2a0 100644 --- a/paddle/fluid/framework/fleet/box_wrapper_impl.h +++ b/paddle/fluid/framework/fleet/box_wrapper_impl.h @@ -79,8 +79,9 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place, int ret = boxps_ptr_->PullSparseGPU( total_keys, reinterpret_cast(total_values_gpu), static_cast(total_length), device_id); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "PullSparseGPU failed in BoxPS.")); + PADDLE_ENFORCE_EQ( + ret, 0, + platform::errors::PreconditionNotMet("PullSparseGPU failed in 
BoxPS.")); pull_boxps_timer.Pause(); VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length @@ -144,8 +145,9 @@ void BoxWrapper::PushSparseGradCase( int ret = boxps_ptr_->PushSparseGPU( total_keys, reinterpret_cast(total_grad_values_gpu), static_cast(total_length), place.GetDeviceId()); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "PushSparseGPU failed in BoxPS.")); + PADDLE_ENFORCE_EQ( + ret, 0, + platform::errors::PreconditionNotMet("PushSparseGPU failed in BoxPS.")); push_boxps_timer.Pause(); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index deb2b90c933..5c2be1e55f9 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -20,6 +20,7 @@ limitations under the License. */ #include #endif #include + #include #include #include diff --git a/paddle/fluid/framework/fleet/gloo_wrapper.cc b/paddle/fluid/framework/fleet/gloo_wrapper.cc index d850d05d87f..56d0e1ec47e 100644 --- a/paddle/fluid/framework/fleet/gloo_wrapper.cc +++ b/paddle/fluid/framework/fleet/gloo_wrapper.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/fleet/gloo_wrapper.h" + #include "paddle/fluid/framework/io/fs.h" #include "paddle/fluid/string/string_helper.h" diff --git a/paddle/fluid/framework/fleet/gloo_wrapper.h b/paddle/fluid/framework/fleet/gloo_wrapper.h index 42ae73f9b13..1ecaf1318b0 100644 --- a/paddle/fluid/framework/fleet/gloo_wrapper.h +++ b/paddle/fluid/framework/fleet/gloo_wrapper.h @@ -214,8 +214,9 @@ class GlooWrapper { static_cast( &gloo::min)); } else { - PADDLE_ENFORCE_EQ(0, 1, paddle::platform::errors::InvalidArgument( - "AllReduce mode not known: " + mode)); + PADDLE_ENFORCE_EQ(0, 1, + paddle::platform::errors::InvalidArgument( + "AllReduce mode not known: " + mode)); } gloo::allreduce(opts); #else diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h index 823b60c5ef1..560607bd160 100644 --- a/paddle/fluid/framework/fleet/heter_context.h +++ b/paddle/fluid/framework/fleet/heter_context.h @@ -17,6 +17,7 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_HETERPS #include + #include #include #include diff --git a/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h b/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h index 4ad32d1714f..da65cccb435 100644 --- a/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h +++ b/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h @@ -22,6 +22,7 @@ #define CONCURRENT_UNORDERED_MAP_CUH #include + #include #include #include @@ -258,7 +259,7 @@ class cycle_iterator_adapter { return old; } - __host__ __device__ const cycle_iterator_adapter& operator++(int)const { + __host__ __device__ const cycle_iterator_adapter& operator++(int) const { cycle_iterator_adapter old(m_begin, m_end, m_current); if (m_end == (m_current + 1)) m_current = m_begin; diff --git a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h index 19c355c671a..2e7588d0ac4 100644 --- a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h +++ b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/cuda_device_guard.h" @@ -284,6 +285,6 @@ struct NodeQueryResult { }; ~NodeQueryResult() {} }; -} -}; +} // namespace framework +}; // namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h index ae57c2ebe93..5831863f7f5 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h @@ -14,7 +14,9 @@ #pragma once #include + #include + #include "heter_comm.h" #include "paddle/fluid/distributed/ps/table/common_graph_table.h" #include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h" @@ -123,7 +125,7 @@ class GpuPsGraphTable : public HeterComm { std::condition_variable cv_; int cpu_table_status; }; -} -}; +} // namespace framework +}; // namespace paddle //#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h" #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu index 72b9cae41c0..ab33d2a9c05 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu @@ -15,6 +15,7 @@ #include #include #include + #include #pragma once #ifdef PADDLE_WITH_HETERPS @@ -859,11 +860,10 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( constexpr int TILE_SIZE = BLOCK_WARPS * 16; const dim3 block(WARP_SIZE, BLOCK_WARPS); const dim3 grid((shard_len + TILE_SIZE - 1) / TILE_SIZE); - neighbor_sample_example_v2< - WARP_SIZE, BLOCK_WARPS, - TILE_SIZE><<remote_stream(i, gpu_id)>>>( - graph, id_array, actual_size_array, sample_array, sample_size, - shard_len, default_value); + neighbor_sample_example_v2 + <<remote_stream(i, gpu_id)>>>( + graph, id_array, actual_size_array, sample_array, sample_size, + shard_len, default_value); } for (int i = 0; i < total_gpu; ++i) { @@ -946,12 +946,12 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( constexpr int TILE_SIZE_ = BLOCK_WARPS_ * 16; const dim3 block2(WARP_SIZE_, BLOCK_WARPS_); const dim3 grid2((number_on_cpu + TILE_SIZE_ - 1) / TILE_SIZE_); - copy_buffer_ac_to_final_place<<>>( - 
gpu_buffers_ptr, gpu_ac_ptr, val, actual_sample_size, - thrust::raw_pointer_cast(t_index.data()) + 1, - thrust::raw_pointer_cast(cumsum_gpu_ac.data()), number_on_cpu, - sample_size); + copy_buffer_ac_to_final_place + <<>>( + gpu_buffers_ptr, gpu_ac_ptr, val, actual_sample_size, + thrust::raw_pointer_cast(t_index.data()) + 1, + thrust::raw_pointer_cast(cumsum_gpu_ac.data()), number_on_cpu, + sample_size); delete[] merge_buffers; delete[] cpu_keys; @@ -1027,13 +1027,13 @@ NodeQueryResult GpuPsGraphTable::query_node_list(int gpu_id, int start, local_begin_pos = [0,3] sample_size = [2,3] */ - std::function range_check = []( - int x, int y, int x1, int y1, int& x2, int& y2) { - if (y <= x1 || x >= y1) return 0; - y2 = min(y, y1); - x2 = max(x1, x); - return y2 - x2; - }; + std::function range_check = + [](int x, int y, int x1, int y1, int& x2, int& y2) { + if (y <= x1 || x >= y1) return 0; + y2 = min(y, y1); + x2 = max(x1, x); + return y2 - x2; + }; auto graph = gpu_graph_list[gpu_id]; if (graph.node_size == 0) { return result; @@ -1106,6 +1106,6 @@ NodeQueryResult GpuPsGraphTable::query_node_list(int gpu_id, int start, return result; */ } -} -}; +} // namespace framework +}; // namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu index c976bb67cb2..43f0101009d 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu @@ -271,5 +271,5 @@ void GraphGpuWrapper::export_partition_files(int idx, std::string file_path) { ->cpu_graph_table->export_partition_files(idx, file_path); } #endif -} -}; +} // namespace framework +}; // namespace paddle diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h index a34e752fc7e..d3c4dea5890 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h @@ -16,6 +16,7 @@ #include #include #include + #include "paddle/fluid/distributed/ps/table/common_graph_table.h" #include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h" namespace paddle { @@ -73,5 +74,5 @@ class GraphGpuWrapper { void* graph_table; }; #endif -} -}; +} // namespace framework +}; // namespace paddle diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_sampler.h b/paddle/fluid/framework/fleet/heter_ps/graph_sampler.h index a7c043f1edf..7cec4fcfb83 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_sampler.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_sampler.h @@ -14,6 +14,7 @@ #pragma once #include + #include #include #include @@ -23,6 +24,7 @@ #include #include #include + #include "paddle/fluid/distributed/ps/table/common_graph_table.h" #include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h" #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" @@ -106,7 +108,7 @@ class AllInGpuGraphSampler : public GraphSampler { // std::shared_ptr random; int gpu_num; }; -} -}; +} // namespace framework +}; // namespace paddle #include "paddle/fluid/framework/fleet/heter_ps/graph_sampler_inl.h" #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_sampler_inl.h b/paddle/fluid/framework/fleet/heter_ps/graph_sampler_inl.h index ad4b00b11aa..e68612d57e2 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_sampler_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_sampler_inl.h @@ -156,6 +156,6 @@ void 
AllInGpuGraphSampler::init(GpuPsGraphTable *g, this->gpu_num = g->gpu_num; graph_table = g->cpu_graph_table.get(); } -} -}; +} // namespace framework +}; // namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h index 234aa15ebf7..112a59c8fec 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #ifdef PADDLE_WITH_HETERPS #include + #include #include #include @@ -36,6 +37,7 @@ limitations under the License. */ #include "thrust/pair.h" #elif defined(__xpu__) #include + #include "xpu/kernel/cluster_header.h" #include "xpu/kernel/math.h" #include "xpu/kernel/simd.h" diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu index 57741c2c19b..c2e6cdc5c69 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu @@ -14,6 +14,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_HETERPS #include + #include "paddle/fluid/framework/fleet/heter_ps/hashtable.h" #include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" @@ -366,10 +367,10 @@ template class HashTable; template class HashTable; template class HashTable; -template void HashTable::get< - cudaStream_t>(const unsigned long* d_keys, - paddle::framework::FeatureValue* d_vals, size_t len, - cudaStream_t stream); +template void +HashTable::get( + const unsigned long* d_keys, paddle::framework::FeatureValue* d_vals, + size_t len, cudaStream_t stream); template void HashTable::get( @@ -395,10 +396,10 @@ template void HashTable::get( // const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t // stream); -template void HashTable::insert< - cudaStream_t>(const unsigned long* d_keys, - const paddle::framework::FeatureValue* d_vals, size_t len, - cudaStream_t stream); +template void +HashTable::insert( + const unsigned long* d_keys, const paddle::framework::FeatureValue* d_vals, + size_t len, cudaStream_t stream); template void HashTable:: insert(const unsigned long* d_keys, size_t len, char* pool, @@ -438,21 +439,22 @@ template void HashTable::update< paddle::framework::FeaturePushValue>, cudaStream_t>(const unsigned long* d_keys, const paddle::framework::FeaturePushValue* d_grads, - size_t len, Optimizer - sgd, - cudaStream_t stream); - -template void -HashTable::update< - Optimizer, - cudaStream_t>(const unsigned long* d_keys, const char* d_grads, size_t len, + size_t len, Optimizer sgd, cudaStream_t stream); +template void HashTable:: + update, + cudaStream_t>(const unsigned long* d_keys, const char* d_grads, + size_t len, + Optimizer + sgd, + cudaStream_t stream); + // template void HashTable::update< // Optimizer #include + #include "cub/cub.cuh" #include "cub/util_allocator.cuh" #if defined(PADDLE_WITH_CUDA) @@ -26,6 +27,7 @@ limitations under the License. */ #elif defined(PADDLE_WITH_XPU_KP) // #include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h" #include + #include "paddle/fluid/platform/device/xpu/enforce_xpu.h" #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 64b177abb86..38a4e7b7bb1 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -14,6 +14,7 @@ limitations under the License. 
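The range_check lambda reflowed in the query_node_list hunk above is genuine logic, not just layout: it clamps the interval [x, y) against [x1, y1), reports the clamped bounds through x2 and y2, and returns the overlap length, leaving the outputs untouched when the intervals are disjoint. The same behavior as a free function:

    #include <algorithm>
    #include <cstdio>

    int range_check(int x, int y, int x1, int y1, int& x2, int& y2) {
      if (y <= x1 || x >= y1) return 0;  // disjoint: outputs left untouched
      y2 = std::min(y, y1);
      x2 = std::max(x1, x);
      return y2 - x2;
    }

    int main() {
      int x2 = 0, y2 = 0;
      int len = range_check(0, 5, 3, 9, x2, y2);  // overlap is [3, 5)
      std::printf("[%d, %d) len=%d\n", x2, y2, len);
      return 0;
    }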
*/ #pragma once #ifdef PADDLE_WITH_HETERPS #include + #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu index 94d7929b294..a5ee8e2ff83 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu @@ -294,10 +294,10 @@ template void HeterCommKernel::fill_idx( template void HeterCommKernel::calc_shard_offset( int* idx, int* left, int* right, long long len, int total_devs, const cudaStream_t& stream); -template void HeterCommKernel::calc_shard_index< - unsigned long, int, cudaStream_t>(unsigned long* d_keys, long long len, - int* shard_index, int total_devs, - const cudaStream_t& stream); +template void +HeterCommKernel::calc_shard_index( + unsigned long* d_keys, long long len, int* shard_index, int total_devs, + const cudaStream_t& stream); template void HeterCommKernel::calc_shard_index( long* d_keys, long long len, int* shard_index, int total_devs, diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc index 700b43f18fb..fe8e8c86505 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/fleet/heter_ps/heter_ps.h" + #include #ifdef PADDLE_WITH_HETERPS diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu index 43b84ee5d26..cfe46626294 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/fleet/heter_ps/heter_ps.h" #ifdef PADDLE_WITH_HETERPS diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h index 8449a4048b7..83dc232bc6a 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h" #if defined(PADDLE_WITH_CUDA) diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h index 2c312e9d4d6..fe44c81fe44 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" #include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h" diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h index 5717f44d400..087877818f5 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h @@ -24,6 +24,7 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU_KP #include // NOLINT + #include "paddle/fluid/platform/device/xpu/xpu_info.h" #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h index 4684b4a0bc1..82090ef4817 100644 --- a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h +++ b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #endif #include + #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h" diff --git a/paddle/fluid/framework/fleet/heter_ps/test_comm.cu b/paddle/fluid/framework/fleet/heter_ps/test_comm.cu index 3a6ed50ad8e..72fa0282066 100644 --- a/paddle/fluid/framework/fleet/heter_ps/test_comm.cu +++ b/paddle/fluid/framework/fleet/heter_ps/test_comm.cu @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include + #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" diff --git a/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu b/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu index 62a0df94300..621c7f5bab4 100644 --- a/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu +++ b/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu @@ -13,8 +13,10 @@ // limitations under the License. #include + #include #include + #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" diff --git a/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu b/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu index ff3cd9d2d04..49e9a051ec0 100644 --- a/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu +++ b/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu @@ -13,8 +13,10 @@ // limitations under the License. #include + #include #include + #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h" diff --git a/paddle/fluid/framework/fleet/heter_ps/test_graph.cu b/paddle/fluid/framework/fleet/heter_ps/test_graph.cu index 06c7026eb51..28098181b6c 100644 --- a/paddle/fluid/framework/fleet/heter_ps/test_graph.cu +++ b/paddle/fluid/framework/fleet/heter_ps/test_graph.cu @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include + #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" diff --git a/paddle/fluid/framework/fleet/heter_ps/test_sample_rate.cu b/paddle/fluid/framework/fleet/heter_ps/test_sample_rate.cu index affa60d022e..a1e8f06368b 100644 --- a/paddle/fluid/framework/fleet/heter_ps/test_sample_rate.cu +++ b/paddle/fluid/framework/fleet/heter_ps/test_sample_rate.cu @@ -13,6 +13,8 @@ // limitations under the License. 
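The hashtable_kernel.cu and heter_comm_kernel.cu hunks above rewrap explicit instantiations of member function templates: rather than breaking inside the template argument list, clang-format now keeps "template void" with the qualified name and wraps at the parameter list. A compilable toy of the same shape (all names are stand-ins, not Paddle's types):

    #include <cstddef>

    template <typename KeyType, typename ValType>
    struct ToyTable {
      template <typename StreamType>
      void get(const KeyType* keys, ValType* vals, std::size_t len,
               StreamType stream);
    };

    template <typename KeyType, typename ValType>
    template <typename StreamType>
    void ToyTable<KeyType, ValType>::get(const KeyType* keys, ValType* vals,
                                         std::size_t len, StreamType stream) {
      for (std::size_t i = 0; i < len; ++i)
        vals[i] = static_cast<ValType>(keys[i]);
      (void)stream;
    }

    // Wrapped as in the hunks above: name group first, parameter list next.
    template void ToyTable<unsigned long, float>::get<int>(
        const unsigned long* keys, float* vals, std::size_t len, int stream);

    int main() { return 0; }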
#include + +#include #include // NOLINT #include #include @@ -20,32 +22,30 @@ #include // NOLINT #include #include -#include "google/protobuf/text_format.h" -#include +#include "google/protobuf/text_format.h" #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/service/env.h" #include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" #include "paddle/fluid/distributed/ps/table/common_graph_table.h" #include "paddle/fluid/distributed/ps/table/graph/graph_node.h" +#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_sampler.h" +#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" +#include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" +#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" -#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" -#include "paddle/fluid/framework/fleet/heter_ps/graph_sampler.h" -#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" -#include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" -#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" -#include "paddle/fluid/platform/cuda_device_guard.h" - using namespace paddle::framework; namespace platform = paddle::platform; namespace operators = paddle::operators; diff --git a/paddle/fluid/framework/fleet/metrics.cc b/paddle/fluid/framework/fleet/metrics.cc index 56bc568460b..42252816405 100644 --- a/paddle/fluid/framework/fleet/metrics.cc +++ b/paddle/fluid/framework/fleet/metrics.cc @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/framework/lod_tensor.h" #if defined(PADDLE_WITH_PSLIB) || defined(PADDLE_WITH_PSCORE) @@ -63,10 +64,12 @@ void BasicAucCalculator::add_data(const float* d_pred, const int64_t* d_label, } void BasicAucCalculator::add_unlock_data(double pred, int label) { - PADDLE_ENFORCE_GE(pred, 0.0, platform::errors::PreconditionNotMet( - "pred should be greater than 0")); - PADDLE_ENFORCE_LE(pred, 1.0, platform::errors::PreconditionNotMet( - "pred should be lower than 1")); + PADDLE_ENFORCE_GE( + pred, 0.0, + platform::errors::PreconditionNotMet("pred should be greater than 0")); + PADDLE_ENFORCE_LE( + pred, 1.0, + platform::errors::PreconditionNotMet("pred should be lower than 1")); PADDLE_ENFORCE_EQ( label * label, label, platform::errors::PreconditionNotMet( @@ -272,10 +275,12 @@ void BasicAucCalculator::add_uid_data(const float* d_pred, void BasicAucCalculator::add_uid_unlock_data(double pred, int label, uint64_t uid) { - PADDLE_ENFORCE_GE(pred, 0.0, platform::errors::PreconditionNotMet( - "pred should be greater than 0")); - PADDLE_ENFORCE_LE(pred, 1.0, platform::errors::PreconditionNotMet( - "pred should be lower than 1")); + PADDLE_ENFORCE_GE( + pred, 0.0, + platform::errors::PreconditionNotMet("pred should be greater than 0")); + PADDLE_ENFORCE_LE( + pred, 1.0, + platform::errors::PreconditionNotMet("pred should be lower than 1")); 
PADDLE_ENFORCE_EQ( label * label, label, platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/framework/fleet/metrics.h b/paddle/fluid/framework/fleet/metrics.h index 69b242664bb..7c3ea1b5512 100644 --- a/paddle/fluid/framework/fleet/metrics.h +++ b/paddle/fluid/framework/fleet/metrics.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include #include #include @@ -35,6 +36,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_GLOO) #include + #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index 488a9ef8ce7..fbe76696114 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h" #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/framework/lod_tensor.h" diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index 0efec57e59d..7ddc5a1f6dd 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -27,6 +27,7 @@ limitations under the License. */ #include #ifdef PADDLE_WITH_GLOO #include + #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif diff --git a/paddle/fluid/framework/fleet/test_fleet.cc b/paddle/fluid/framework/fleet/test_fleet.cc index 24f3e6bed64..34aea9de3b1 100644 --- a/paddle/fluid/framework/fleet/test_fleet.cc +++ b/paddle/fluid/framework/fleet/test_fleet.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "paddle/fluid/framework/fleet/fleet_wrapper.h" #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #include "paddle/fluid/string/string_helper.h" diff --git a/paddle/fluid/framework/generator.cc b/paddle/fluid/framework/generator.cc index b621eca35b8..e3b9fe3626d 100644 --- a/paddle/fluid/framework/generator.cc +++ b/paddle/fluid/framework/generator.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/generator.h" #include + #include #include diff --git a/paddle/fluid/framework/generator.h b/paddle/fluid/framework/generator.h index 35efc1bee33..f62e8f74d26 100644 --- a/paddle/fluid/framework/generator.h +++ b/paddle/fluid/framework/generator.h @@ -16,6 +16,7 @@ limitations under the License. 
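The metrics.cc hunks reflow a pair of range checks whose intent is easy to miss behind the formatting: pred must be a probability in [0, 1], and label must be 0 or 1, which the source encodes compactly as label * label == label (an equality that holds for exactly those two integers). A sketch of the same validation with plain asserts (the function name is hypothetical):

    #include <cassert>

    void check_auc_sample(double pred, int label) {
      assert(pred >= 0.0 && pred <= 1.0);  // pred must lie in [0, 1]
      assert(label * label == label);      // holds only for label in {0, 1}
    }

    int main() {
      check_auc_sample(0.25, 1);
      return 0;
    }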
*/ #include #include + #include #include #include // temp for debug diff --git a/paddle/fluid/framework/gpu_utils.h b/paddle/fluid/framework/gpu_utils.h index 37c9852a1ab..9c59333000e 100644 --- a/paddle/fluid/framework/gpu_utils.h +++ b/paddle/fluid/framework/gpu_utils.h @@ -17,6 +17,7 @@ #define EIGEN_USE_GPU #include + #include "paddle/fluid/platform/enforce.h" #include "unsupported/Eigen/CXX11/Tensor" @@ -104,15 +105,17 @@ ConvertTensorIndex(int index, const Dim3& dims) { template IntType CeilOrFloor(IntType x, IntType deviser) { - PADDLE_ENFORCE_GT(deviser, 0, platform::errors::InvalidArgument( - "deviser should be greater than 0, " - "but received is:%d", - deviser)); + PADDLE_ENFORCE_GT( + deviser, 0, + platform::errors::InvalidArgument("deviser should be greater than 0, " + "but received is:%d", + deviser)); PADDLE_ENFORCE_GT( - x, 0, platform::errors::InvalidArgument("input should be greater than 0, " - "but received is:%d", - x)); + x, 0, + platform::errors::InvalidArgument("input should be greater than 0, " + "but received is:%d", + x)); const IntType round_to_zero = x / deviser; const IntType inte_result = round_to_zero * deviser; diff --git a/paddle/fluid/framework/grad_op_desc_maker.h b/paddle/fluid/framework/grad_op_desc_maker.h index ebbfd446a03..81f17be867f 100644 --- a/paddle/fluid/framework/grad_op_desc_maker.h +++ b/paddle/fluid/framework/grad_op_desc_maker.h @@ -20,6 +20,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_call_stack.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/operator.h" @@ -157,8 +158,9 @@ class GradOpDescMakerBase { const Attribute& GetAttr(const std::string& name) const { auto& map = fwd_op_.GetAttrMap(); auto it = map.find(name); - PADDLE_ENFORCE_NE(it, map.end(), platform::errors::NotFound( - "Cannot find attribute (%s).", name)); + PADDLE_ENFORCE_NE( + it, map.end(), + platform::errors::NotFound("Cannot find attribute (%s).", name)); return it->second; } diff --git a/paddle/fluid/framework/heter_service.h b/paddle/fluid/framework/heter_service.h index 9d0e3c50953..6b115d33d2f 100644 --- a/paddle/fluid/framework/heter_service.h +++ b/paddle/fluid/framework/heter_service.h @@ -22,6 +22,7 @@ limitations under the License. 
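The gpu_utils.h hunk reflows the precondition checks of CeilOrFloor, a helper that rounds x / deviser up or down according to a template flag ("deviser" is the source's own spelling). The hunk ends just after the intermediate quotient, so the selector below is an assumed reconstruction, not quoted from the file:

    #include <cstdio>

    template <typename IntType, bool ceil>
    IntType CeilOrFloor(IntType x, IntType deviser) {
      // The hunk's two checks: both x and deviser must be positive.
      if (deviser <= 0 || x <= 0) return IntType(0);  // stand-in for enforces
      const IntType round_to_zero = x / deviser;
      const IntType inte_result = round_to_zero * deviser;
      // Assumed tail (not shown in the hunk): round up only on a remainder.
      return ceil ? (inte_result == x ? round_to_zero : round_to_zero + 1)
                  : round_to_zero;
    }

    int main() {
      std::printf("%d %d\n", CeilOrFloor<int, true>(10, 3),
                  CeilOrFloor<int, false>(10, 3));  // prints "4 3"
      return 0;
    }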
*/ #include // NOLINT #include // NOLINT #include + #include "paddle/fluid/framework/heter_service.pb.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" diff --git a/paddle/fluid/framework/hetercpu_worker.cc b/paddle/fluid/framework/hetercpu_worker.cc index 75cc18887da..85e44ec44c6 100644 --- a/paddle/fluid/framework/hetercpu_worker.cc +++ b/paddle/fluid/framework/hetercpu_worker.cc @@ -311,8 +311,8 @@ void HeterCpuWorker::CollectLabelInfo(std::shared_ptr task, continue; } LoDTensor* tensor = fea_var->GetMutable(); - CHECK(tensor != nullptr) << "tensor of var " - << sparse_key_names_[table_id][i] << " is null"; + CHECK(tensor != nullptr) + << "tensor of var " << sparse_key_names_[table_id][i] << " is null"; // skip slots which do not have embedding Variable* emb_var = scope->FindVar(sparse_value_names_[table_id][i]); @@ -465,9 +465,9 @@ void HeterCpuWorker::AdjustInsWeight(std::shared_ptr task) { float* ins_weights = ins_weight_tensor->data(); size_t len = ins_weight_tensor->numel(); // len = batch size // here we assume nid_show slot only has one feasign in each instance - CHECK(len == nid_show_.size()) << "ins_weight size should be equal to " - << "nid_show size, " << len << " vs " - << nid_show_.size(); + CHECK(len == nid_show_.size()) + << "ins_weight size should be equal to " + << "nid_show size, " << len << " vs " << nid_show_.size(); float nid_adjw_threshold = adjust_ins_weight_config_.nid_adjw_threshold(); float nid_adjw_ratio = adjust_ins_weight_config_.nid_adjw_ratio(); int64_t nid_adjw_num = 0; @@ -482,9 +482,8 @@ void HeterCpuWorker::AdjustInsWeight(std::shared_ptr task) { } float ins_weight = 1.0; if (nid_show >= 0 && nid_show < nid_adjw_threshold) { - ins_weight = log(M_E + - (nid_adjw_threshold - nid_show) / nid_adjw_threshold * - nid_adjw_ratio); + ins_weight = log(M_E + (nid_adjw_threshold - nid_show) / + nid_adjw_threshold * nid_adjw_ratio); // count nid adjw insnum and weight ++nid_adjw_num; nid_adjw_weight += ins_weight; @@ -579,15 +578,15 @@ void HeterCpuWorker::CopyDenseVars() { Variable* src_var = thread_scope_->FindVar(src_var_name); CHECK(src_var != nullptr) << src_var_name << " not found"; // NOLINT LoDTensor* src_tensor = src_var->GetMutable(); - CHECK(src_tensor != nullptr) << src_var_name - << " tensor is null"; // NOLINT + CHECK(src_tensor != nullptr) + << src_var_name << " tensor is null"; // NOLINT float* src_data = src_tensor->data(); Variable* dest_var = thread_scope_->FindVar(dest_var_name); CHECK(dest_var != nullptr) << dest_var_name << " not found"; // NOLINT LoDTensor* dest_tensor = dest_var->GetMutable(); - CHECK(dest_tensor != nullptr) << dest_var_name - << " tensor is null"; // NOLINT + CHECK(dest_tensor != nullptr) + << dest_var_name << " tensor is null"; // NOLINT float* dest_data = dest_tensor->data(); CHECK(src_tensor->numel() == dest_tensor->numel()) diff --git a/paddle/fluid/framework/heterxpu_trainer.cc b/paddle/fluid/framework/heterxpu_trainer.cc index a4af56419a7..81c1a684959 100644 --- a/paddle/fluid/framework/heterxpu_trainer.cc +++ b/paddle/fluid/framework/heterxpu_trainer.cc @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include #include #include + #include "io/fs.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/data_feed_factory.h" diff --git a/paddle/fluid/framework/infershape_utils_test.cc b/paddle/fluid/framework/infershape_utils_test.cc index 2eeefb19a1a..805f992cf3e 100644 --- a/paddle/fluid/framework/infershape_utils_test.cc +++ b/paddle/fluid/framework/infershape_utils_test.cc @@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/infershape_utils.h" + #include #include #include "gtest/gtest.h" - #include "paddle/fluid/framework/attribute.h" -#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/framework/inplace_op_inference.h b/paddle/fluid/framework/inplace_op_inference.h index c46a77f0b35..93bbec251fe 100644 --- a/paddle/fluid/framework/inplace_op_inference.h +++ b/paddle/fluid/framework/inplace_op_inference.h @@ -15,6 +15,7 @@ #pragma once #include #include + #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/type_defs.h" diff --git a/paddle/fluid/framework/io/crypto/aes_cipher_test.cc b/paddle/fluid/framework/io/crypto/aes_cipher_test.cc index 7f923f597b6..67c758b012a 100644 --- a/paddle/fluid/framework/io/crypto/aes_cipher_test.cc +++ b/paddle/fluid/framework/io/crypto/aes_cipher_test.cc @@ -13,11 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/io/crypto/aes_cipher.h" + #include #include #include + #include #include + #include "paddle/fluid/framework/io/crypto/cipher_utils.h" namespace paddle { diff --git a/paddle/fluid/framework/io/crypto/cipher.cc b/paddle/fluid/framework/io/crypto/cipher.cc index eca175c020c..2001e8a416a 100644 --- a/paddle/fluid/framework/io/crypto/cipher.cc +++ b/paddle/fluid/framework/io/crypto/cipher.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/io/crypto/cipher.h" + #include "paddle/fluid/framework/io/crypto/aes_cipher.h" #include "paddle/fluid/framework/io/crypto/cipher_utils.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/io/crypto/cipher_utils.cc b/paddle/fluid/framework/io/crypto/cipher_utils.cc index ee9f06b2f3e..b622138f781 100644 --- a/paddle/fluid/framework/io/crypto/cipher_utils.cc +++ b/paddle/fluid/framework/io/crypto/cipher_utils.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/io/crypto/cipher_utils.h" #include + #include #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/io/crypto/cipher_utils_test.cc b/paddle/fluid/framework/io/crypto/cipher_utils_test.cc index 928e2ced9b1..356c919cbcb 100644 --- a/paddle/fluid/framework/io/crypto/cipher_utils_test.cc +++ b/paddle/fluid/framework/io/crypto/cipher_utils_test.cc @@ -12,12 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/framework/io/crypto/cipher_utils.h" + #include + #include #include -#include "paddle/fluid/framework/io/crypto/cipher_utils.h" - namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/io/fs.cc b/paddle/fluid/framework/io/fs.cc index b8aca886e7d..fd602895aae 100644 --- a/paddle/fluid/framework/io/fs.cc +++ b/paddle/fluid/framework/io/fs.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/io/fs.h" #include + #include #include "glog/logging.h" diff --git a/paddle/fluid/framework/io/fs.h b/paddle/fluid/framework/io/fs.h index 1ebe80e943a..088d4d97424 100644 --- a/paddle/fluid/framework/io/fs.h +++ b/paddle/fluid/framework/io/fs.h @@ -16,6 +16,7 @@ #include #include + #include #include #include diff --git a/paddle/fluid/framework/io/test_fs.cc b/paddle/fluid/framework/io/test_fs.cc index 49dee603200..adb6141fd56 100644 --- a/paddle/fluid/framework/io/test_fs.cc +++ b/paddle/fluid/framework/io/test_fs.cc @@ -13,7 +13,9 @@ // limitations under the License. #include + #include + #include "paddle/fluid/framework/io/fs.h" #if defined _WIN32 || defined __APPLE__ diff --git a/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass_tester.cc b/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass_tester.cc index 8870b68fbc5..e0ce58121a1 100644 --- a/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass_tester.cc +++ b/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass_tester.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.h" - #include + +#include "paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/ir/add_support_int8_pass.cc b/paddle/fluid/framework/ir/add_support_int8_pass.cc index 3a3f5c3741f..d38853bb964 100644 --- a/paddle/fluid/framework/ir/add_support_int8_pass.cc +++ b/paddle/fluid/framework/ir/add_support_int8_pass.cc @@ -68,9 +68,8 @@ void AddSupportInt8Pass::ApplyImpl(ir::Graph* graph) const { i++) { if (quanted_op_desc->Output(quanted_op_desc->OutputNames()[i]) .size() > 0 && - input_name == - quanted_op_desc->Output( - quanted_op_desc->OutputNames()[i])[0]) { + input_name == quanted_op_desc->Output( + quanted_op_desc->OutputNames()[i])[0]) { outscale_flag = true; quanted_op_desc->SetAttr( quanted_op_desc->OutputNames()[i], diff --git a/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc b/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc index 08e7c6f5b86..910cb5801db 100644 --- a/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc +++ b/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc @@ -13,8 +13,10 @@ // limitations under the License. 
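Most of the remaining hunks are pure include reshuffles driven by one policy: the file's own header is hoisted to the top as its own block, then system and third-party headers, then other project headers, with blank lines between blocks so clang-format sorts each group independently. A sketch of the resulting layout (paths hypothetical):

    // foo_pass_tester.cc, after the reorder:
    //
    //   #include "paddle/fluid/framework/ir/foo_pass.h"  // own header first
    //
    //   #include <gtest/gtest.h>                         // system/third-party
    //
    //   #include "paddle/fluid/framework/ir/pass_tester_helper.h"  // project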
#include "paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h" + #include #include + #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h" diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc index ae843aad7d3..710f8ef1b37 100644 --- a/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/conv_bn_fuse_pass.h" - #include + +#include "paddle/fluid/framework/ir/conv_bn_fuse_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/cost_model.cc b/paddle/fluid/framework/ir/cost_model.cc index 6086409ffd9..05c7834c9ca 100644 --- a/paddle/fluid/framework/ir/cost_model.cc +++ b/paddle/fluid/framework/ir/cost_model.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/ir/cost_model.h" #include + #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/errors.h" diff --git a/paddle/fluid/framework/ir/cost_model_test.cc b/paddle/fluid/framework/ir/cost_model_test.cc index 57f3904d845..f5eaa2f0338 100644 --- a/paddle/fluid/framework/ir/cost_model_test.cc +++ b/paddle/fluid/framework/ir/cost_model_test.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/cost_model.h" + #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/framework/ir/cudnn_placement_pass_tester.cc b/paddle/fluid/framework/ir/cudnn_placement_pass_tester.cc index 2d270f444ad..2711ddf92d7 100644 --- a/paddle/fluid/framework/ir/cudnn_placement_pass_tester.cc +++ b/paddle/fluid/framework/ir/cudnn_placement_pass_tester.cc @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/cudnn_placement_pass.h" - #include + +#include "paddle/fluid/framework/ir/cudnn_placement_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/framework/ir/delete_dropout_op_pass.cc b/paddle/fluid/framework/ir/delete_dropout_op_pass.cc index 9473cc06928..5043beef824 100644 --- a/paddle/fluid/framework/ir/delete_dropout_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_dropout_op_pass.cc @@ -11,10 +11,10 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#include - #include "paddle/fluid/framework/ir/delete_dropout_op_pass.h" +#include + namespace phi { class DenseTensor; } // namespace phi diff --git a/paddle/fluid/framework/ir/delete_fill_constant_op_pass.cc b/paddle/fluid/framework/ir/delete_fill_constant_op_pass.cc index 79a06572d14..e4b6e43e5c3 100644 --- a/paddle/fluid/framework/ir/delete_fill_constant_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_fill_constant_op_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/framework/ir/delete_fill_constant_op_pass.h" + #include "paddle/fluid/framework/ir/graph_pattern_detector.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc index 2fc133edb7a..a02efc0a7ce 100644 --- a/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc @@ -102,9 +102,10 @@ void DeleteQuantDequantFilterOpPass::ApplyImpl(ir::Graph* graph) const { break; } } - PADDLE_ENFORCE_GT(arg_name.size(), 0, platform::errors::InvalidArgument( - "can not find the input %s.", - quant_dequant_op_out_name)); + PADDLE_ENFORCE_GT( + arg_name.size(), 0, + platform::errors::InvalidArgument("can not find the input %s.", + quant_dequant_op_out_name)); // any_op2_desc->SetAttr("enable_int8", true); any_op2_desc->SetAttr("bit_length", bit_length); diff --git a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass_tester.cc b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass_tester.cc index 727e42629f9..8deaf10d200 100644 --- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass_tester.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.h" - #include + +#include "paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc index 482e38355c5..a34e0a5d1de 100644 --- a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h" #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass_tester.cc index 46a9b2eae35..be22ee9b2fe 100644 --- a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass_tester.cc @@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.h" - #include +#include "paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index 1e25b21483b..1802616c0df 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/framework/ir/fc_fuse_pass.h" + #include #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc index 39b544e7160..e40759cd3fb 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/fc_fuse_pass.h" - #include + +#include "paddle/fluid/framework/ir/fc_fuse_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass_tester.h b/paddle/fluid/framework/ir/fc_gru_fuse_pass_tester.h index df3fbc293b7..9ad3c28f09a 100644 --- a/paddle/fluid/framework/ir/fc_gru_fuse_pass_tester.h +++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass_tester.h @@ -13,9 +13,9 @@ // limitations under the License. #pragma once -#include "paddle/fluid/framework/ir/fc_gru_fuse_pass.h" - #include + +#include "paddle/fluid/framework/ir/fc_gru_fuse_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc index b99e607f92b..5b4bb98ff53 100644 --- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h" + #include #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass_tester.h b/paddle/fluid/framework/ir/fc_lstm_fuse_pass_tester.h index a313e49f0b2..3e47f079573 100644 --- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass_tester.h +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass_tester.h @@ -14,9 +14,9 @@ #pragma once -#include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h" - #include + +#include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.h b/paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.h index ab66fb4a46a..632bb237fa2 100644 --- a/paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.h +++ b/paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.h @@ -14,6 +14,7 @@ #pragma once #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" diff --git a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc index f12273e94dd..6a2a0867048 100644 --- a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/framework/ir/fuse_bn_act_pass.h" + #include + #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc b/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc index 005f006ab04..ff4850838c5 100644 --- a/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc @@ -13,7 +13,9 @@ // limitations under the License. 
#include "paddle/fluid/framework/ir/fuse_bn_add_act_pass.h" + #include + #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc index 62f65baf336..3feea822bc1 100644 --- a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h" + #include + #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc index b72a63d3785..1c6b856d987 100644 --- a/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc +++ b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc @@ -14,7 +14,9 @@ // limitations under the License. #include "paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h" + #include + #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc index 0094b674c2a..9629b9209c4 100644 --- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. #include + #include #include "glog/logging.h" diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc index f87d31cbc40..e290bdf99ce 100644 --- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc @@ -67,8 +67,9 @@ class FuseMomentumOpPass : public FuseOptimizerOpPass { platform::errors::InvalidArgument( "All momentum Op's attr(use_nesterov) must be same, but there " "are two different value: %d, %d.", - use_nesterov, BOOST_GET_CONST(bool, momentum_op->Op()->GetAttr( - "use_nesterov")))); + use_nesterov, + BOOST_GET_CONST(bool, + momentum_op->Op()->GetAttr("use_nesterov")))); PADDLE_ENFORCE_EQ( op_role, BOOST_GET_CONST(int, momentum_op->Op()->GetAttr( diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc index 40e1de8a523..e3e5221531e 100644 --- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h" + #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/operator.h" #include "paddle/phi/core/kernel_factory.h" diff --git a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc index 56ca98b5660..bcfa69ac2e7 100644 --- a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc +++ b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc @@ -13,10 +13,12 @@ // limitations under the License. #include "paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h" + #include #include #include #include + #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator.cc b/paddle/fluid/framework/ir/fusion_group/code_generator.cc index 5b125030a7a..a8a09d69023 100644 --- a/paddle/fluid/framework/ir/fusion_group/code_generator.cc +++ b/paddle/fluid/framework/ir/fusion_group/code_generator.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/fusion_group/code_generator.h" + #include "paddle/fluid/framework/ir/fusion_group/code_generator_helper.h" #include "paddle/fluid/framework/ir/fusion_group/cuda_resources.h" diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc b/paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc index 18bd6d623b7..650ed965067 100644 --- a/paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc +++ b/paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/ir/fusion_group/operation.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc index 7b6bbf02510..a24a9af158e 100644 --- a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc +++ b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include #include diff --git a/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.cc b/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.cc index 6fa3044affc..5be4091ca8b 100644 --- a/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.cc +++ b/paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.h" + #include #include "paddle/fluid/framework/ir/fusion_group/operation.h" diff --git a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc index 85d34405c5e..44df3a837f6 100644 --- a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc +++ b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/framework/ir/fusion_group/fusion_group_pass.h" + #include "paddle/fluid/framework/ir/fusion_group/code_generator.h" #include "paddle/fluid/framework/ir/fusion_group/elementwise_group_detector.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" diff --git a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass_tester.cc b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass_tester.cc index db22c03a7d9..402fad0e84c 100644 --- a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass_tester.cc +++ b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass_tester.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/ir/fusion_group/fusion_group_pass.h" - #include + +#include "paddle/fluid/framework/ir/fusion_group/fusion_group_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/fusion_group/operation.cc b/paddle/fluid/framework/ir/fusion_group/operation.cc index 2b7a3e1899c..7d1b7bafa13 100644 --- a/paddle/fluid/framework/ir/fusion_group/operation.cc +++ b/paddle/fluid/framework/ir/fusion_group/operation.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/fusion_group/operation.h" + #include "paddle/fluid/framework/operator.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/fusion_group/subgraph.h b/paddle/fluid/framework/ir/fusion_group/subgraph.h index 5a29e875aea..1c334e70f1c 100644 --- a/paddle/fluid/framework/ir/fusion_group/subgraph.h +++ b/paddle/fluid/framework/ir/fusion_group/subgraph.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/ir/fusion_group/operation.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_traits.h" diff --git a/paddle/fluid/framework/ir/generate_pass.cc b/paddle/fluid/framework/ir/generate_pass.cc index 02c9d8e1c0c..00d69c9d5d2 100644 --- a/paddle/fluid/framework/ir/generate_pass.cc +++ b/paddle/fluid/framework/ir/generate_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/generate_pass.h" + #include "paddle/fluid/framework/ir/graph_pattern_detector.h" namespace paddle { @@ -234,178 +235,183 @@ bool IsDuplicatePattern(const GraphPatternDetector::subgraph_t& subgraph, GraphPatternDetector::handle_t GetGenerateDelete( const PDPattern& pattern, const proto::PassDesc& pass_desc) { - GraphPatternDetector::handle_t handler = [&]( - const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { - if (IsDuplicatePattern(subgraph, graph)) { - return; - } - // `var_node_maps` record the mapping of variable to the pattern subgraph. 
- std::map<std::string, Node*> var_node_maps; - for (const proto::PassDesc::VarMap& var_map : pass_desc.var_maps()) { - Node* node = subgraph.at(pattern.RetrieveNode(var_map.pattern_var())); - const auto& iter = var_node_maps.find(var_map.replace_var()); - if (var_node_maps.end() == iter) { - // first node is input - var_node_maps.insert({var_map.replace_var(), node}); - } else { - // output node - for (Node* s_node : node->outputs) { - iter->second->outputs.push_back(s_node); - std::replace(s_node->inputs.begin(), s_node->inputs.end(), node, - iter->second); - s_node->Op()->RenameInput(node->Name(), iter->second->Name()); + GraphPatternDetector::handle_t handler = + [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { + if (IsDuplicatePattern(subgraph, graph)) { + return; } - } - } - // Remove nodes that are intermediate. - std::unordered_set<const Node*> remove_nodes; - for (const std::unique_ptr<PDNode>& pdnode : pattern.nodes()) { - remove_nodes.emplace(subgraph.at(pdnode.get())); - } - for (auto iter : var_node_maps) { - remove_nodes.erase(iter.second); - } - GraphSafeRemoveNodes(graph, remove_nodes); - }; + // `var_node_maps` record the mapping of variable to the pattern + // subgraph. + std::map<std::string, Node*> var_node_maps; + for (const proto::PassDesc::VarMap& var_map : pass_desc.var_maps()) { + Node* node = subgraph.at(pattern.RetrieveNode(var_map.pattern_var())); + const auto& iter = var_node_maps.find(var_map.replace_var()); + if (var_node_maps.end() == iter) { + // first node is input + var_node_maps.insert({var_map.replace_var(), node}); + } else { + // output node + for (Node* s_node : node->outputs) { + iter->second->outputs.push_back(s_node); + std::replace(s_node->inputs.begin(), s_node->inputs.end(), node, + iter->second); + s_node->Op()->RenameInput(node->Name(), iter->second->Name()); + } + } + } + // Remove nodes that are intermediate.
+ std::unordered_set<const Node*> remove_nodes; + for (const std::unique_ptr<PDNode>& pdnode : pattern.nodes()) { + remove_nodes.emplace(subgraph.at(pdnode.get())); + } + for (auto iter : var_node_maps) { + remove_nodes.erase(iter.second); + } + GraphSafeRemoveNodes(graph, remove_nodes); + }; return handler; } GraphPatternDetector::handle_t GetGenerateRewrite( const PDPattern& pattern, const proto::PassDesc& pass_desc) { - GraphPatternDetector::handle_t handler = [&]( - const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { - if (IsDuplicatePattern(subgraph, graph)) { - return; - } - for (const auto& condition : pass_desc.var_attr_conditions()) { - if (condition.has_condition_attr()) { - Node* node = - subgraph.at(pattern.RetrieveNode(condition.attr().var_name())); - Attribute node_attr = GetVarAttrValue(node->Var(), condition.attr()); - Attribute condition_attr; - if (condition.condition_attr().role() == - proto::PassDesc_RoleType_kVariable) { - Node* condition_node = - subgraph.at(pattern.RetrieveNode(condition.attr().var_name())); - condition_attr = GetVarAttrValue(condition_node->Var(), - condition.condition_attr()); - } else { - PADDLE_THROW( - platform::errors::Unimplemented("Unimplemented for operation.")); - } - bool check_failed = false; - if (condition.type() == proto::PassDesc_ConditionType_kEQ) { - check_failed = !(node_attr == condition_attr); - } - if (check_failed) { - VLOG(3) << "Check var [" << node->Name() << "] with attr [" - << condition.attr().name() << "] failed, skip this pattern."; + GraphPatternDetector::handle_t handler = + [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { + if (IsDuplicatePattern(subgraph, graph)) { return; } - } - } - // `var_node_maps` record the mapping of variable to the pattern subgraph. - std::map<std::string, Node*> var_node_maps; - for (const proto::PassDesc::VarMap& var_map : pass_desc.var_maps()) { - Node* node = subgraph.at(pattern.RetrieveNode(var_map.pattern_var())); - var_node_maps.insert({var_map.replace_var(), node}); - } - // Traverse all operators to create subgraph. - for (int index = 0; index < pass_desc.replace_size(); ++index) { - const proto::OpDesc& op = pass_desc.replace(index); - OpDesc op_desc; - std::vector<Node*> in_nodes, out_nodes; - op_desc.SetType(op.type()); - // Create Nodes for inputs of current operator. - for (const proto::OpDesc::Var& var : op.inputs()) { - std::vector<std::string> arguments; - for (const std::string& argument : var.arguments()) { - // The input may be mapped on the operator of pattern subgraph. - Node* node = nullptr; - auto iter = var_node_maps.find(argument); - if (var_node_maps.end() == iter) { - VarDesc var_desc(patterns::UniqueKey(argument)); - node = graph->CreateVarNode(&var_desc); - var_node_maps.insert({argument, node}); - } else { - node = iter->second; - } - in_nodes.push_back(node); - arguments.push_back(node->Name()); - } - op_desc.SetInput(var.parameter(), arguments); - } - // Create Nodes for outputs of current operator. - for (const proto::OpDesc::Var& var : op.outputs()) { - std::vector<std::string> arguments; - for (const std::string& argument : var.arguments()) { - // The output may be mapped on the operator of pattern subgraph.
- Node* node = nullptr; - auto iter = var_node_maps.find(argument); - if (var_node_maps.end() == iter) { - VarDesc var_desc(patterns::UniqueKey(argument)); - node = graph->CreateVarNode(&var_desc); - var_node_maps.insert({argument, node}); - } else { - if (in_nodes.end() == - std::find(in_nodes.begin(), in_nodes.end(), iter->second)) { - node = iter->second; + for (const auto& condition : pass_desc.var_attr_conditions()) { + if (condition.has_condition_attr()) { + Node* node = + subgraph.at(pattern.RetrieveNode(condition.attr().var_name())); + Attribute node_attr = + GetVarAttrValue(node->Var(), condition.attr()); + Attribute condition_attr; + if (condition.condition_attr().role() == + proto::PassDesc_RoleType_kVariable) { + Node* condition_node = subgraph.at( + pattern.RetrieveNode(condition.attr().var_name())); + condition_attr = GetVarAttrValue(condition_node->Var(), + condition.condition_attr()); } else { - node = graph->CreateVarNode(iter->second->Var()); + PADDLE_THROW(platform::errors::Unimplemented( + "Unimplemented for operation.")); + } + bool check_failed = false; + if (condition.type() == proto::PassDesc_ConditionType_kEQ) { + check_failed = !(node_attr == condition_attr); + } + if (check_failed) { + VLOG(3) << "Check var [" << node->Name() << "] with attr [" + << condition.attr().name() + << "] failed, skip this pattern."; + return; } } - out_nodes.push_back(node); - arguments.push_back(node->Name()); } - op_desc.SetOutput(var.parameter(), arguments); - } - // Set attribute for current operator. - for (const proto::OpDesc::Attr& attr : op.attrs()) { - op_desc.SetAttr(attr.name(), GetAttrValue(attr)); - } - for (const auto& attr_map : pass_desc.op_attr_maps()) { - if (attr_map.replace_attr().op_index() == index) { - Attribute attr; - if (attr_map.pattern_attr().role() == - proto::PassDesc_RoleType_kVariable) { - Node* condition_node = subgraph.at( - pattern.RetrieveNode(attr_map.pattern_attr().var_name())); - attr = - GetVarAttrValue(condition_node->Var(), attr_map.pattern_attr()); - } else { - Node* condition_node = subgraph.at(pattern.RetrieveNode( - std::to_string(attr_map.pattern_attr().op_index()))); - attr = - GetOpAttrValue(condition_node->Op(), attr_map.pattern_attr()); + // `var_node_maps` record the mapping of variable to the pattern + // subgraph. + std::map<std::string, Node*> var_node_maps; + for (const proto::PassDesc::VarMap& var_map : pass_desc.var_maps()) { + Node* node = subgraph.at(pattern.RetrieveNode(var_map.pattern_var())); + var_node_maps.insert({var_map.replace_var(), node}); + } + // Traverse all operators to create subgraph. + for (int index = 0; index < pass_desc.replace_size(); ++index) { + const proto::OpDesc& op = pass_desc.replace(index); + OpDesc op_desc; + std::vector<Node*> in_nodes, out_nodes; + op_desc.SetType(op.type()); + // Create Nodes for inputs of current operator. + for (const proto::OpDesc::Var& var : op.inputs()) { + std::vector<std::string> arguments; + for (const std::string& argument : var.arguments()) { + // The input may be mapped on the operator of pattern subgraph. + Node* node = nullptr; + auto iter = var_node_maps.find(argument); + if (var_node_maps.end() == iter) { + VarDesc var_desc(patterns::UniqueKey(argument)); + node = graph->CreateVarNode(&var_desc); + var_node_maps.insert({argument, node}); + } else { + node = iter->second; + } + in_nodes.push_back(node); + arguments.push_back(node->Name()); + } + op_desc.SetInput(var.parameter(), arguments); + } + // Create Nodes for outputs of current operator.
+ for (const proto::OpDesc::Var& var : op.outputs()) { + std::vector<std::string> arguments; + for (const std::string& argument : var.arguments()) { + // The output may be mapped on the operator of pattern subgraph. + Node* node = nullptr; + auto iter = var_node_maps.find(argument); + if (var_node_maps.end() == iter) { + VarDesc var_desc(patterns::UniqueKey(argument)); + node = graph->CreateVarNode(&var_desc); + var_node_maps.insert({argument, node}); + } else { + if (in_nodes.end() == + std::find(in_nodes.begin(), in_nodes.end(), iter->second)) { + node = iter->second; + } else { + node = graph->CreateVarNode(iter->second->Var()); + } + } + out_nodes.push_back(node); + arguments.push_back(node->Name()); + } + op_desc.SetOutput(var.parameter(), arguments); + } + // Set attribute for current operator. + for (const proto::OpDesc::Attr& attr : op.attrs()) { + op_desc.SetAttr(attr.name(), GetAttrValue(attr)); } - if (attr_map.has_operation()) { - Attribute operation = GetAttrValue(attr_map.operation().value()); - attr = boost::apply_visitor( - operation_visitor(attr_map.operation().type()), attr, - operation); + for (const auto& attr_map : pass_desc.op_attr_maps()) { + if (attr_map.replace_attr().op_index() == index) { + Attribute attr; + if (attr_map.pattern_attr().role() == + proto::PassDesc_RoleType_kVariable) { + Node* condition_node = subgraph.at( + pattern.RetrieveNode(attr_map.pattern_attr().var_name())); + attr = GetVarAttrValue(condition_node->Var(), + attr_map.pattern_attr()); + } else { + Node* condition_node = subgraph.at(pattern.RetrieveNode( + std::to_string(attr_map.pattern_attr().op_index()))); + attr = GetOpAttrValue(condition_node->Op(), + attr_map.pattern_attr()); + } + if (attr_map.has_operation()) { + Attribute operation = + GetAttrValue(attr_map.operation().value()); + attr = boost::apply_visitor( + operation_visitor(attr_map.operation().type()), attr, + operation); + } + op_desc.SetAttr(attr_map.replace_attr().name(), attr); + } + } + // Create a Node for current operator. + Node* op_node = graph->CreateOpNode(&op_desc); + for (Node* node : in_nodes) { + IR_NODE_LINK_TO(node, op_node); + } + for (Node* node : out_nodes) { + IR_NODE_LINK_TO(op_node, node); } - op_desc.SetAttr(attr_map.replace_attr().name(), attr); } - } - // Create a Node for current operator. - Node* op_node = graph->CreateOpNode(&op_desc); - for (Node* node : in_nodes) { - IR_NODE_LINK_TO(node, op_node); - } - for (Node* node : out_nodes) { - IR_NODE_LINK_TO(op_node, node); - } - } - // Remove nodes that are intermediate. - std::unordered_set<const Node*> remove_nodes; - for (const std::unique_ptr<PDNode>& pdnode : pattern.nodes()) { - remove_nodes.emplace(subgraph.at(pdnode.get())); - } - for (auto iter : var_node_maps) { - remove_nodes.erase(iter.second); - } - GraphSafeRemoveNodes(graph, remove_nodes); - }; + // Remove nodes that are intermediate. + std::unordered_set<const Node*> remove_nodes; + for (const std::unique_ptr<PDNode>& pdnode : pattern.nodes()) { + remove_nodes.emplace(subgraph.at(pdnode.get())); + } + for (auto iter : var_node_maps) { + remove_nodes.erase(iter.second); + } + GraphSafeRemoveNodes(graph, remove_nodes); + }; return handler; } diff --git a/paddle/fluid/framework/ir/generate_pass_tester.cc b/paddle/fluid/framework/ir/generate_pass_tester.cc index 6876dde50c1..7e98b11215a 100644 --- a/paddle/fluid/framework/ir/generate_pass_tester.cc +++ b/paddle/fluid/framework/ir/generate_pass_tester.cc @@ -12,16 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License.
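The generate_pass.cc hunks above are pure reflow: once handler = [&](...) overflows the column limit, clang-format breaks after the =, moves the whole lambda introducer to the next line, and indents the body one further level. A self-contained sketch of the same before/after shape, using only standard-library types in place of Paddle's, so it is an illustration rather than the pass code:

#include <functional>
#include <iostream>
#include <string>

int main() {
  int found_subgraph_count = 0;
  // New style: break after '=', then indent the lambda body under the
  // introducer, as in the GetGenerateDelete/GetGenerateRewrite hunks above.
  std::function<void(const std::string&, int)> handler =
      [&](const std::string& subgraph_name, int node_count) {
        std::cout << subgraph_name << ": " << node_count << " nodes\n";
        ++found_subgraph_count;
      };
  handler("pattern", 3);
  return found_subgraph_count == 1 ? 0 : 1;
}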
-#include "paddle/fluid/framework/ir/generate_pass.h" #include "gtest/gtest.h" +#include "paddle/fluid/framework/ir/generate_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" REGISTER_GENERATE_PASS(generate_fc_fuse) { paddle::framework::ir::PassPairs pass_pairs; for (bool with_relu : {true, false}) { // pattern - SUBGRAPH_(pattern) = - [ subgraph = &pattern, with_relu ](VAR_(x), VAR_(y), VAR_(z)) { + SUBGRAPH_(pattern) = [subgraph = &pattern, with_relu](VAR_(x), VAR_(y), + VAR_(z)) { VLOG(3) << "exec lambda func."; auto mul = OP_(mul)({{"X", x}, {"Y", y}}).Out("Out"); auto ewadd = OP_(elementwise_add)({{"X", mul}, {"Y", z}}).Out("Out"); @@ -32,8 +32,8 @@ REGISTER_GENERATE_PASS(generate_fc_fuse) { } }; // replace - SUBGRAPH_(replace) = - [ subgraph = &replace, with_relu ](VAR_(x), VAR_(y), VAR_(z)) { + SUBGRAPH_(replace) = [subgraph = &replace, with_relu](VAR_(x), VAR_(y), + VAR_(z)) { auto& fc = OP_(fc)({{"Input", x}, {"W", y}, {"Bias", z}}); return fc.Out("Out"); }; diff --git a/paddle/fluid/framework/ir/gpu_cpu_map_matmul_to_mul_pass.cc b/paddle/fluid/framework/ir/gpu_cpu_map_matmul_to_mul_pass.cc index ac580b99b5c..8e58231e986 100644 --- a/paddle/fluid/framework/ir/gpu_cpu_map_matmul_to_mul_pass.cc +++ b/paddle/fluid/framework/ir/gpu_cpu_map_matmul_to_mul_pass.cc @@ -16,9 +16,9 @@ #include #include + #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/op_proto_maker.h" - #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index f5f6f3ecb85..acf8f6ec643 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/ir/graph.h" + #include -#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/operator.h" PADDLE_DEFINE_EXPORTED_bool(convert_all_blocks, true, diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 10645f08dc3..40a6fbbade8 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include #include #include diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index ed7aa451d13..d4c7a607db3 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/framework/ir/graph_helper.h" + #include #include + #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/op_proto_maker.h" @@ -421,8 +423,9 @@ std::vector TopologySortGraphByDescOrder(const Graph &graph) { DescOrderComparator> adj_list = BuildOperationAdjList(graph); PADDLE_ENFORCE_EQ(HasCircleInternal(adj_list, nullptr), - false, platform::errors::InvalidArgument( - "Generated graph shouldn't contain cycle.")); + false, + platform::errors::InvalidArgument( + "Generated graph shouldn't contain cycle.")); std::unordered_set visited; std::vector ret; for (auto adj : adj_list) { diff --git a/paddle/fluid/framework/ir/graph_helper_test.cc b/paddle/fluid/framework/ir/graph_helper_test.cc index 0a2dcfed000..5972cd40817 100644 --- a/paddle/fluid/framework/ir/graph_helper_test.cc +++ b/paddle/fluid/framework/ir/graph_helper_test.cc @@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/ir/graph.h" -#include "gtest/gtest.h" #include "paddle/fluid/framework/ir/graph_helper.h" + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/program_desc.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index ea101125b18..ca5a82708c5 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/graph_pattern_detector.h" + #include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h" #include "paddle/fluid/framework/operator.h" @@ -70,8 +71,9 @@ void PDPattern::AddEdge(PDNode *a, PDNode *b) { a, platform::errors::NotFound("PDNode %s is not found.", a->name())); PADDLE_ENFORCE_NOT_NULL( b, platform::errors::NotFound("PDNode %s is not found.", b->name())); - PADDLE_ENFORCE_NE(a, b, platform::errors::PermissionDenied( - "Cannot connect the same node in the graph.")); + PADDLE_ENFORCE_NE(a, b, + platform::errors::PermissionDenied( + "Cannot connect the same node in the graph.")); edges_.emplace_back(a, b); } @@ -3062,11 +3064,10 @@ PDNode *patterns::ReshapeTransposeMatmulPattern::operator()( transpose_out->assert_is_only_output_of_op("transpose2"); auto transpose_xshape = - with_transpose_xshape - ? pattern->NewNode(transpose_xshape_repr()) - ->AsIntermediate() - ->assert_is_op_output("transpose2", "XShape") - : nullptr; + with_transpose_xshape ? 
pattern->NewNode(transpose_xshape_repr()) + ->AsIntermediate() + ->assert_is_op_output("transpose2", "XShape") + : nullptr; auto matmul_out = pattern->NewNode(matmul_out_repr()) ->AsOutput() diff --git a/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc b/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc index 5ac5a5d9839..b02b2e13edc 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc @@ -152,12 +152,12 @@ TEST(GraphPatternDetecter, MultiSubgraph) { x.mutable_pattern()->AddEdge(any_var, any_op1); int count = 0; - GraphPatternDetector::handle_t handle = [&]( - const GraphPatternDetector::subgraph_t& s, Graph* g) { - LOG(INFO) << "Detect " << s.at(any_op)->Name() << " -> " - << s.at(any_var)->Name() << " -> " << s.at(any_op1)->Name(); - count++; - }; + GraphPatternDetector::handle_t handle = + [&](const GraphPatternDetector::subgraph_t& s, Graph* g) { + LOG(INFO) << "Detect " << s.at(any_op)->Name() << " -> " + << s.at(any_var)->Name() << " -> " << s.at(any_op1)->Name(); + count++; + }; x(&graph, handle); diff --git a/paddle/fluid/framework/ir/graph_printer.h b/paddle/fluid/framework/ir/graph_printer.h index 76b07f0d653..1b0e059f122 100644 --- a/paddle/fluid/framework/ir/graph_printer.h +++ b/paddle/fluid/framework/ir/graph_printer.h @@ -15,11 +15,13 @@ #pragma once #include + #include #include #include #include #include + #include "paddle/fluid/framework/details/multi_devices_helper.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc index 1ff67ae0fe0..db18a735ce2 100644 --- a/paddle/fluid/framework/ir/graph_test.cc +++ b/paddle/fluid/framework/ir/graph_test.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/graph.h" + #include "gtest/gtest.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.cc b/paddle/fluid/framework/ir/graph_to_program_pass.cc index 3ad591c6dff..f57cdd9d974 100644 --- a/paddle/fluid/framework/ir/graph_to_program_pass.cc +++ b/paddle/fluid/framework/ir/graph_to_program_pass.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/graph_to_program_pass.h" #include + #include #include "paddle/fluid/framework/op_proto_maker.h" diff --git a/paddle/fluid/framework/ir/graph_traits.cc b/paddle/fluid/framework/ir/graph_traits.cc index b0631456302..36bc3e6dd78 100644 --- a/paddle/fluid/framework/ir/graph_traits.cc +++ b/paddle/fluid/framework/ir/graph_traits.cc @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "paddle/fluid/framework/ir/graph_traits.h" + #include #include -#include "paddle/fluid/framework/ir/graph_traits.h" - namespace paddle { namespace framework { namespace ir { @@ -76,21 +76,22 @@ NodesDFSIterator::NodesDFSIterator(const std::vector &source) { } NodesDFSIterator::NodesDFSIterator(NodesDFSIterator &&other) noexcept - : stack_(std::move(other.stack_)), - visited_(std::move(other.visited_)) {} + : stack_(std::move(other.stack_)), visited_(std::move(other.visited_)) {} NodesDFSIterator::NodesDFSIterator(const NodesDFSIterator &other) : stack_(other.stack_), visited_(other.visited_) {} Node &NodesDFSIterator::operator*() { - PADDLE_ENFORCE_EQ(stack_.empty(), false, platform::errors::OutOfRange( - "The iterator exceeds range.")); + PADDLE_ENFORCE_EQ( + stack_.empty(), false, + platform::errors::OutOfRange("The iterator exceeds range.")); return *stack_.top(); } NodesDFSIterator &NodesDFSIterator::operator++() { - PADDLE_ENFORCE_EQ(stack_.empty(), false, platform::errors::OutOfRange( - "The iterator exceeds range.")); + PADDLE_ENFORCE_EQ( + stack_.empty(), false, + platform::errors::OutOfRange("The iterator exceeds range.")); visited_.insert(stack_.top()); auto *cur = stack_.top(); stack_.pop(); diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc index 7311eb4b91d..da48d1d19b6 100644 --- a/paddle/fluid/framework/ir/graph_viz_pass.cc +++ b/paddle/fluid/framework/ir/graph_viz_pass.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/graph_viz_pass.h" + #include + #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_printer.h" #include "paddle/fluid/framework/op_proto_maker.h" diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc index 6b91ea4e360..3d60148c170 100644 --- a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc +++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/framework/ir/identity_scale_op_clean_pass.h" + #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -46,42 +47,42 @@ void IdentityScaleOpCleanPass::ApplyImpl(ir::Graph* graph) const { scale_op->LinksFrom({scale_in}).LinksTo({scale_out}); int found_subgraph_count = 0; - GraphPatternDetector::handle_t handler = [&]( - const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { - Node* scale_op_var = subgraph.at(scale_op); - Node* scale_in_var = subgraph.at(scale_in); - Node* scale_out_var = subgraph.at(scale_out); - const std::string scale_in_name = scale_in_var->Name(); - const std::string scale_out_name = scale_out_var->Name(); - // Remove links in graph - GraphSafeRemoveNodes(graph, {scale_in_var, scale_op_var}); - // Modify pre_op_desc - // Link pre_op directly to scale_out - for (auto& node : graph->Nodes()) { - if (node->IsOp()) { - auto* op_desc = node->Op(); - auto out_vars_map = op_desc->Outputs(); - for (auto out_var_map : out_vars_map) { - auto names = out_var_map.second; - bool reset = false; - for (size_t i = 0; i < names.size(); i++) { - if (names[i] == scale_in_name) { - reset = true; - names[i] = scale_out_name; - break; + GraphPatternDetector::handle_t handler = + [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { + Node* scale_op_var = subgraph.at(scale_op); + Node* scale_in_var = subgraph.at(scale_in); + Node* scale_out_var = subgraph.at(scale_out); + const std::string scale_in_name = scale_in_var->Name(); + const std::string scale_out_name = scale_out_var->Name(); + // Remove links in graph + GraphSafeRemoveNodes(graph, {scale_in_var, scale_op_var}); + // Modify pre_op_desc + // Link pre_op directly to scale_out + for (auto& node : graph->Nodes()) { + if (node->IsOp()) { + auto* op_desc = node->Op(); + auto out_vars_map = op_desc->Outputs(); + for (auto out_var_map : out_vars_map) { + auto names = out_var_map.second; + bool reset = false; + for (size_t i = 0; i < names.size(); i++) { + if (names[i] == scale_in_name) { + reset = true; + names[i] = scale_out_name; + break; + } + } + if (reset) { + op_desc->SetOutput(out_var_map.first, names); + op_desc->Flush(); + IR_NODE_LINK_TO(node, scale_out_var); + break; + } } } - if (reset) { - op_desc->SetOutput(out_var_map.first, names); - op_desc->Flush(); - IR_NODE_LINK_TO(node, scale_out_var); - break; - } } - } - } - found_subgraph_count++; - }; + found_subgraph_count++; + }; detector(graph, handler); AddStatis(found_subgraph_count); diff --git a/paddle/fluid/framework/ir/ipu/avg_shard_pass.cc b/paddle/fluid/framework/ir/ipu/avg_shard_pass.cc index f1ee3c26b8f..5c7373e1a77 100644 --- a/paddle/fluid/framework/ir/ipu/avg_shard_pass.cc +++ b/paddle/fluid/framework/ir/ipu/avg_shard_pass.cc @@ -14,10 +14,9 @@ #include "paddle/fluid/framework/ir/ipu/avg_shard_pass.h" -#include "paddle/fluid/platform/device/ipu/ipu_backend.h" - #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc b/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc index ebe40c3ee20..cbe57eae4c4 100644 --- a/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc +++ b/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/framework/ir/ipu/infer_shape_pass.h" + #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/framework/ir/ipu/inference_process_pass.cc b/paddle/fluid/framework/ir/ipu/inference_process_pass.cc index a6b82089dc4..df4ea7fac4b 100644 --- a/paddle/fluid/framework/ir/ipu/inference_process_pass.cc +++ b/paddle/fluid/framework/ir/ipu/inference_process_pass.cc @@ -14,11 +14,10 @@ #include "paddle/fluid/framework/ir/ipu/inference_process_pass.h" -#include "paddle/fluid/platform/device/ipu/ipu_backend.h" -#include "paddle/fluid/platform/device/ipu/ipu_strategy.h" - #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" +#include "paddle/fluid/platform/device/ipu/ipu_strategy.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.cc b/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.cc index 4da913e7176..12d646e153b 100644 --- a/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.cc +++ b/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h" + #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/platform/device/ipu/ipu_backend.h" #include "paddle/fluid/platform/device/ipu/ipu_names.h" diff --git a/paddle/fluid/framework/ir/is_test_pass_tester.cc b/paddle/fluid/framework/ir/is_test_pass_tester.cc index bf0667aeafe..d2444295544 100644 --- a/paddle/fluid/framework/ir/is_test_pass_tester.cc +++ b/paddle/fluid/framework/ir/is_test_pass_tester.cc @@ -11,9 +11,9 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/is_test_pass.h" - #include + +#include "paddle/fluid/framework/ir/is_test_pass.h" #ifdef _WIN32 #undef FALSE #undef TRUE diff --git a/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc b/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc index 4b0dc4809f5..1b7b06213fe 100644 --- a/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc @@ -12,11 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/ir/layer_norm_fuse_pass.h" + #include #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" -#include "paddle/fluid/framework/ir/layer_norm_fuse_pass.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/ir/lock_free_optimize_pass.h b/paddle/fluid/framework/ir/lock_free_optimize_pass.h index 93b6396bf7f..a72a59374f9 100644 --- a/paddle/fluid/framework/ir/lock_free_optimize_pass.h +++ b/paddle/fluid/framework/ir/lock_free_optimize_pass.h @@ -29,31 +29,31 @@ class Node; class Graph; /* -* Remove the sum op of all gradients of the backward op. -* And remove the dependecies of the optimizer related to the -* same backward op. 
-* -* Before this pass: -* -* forward_op1 forward_op2 -* | | -* grad_op1 grad_op2 -* \ / -* \ / -* sum_op -* | -* sgd_op -* -* After this pass: -* forward_op1 forward_op2 -* | | -* grad_op1 grad_op2 -* | | -* sgd_op1 sgd_op2 -* -* sgd_op1 and sgd_op2 will update the same weight which holds the same -* memory, so we could benefits from the acceleration -*/ + * Remove the sum op of all gradients of the backward op. + * And remove the dependecies of the optimizer related to the + * same backward op. + * + * Before this pass: + * + * forward_op1 forward_op2 + * | | + * grad_op1 grad_op2 + * \ / + * \ / + * sum_op + * | + * sgd_op + * + * After this pass: + * forward_op1 forward_op2 + * | | + * grad_op1 grad_op2 + * | | + * sgd_op1 sgd_op2 + * + * sgd_op1 and sgd_op2 will update the same weight which holds the same + * memory, so we could benefits from the acceleration + */ class LockFreeOptimizePass : public Pass { public: virtual ~LockFreeOptimizePass() {} diff --git a/paddle/fluid/framework/ir/matmul_scale_fuse_pass.cc b/paddle/fluid/framework/ir/matmul_scale_fuse_pass.cc index 2335e5eee01..a4bab58506e 100644 --- a/paddle/fluid/framework/ir/matmul_scale_fuse_pass.cc +++ b/paddle/fluid/framework/ir/matmul_scale_fuse_pass.cc @@ -16,9 +16,9 @@ #include #include + #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/op_proto_maker.h" - #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc index b12b84d4a49..090673b87ed 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc @@ -321,13 +321,15 @@ size_t BufferSharedCrossOpMemoryReusePass::ResolveDependencyBetween( } void BufferSharedCrossOpMemoryReusePass::BuildOpDependencyMap() const { - PADDLE_ENFORCE_EQ(ops_.empty(), true, platform::errors::InvalidArgument( - "Ops must be initialized here.")); + PADDLE_ENFORCE_EQ( + ops_.empty(), true, + platform::errors::InvalidArgument("Ops must be initialized here.")); PADDLE_ENFORCE_EQ( op_to_idx_.empty(), true, platform::errors::InvalidArgument("Op to idx must be initialized here.")); - PADDLE_ENFORCE_EQ(deps_.empty(), true, platform::errors::InvalidArgument( - "Deps must be initialized here.")); + PADDLE_ENFORCE_EQ( + deps_.empty(), true, + platform::errors::InvalidArgument("Deps must be initialized here.")); // Toposort ops OpGraphView graph_view(ir::FilterByNodeWrapper<details::OpHandleBase>(*graph_)); diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc index 1ca6e989f27..682a72c5729 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc @@ -166,8 +166,9 @@ static std::string GetFirstVarName(const OpDesc &op, const std::string &slot, static std::vector<std::vector<std::pair<std::string, std::string>>> GetInplaceVars(const BlockDesc &block, bool use_cuda, const std::vector<std::string> &skip_vars) { - PADDLE_ENFORCE_EQ(block.ID(), 0, platform::errors::Unimplemented( - "Inplace can only perform in block 0.")); + PADDLE_ENFORCE_EQ( + block.ID(), 0, + platform::errors::Unimplemented("Inplace can only perform in block 0."));
// only take block 0 gc_vars const auto op_gc_vars = GetEagerDeletionCleanVars(*block.Program(), skip_vars)[0]; diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h index e89734bacec..8d593254f90 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h +++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h @@ -19,6 +19,7 @@ #include #include #include + #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h index d6f286afc55..b5506dd1dcb 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h +++ b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h @@ -136,13 +136,15 @@ void OpGraphView::BreadthFirstVisit(Callback &&callback) const { } } - PADDLE_ENFORCE_EQ(num_calls, op_num, platform::errors::InvalidArgument( - "There are unvisited ops.")); + PADDLE_ENFORCE_EQ( + num_calls, op_num, + platform::errors::InvalidArgument("There are unvisited ops.")); PADDLE_ENFORCE_EQ( visited_ops.size(), op_num, platform::errors::InvalidArgument("There are unvisited ops.")); - PADDLE_ENFORCE_EQ(op_deps.empty(), true, platform::errors::InvalidArgument( - "There are unvisited ops.")); + PADDLE_ENFORCE_EQ( + op_deps.empty(), true, + platform::errors::InvalidArgument("There are unvisited ops.")); } } // namespace ir diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc index 6077069ea74..b1fdb5e2160 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc @@ -26,9 +26,9 @@ namespace paddle { namespace framework { namespace ir { +using paddle::operators::OpAndGradOpPair; using paddle::operators::OpVariant; using paddle::operators::OpVariantSet; -using paddle::operators::OpAndGradOpPair; void RecurrentOpEagerDeletionPass::ApplyImpl(Graph *graph) const { // Find all recurrent_op and recurrent_grad_op in graph diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc index 313b2cc3345..3f88aaad57e 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/eager_deletion_op_handle.h" #include "paddle/fluid/framework/ir/graph_helper.h" diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc index 88bf9e38763..848b6e494ad 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include + #include "gtest/gtest.h" #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/eager_deletion_op_handle.h" diff --git a/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc b/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc index 4aa59d9196b..80f201d2d5a 100644 --- a/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc +++ b/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/mixed_precision_configure_pass.h" + #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -25,9 +26,10 @@ void MixedPrecisionConfigurePass::InsertCastOps( VLOG(3) << "Insert the cast op before and after the kernel that does not " "supports fp16 precision"; - auto update_cast_desc = [&]( - framework::OpDesc& desc, const std::string& x_name, - const std::string& out_name, const int in_dtype, const int out_dtype) { + auto update_cast_desc = [&](framework::OpDesc& desc, + const std::string& x_name, + const std::string& out_name, const int in_dtype, + const int out_dtype) { desc.SetType("cast"); desc.SetInput("X", {x_name}); desc.SetOutput("Out", {out_name}); diff --git a/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc index 9f6cd8992dc..62145cb6a0f 100644 --- a/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.h" + #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass_tester.cc index e13d44ac232..b1b546f085c 100644 --- a/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass_tester.cc @@ -34,7 +34,7 @@ void SetBatchNormAttrs(OpDesc* bn_op, bool is_test = true, bn_op->SetAttr("fuse_with_relu", false); bn_op->SetAttr("epsilon", 0.001f); } -} +} // namespace // ------------------------------ Test cases ----------------------------------- @@ -48,11 +48,12 @@ TEST(FuseBatchNormActOneDNNPass, ThrowIsTestTrainableStats) { auto prog = test::BuildProgramDesc( {"x", "m", "v", "bn_y", "act_y", "m_out", "var_out", "sm", "sv"}, {"scale", "bias"}); - auto* bn_op = test::CreateOp(&prog, "batch_norm", {{"X", "x"}, - {"Scale", "scale"}, - {"Bias", "bias"}, - {"Mean", "m"}, - {"Variance", "v"}}, + auto* bn_op = test::CreateOp(&prog, "batch_norm", + {{"X", "x"}, + {"Scale", "scale"}, + {"Bias", "bias"}, + {"Mean", "m"}, + {"Variance", "v"}}, {{"Y", "bn_y"}, {"MeanOut", "m_out"}, {"VarianceOut", "var_out"}, @@ -73,11 +74,12 @@ TEST(FuseBatchNormActOneDNNPass, ThrowIsTestTrainableStats) { TEST(FuseBatchNormActOneDNNPass, FuseIsTest) { auto prog = test::BuildProgramDesc({"x", "m", "v", "bn_y", "act_y"}, {"scale", "bias"}); - auto* bn_op = test::CreateOp(&prog, "batch_norm", {{"X", "x"}, - {"Scale", "scale"}, - {"Bias", "bias"}, - {"Mean", "m"}, - {"Variance", "v"}}, + auto* bn_op = test::CreateOp(&prog, "batch_norm", + {{"X", "x"}, + {"Scale", "scale"}, + {"Bias", "bias"}, + {"Mean", "m"}, + {"Variance", "v"}}, {{"Y", 
"bn_y"}}); SetBatchNormAttrs(bn_op, true, false); test::CreateOp(&prog, "relu", {{"X", "bn_y"}}, {{"Out", "act_y"}}, false); @@ -106,11 +108,12 @@ TEST(FuseBatchNormActOneDNNPass, ThrowTrainableStats) { auto prog = test::BuildProgramDesc( {"x", "m", "v", "bn_y", "act_y", "m_out", "var_out", "sm", "sv"}, {"scale", "bias"}); - auto* bn_op = test::CreateOp(&prog, "batch_norm", {{"X", "x"}, - {"Scale", "scale"}, - {"Bias", "bias"}, - {"Mean", "m"}, - {"Variance", "v"}}, + auto* bn_op = test::CreateOp(&prog, "batch_norm", + {{"X", "x"}, + {"Scale", "scale"}, + {"Bias", "bias"}, + {"Mean", "m"}, + {"Variance", "v"}}, {{"Y", "bn_y"}, {"MeanOut", "m_out"}, {"VarianceOut", "var_out"}, @@ -132,11 +135,12 @@ TEST(FuseBatchNormActOneDNNPass, AllAttrsFalse) { auto prog = test::BuildProgramDesc( {"x", "m", "v", "bn_y", "act_y", "m_out", "var_out", "sm", "sv"}, {"scale", "bias"}); - auto* bn_op = test::CreateOp(&prog, "batch_norm", {{"X", "x"}, - {"Scale", "scale"}, - {"Bias", "bias"}, - {"Mean", "m"}, - {"Variance", "v"}}, + auto* bn_op = test::CreateOp(&prog, "batch_norm", + {{"X", "x"}, + {"Scale", "scale"}, + {"Bias", "bias"}, + {"Mean", "m"}, + {"Variance", "v"}}, {{"Y", "bn_y"}, {"MeanOut", "m_out"}, {"VarianceOut", "var_out"}, @@ -158,11 +162,12 @@ TEST(FuseBatchNormActOneDNNPass, ThrowUseMkldnn) { auto prog = test::BuildProgramDesc( {"x", "m", "v", "bn_y", "act_y", "m_out", "var_out", "sm", "sv"}, {"scale", "bias"}); - auto* bn_op = test::CreateOp(&prog, "batch_norm", {{"X", "x"}, - {"Scale", "scale"}, - {"Bias", "bias"}, - {"Mean", "m"}, - {"Variance", "v"}}, + auto* bn_op = test::CreateOp(&prog, "batch_norm", + {{"X", "x"}, + {"Scale", "scale"}, + {"Bias", "bias"}, + {"Mean", "m"}, + {"Variance", "v"}}, {{"Y", "bn_y"}, {"MeanOut", "m_out"}, {"VarianceOut", "var_out"}, diff --git a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc index d7d0b988b55..e19426d01d1 100644 --- a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc @@ -12,11 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h" + #include + #include #include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h" #include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h index b0076c1b38c..26fb6e4978f 100644 --- a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc index 1fefab805b1..e3db8547176 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h" - #include + #include + +#include "paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h" #include "paddle/fluid/framework/op_proto_maker.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h index a74d7443ee1..18e09173491 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h @@ -23,8 +23,8 @@ namespace paddle { namespace framework { namespace ir { /* -* Fuse the Conv and Elementwise_add to a ConvBiasOp. -*/ + * Fuse the Conv and Elementwise_add to a ConvBiasOp. + */ class Graph; class ConvBiasFusePass : public FusePassBase { @@ -38,8 +38,8 @@ class ConvBiasFusePass : public FusePassBase { const std::string name_scope_{"conv_bias_mkldnn_fuse"}; }; /* -* Fuse the Conv3D and Elementwise_add to a Conv3DBiasOp. -*/ + * Fuse the Conv3D and Elementwise_add to a Conv3DBiasOp. + */ class Conv2DTransposeBiasFusePass : public ConvBiasFusePass { public: Conv2DTransposeBiasFusePass(); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc index e9850483ebe..0e052debaee 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h" #include -#include "paddle/fluid/framework/naive_executor.h" -#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h" +#include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/imperative/type_defs.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc index 6b648608ca1..7d165b1a38a 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h" - #include + +#include "paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h" #include "paddle/fluid/framework/op_proto_maker.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc index 879c669bbbe..58eec79344d 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc @@ -226,7 +226,7 @@ class DeQuantizer final : public Quanter { return Quanter::create_quant_op(output_name, input_name); } }; -} +} // namespace using string::PrettyLogDetail; void CPUBFloat16Pass::ApplyImpl(ir::Graph* graph) const { diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index a61c043b580..452212664ec 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -12,11 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h" + #include #include #include -#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h" #include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h" #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/string/pretty_log.h" diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc index 912c16288c2..fb36365ac54 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h" // NOLINT #include + #include +#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h" // NOLINT #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc index 350fad2c672..f6e5279ed23 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h" - #include + +#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h" #include "paddle/fluid/platform/mkldnn_helper.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc index 06940b38ea8..979c601ac04 100644 --- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc @@ -12,10 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h" - #include +#include "paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h" #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc index b7f7a8071d2..2a8a248a99f 100644 --- a/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h" + #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc index 7fc8806452b..afcd493f92f 100644 --- a/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.h" + #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass_tester.cc index 59d81cb8647..4b158ccc5a8 100644 --- a/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass_tester.cc @@ -32,7 +32,9 @@ TEST(FuseFCActOneDNNPass, ThrowUseMkldnn) { test::BuildProgramDesc({"x", "fc_y", "act_y"}, {"weights", "bias"}); test::CreateOp(&prog, "fc", { - {"Input", "x"}, {"Weights", "weights"}, {"Bias", "bias"}, + {"Input", "x"}, + {"Weights", "weights"}, + {"Bias", "bias"}, }, {{"Out", "fc_y"}}, false); test::CreateOp(&prog, "gelu", {{"Input", "fc_y"}}, {{"Out", "act_y"}}, false); @@ -51,7 +53,9 @@ TEST(FuseFCActOneDNNPass, FuseWithGeluTanh) { test::BuildProgramDesc({"x", "fc_y", "act_y"}, {"weights", "bias"}); test::CreateOp(&prog, "fc", { - {"Input", "x"}, {"Weights", "weights"}, {"Bias", "bias"}, + {"Input", "x"}, + {"Weights", "weights"}, + {"Bias", "bias"}, }, {{"Out", "fc_y"}}); auto* act_op = test::CreateOp(&prog, "gelu", {{"Input", "fc_y"}}, @@ -83,7 +87,9 @@ TEST(FuseFCActOneDNNPass, FuseWithGeluErf) { test::BuildProgramDesc({"x", "fc_y", "act_y"}, {"weights", "bias"}); test::CreateOp(&prog, "fc", { - {"Input", "x"}, {"Weights", "weights"}, {"Bias", "bias"}, + {"Input", "x"}, + {"Weights", "weights"}, + {"Bias", "bias"}, }, {{"Out", "fc_y"}}); auto* act_op = test::CreateOp(&prog, "gelu", {{"Input", "fc_y"}}, @@ -115,7 +121,9 @@ TEST(FuseFCActOneDNNPass, FuseWithGeluAuto) { test::BuildProgramDesc({"x", "fc_y", "act_y"}, {"weights", "bias"}); test::CreateOp(&prog, "fc", { - {"Input", "x"}, {"Weights", "weights"}, {"Bias", "bias"}, + {"Input", "x"}, + {"Weights", "weights"}, + {"Bias", "bias"}, }, {{"Out", "fc_y"}}); test::CreateOp(&prog, "gelu", {{"Input", "fc_y"}}, {{"Out", "act_y"}}, false); @@ -145,7 +153,9 @@ TEST(FuseFCActOneDNNPass, FuseWithTanh) { test::BuildProgramDesc({"x", "fc_y", "act_y"}, {"weights", "bias"}); test::CreateOp(&prog, "fc", { - {"Input", "x"}, {"Weights", "weights"}, {"Bias", "bias"}, + {"Input", "x"}, + {"Weights", "weights"}, + {"Bias", "bias"}, }, {{"Out", "fc_y"}}); test::CreateOp(&prog, "tanh", {{"Input", "fc_y"}}, {{"Out", "act_y"}}, false); @@ -175,7 +185,9 @@ TEST(FuseFCActOneDNNPass, FuseWithSigmoid) { test::BuildProgramDesc({"x", "fc_y", "act_y"}, {"weights", "bias"}); test::CreateOp(&prog, "fc", { - {"Input", "x"}, {"Weights", "weights"}, {"Bias", "bias"}, + {"Input", "x"}, + {"Weights", "weights"}, + {"Bias", "bias"}, }, {{"Out", "fc_y"}}); test::CreateOp(&prog, "sigmoid", {{"Input", "fc_y"}}, {{"Out", "act_y"}}, @@ -206,7 +218,9 @@ TEST(FuseFCActOneDNNPass, FuseWithMish) { test::BuildProgramDesc({"x", "fc_y", "act_y"}, {"weights", "bias"}); test::CreateOp(&prog, "fc", { - {"Input", "x"}, {"Weights", "weights"}, {"Bias", "bias"}, + {"Input", "x"}, + {"Weights", "weights"}, + {"Bias", "bias"}, }, {{"Out", "fc_y"}}); test::CreateOp(&prog, "mish", {{"Input", "fc_y"}}, {{"Out", "act_y"}}, false); @@ -236,7 +250,9 @@ TEST(FuseFCActOneDNNPass, FuseWithHardSwish) { test::BuildProgramDesc({"x", "fc_y", "act_y"}, {"weights", "bias"}); test::CreateOp(&prog, "fc", { - {"Input", "x"}, {"Weights", "weights"}, {"Bias", "bias"}, + {"Input", "x"}, + {"Weights", "weights"}, + {"Bias", "bias"}, }, 
{{"Out", "fc_y"}}); test::CreateOp(&prog, "hard_swish", {{"Input", "fc_y"}}, {{"Out", "act_y"}}, diff --git a/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc index 2e62597f2ee..60856512779 100644 --- a/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.h" + #include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/string/pretty_log.h" diff --git a/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.cc index 678a8fb4a69..a5481f5c6f3 100644 --- a/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.cc @@ -129,17 +129,13 @@ void Int8ScaleCalculationMkldnnPass::ApplyImpl(ir::Graph* graph) const { bool has_activation = !conv_op->Op()->GetAttrIfExists("fuse_activation").empty(); float activation_scale = - force_fp32_output - ? 1.0f - : has_activation - ? conv_op->Op()->GetAttrIfExists("Scale_out") - : 1.0f; + force_fp32_output ? 1.0f + : has_activation ? conv_op->Op()->GetAttrIfExists("Scale_out") + : 1.0f; auto scale_out_data = - force_fp32_output - ? 1.0f - : has_activation - ? 1.0f - : conv_op->Op()->GetAttrIfExists("Scale_out"); + force_fp32_output ? 1.0f + : has_activation ? 1.0f + : conv_op->Op()->GetAttrIfExists("Scale_out"); float sum_scale = fuse_residual_conn ? scale_out_data / scale_in_eltwise_data : 1.0f; diff --git a/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc index 804d04e35f6..9d3940c9664 100644 --- a/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.h" #include +#include "paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.h" + namespace paddle { namespace framework { namespace ir { diff --git a/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc index 4eb532b47cb..1ed36e06fb1 100644 --- a/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.h" + #include #include + #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc index 34a35877a7f..f6c99a477bc 100644 --- a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc @@ -13,8 +13,11 @@ // limitations under the License. 
#include "paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.h" + #include + #include + #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc index ed99989cf38..ddb9e717392 100644 --- a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "paddle/fluid/framework/ir/mkldnn/matmul_v2_transpose_reshape_fuse_pass.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_v2_transpose_reshape_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/matmul_v2_transpose_reshape_fuse_pass.cc index dcf4664d963..6e106fa9dae 100644 --- a/paddle/fluid/framework/ir/mkldnn/matmul_v2_transpose_reshape_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/matmul_v2_transpose_reshape_fuse_pass.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/framework/ir/mkldnn/matmul_v2_transpose_reshape_fuse_pass.h" + #include + #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc index 4236dc55d51..06e0db4c93e 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc @@ -12,12 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include #include #include -#include - #include "gtest/gtest.h" #include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h" diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_fc_rnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_fc_rnn_fuse_pass_tester.cc index c4770a322db..1ca9e76f79d 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_fc_rnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_fc_rnn_fuse_pass_tester.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "paddle/fluid/framework/ir/fc_gru_fuse_pass_tester.h" #include "paddle/fluid/framework/ir/fc_lstm_fuse_pass_tester.h" #include "paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h" diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.cc index d2763bd6a6d..ae8dbceb7a6 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.cc @@ -13,12 +13,14 @@ // limitations under the License. 
#include "paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.h" + #include #include #include #include #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_info.h" diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.h b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.h index 44b6d110db8..880630055e9 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once #include + #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/pass.h" diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc index 7df957b2c0e..7f4e5d32536 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc @@ -12,13 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.h" - #include -#include #include +#include +#include "paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h b/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h index 505bb2739e1..99a55b26e99 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/ir/graph_helper.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc index 4012e04f7d2..671ad4c1c4b 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc @@ -12,11 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h" - #include + #include +#include "paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h" + namespace paddle { namespace framework { namespace ir { diff --git a/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.cc index 76a0c883c89..73089df5717 100644 --- a/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.cc @@ -13,7 +13,9 @@ // limitations under the License. 
#include "paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.h" + #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/platform/errors.h" diff --git a/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.h index 70f88104b4b..cf53ecec926 100644 --- a/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" diff --git a/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass_tester.cc index 7b6681ff967..60890336b30 100644 --- a/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass_tester.cc @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.h" #include +#include "paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.h" + namespace paddle { namespace framework { namespace ir { diff --git a/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.cc index 7821501cc4b..06125e51fb6 100644 --- a/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.cc @@ -13,10 +13,12 @@ // limitations under the License. #include "paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.h" + #include #include #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/platform/errors.h" diff --git a/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.h index 546a3d6570b..af58ae2bda4 100644 --- a/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.h @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" diff --git a/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass_tester.cc index 3738e3ebd68..2924401bc2e 100644 --- a/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass_tester.cc @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.h" #include + #include +#include "paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.h" + namespace paddle { namespace framework { namespace ir { diff --git a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc index 63e402cb529..15100b23407 100644 --- a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc @@ -13,7 +13,9 @@ // limitations under the License. 
#include "paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h" + #include + #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -124,10 +126,11 @@ void QuantDequantMkldnnPass::CollectInputScalesFromFake( auto* op_desc = op_node->Op(); const int bit_length = BOOST_GET_CONST(int, op_desc->GetAttr("bit_length")); - PADDLE_ENFORCE_EQ(bit_length, 8, platform::errors::InvalidArgument( - "Unsupported number quantization " - "bits: %d, only 8 is supported now.", - bit_length)); + PADDLE_ENFORCE_EQ(bit_length, 8, + platform::errors::InvalidArgument( + "Unsupported number quantization " + "bits: %d, only 8 is supported now.", + bit_length)); auto x_var_name = op_desc->Input("X")[0]; auto scale_name = op_desc->Input("InScale")[0]; diff --git a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h index a9442f70740..5003e1878bf 100644 --- a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc index 96f575745a3..05b1d419f6f 100644 --- a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc @@ -13,9 +13,11 @@ // limitations under the License. #include "paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h" + #include #include #include + #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/string/pretty_log.h" diff --git a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc index e6886356460..023dd6af7ee 100644 --- a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include + #include "paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h" #include "paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_v2_mkldnn_fuse_pass.h" - -#include #include "paddle/fluid/framework/ir/pass_tester_helper.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_v2_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_v2_mkldnn_fuse_pass.cc index 203966dc682..ed57be12c78 100644 --- a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_v2_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_v2_mkldnn_fuse_pass.cc @@ -13,9 +13,11 @@ // limitations under the License. 
#include "paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_v2_mkldnn_fuse_pass.h" + #include #include #include + #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/string/pretty_log.h" diff --git a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass_tester.cc index 60f844ffc80..09bad959eb0 100644 --- a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass_tester.cc @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.h" #include +#include "paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.h" + namespace paddle { namespace framework { namespace ir { diff --git a/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.cc b/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.cc index bf603dc4bbc..a7e0f3a5834 100644 --- a/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.cc @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.h" + #include -#include "paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.h" #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass_tester.cc index fe42e8f96f8..86775e20aa7 100644 --- a/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass_tester.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include #include "paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.h" diff --git a/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.cc index 82d642264c2..cad92e3153b 100644 --- a/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.h" + #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass_tester.cc index 003a39f37d4..662dfb0f9d4 100644 --- a/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass_tester.cc @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.h" - #include + #include + +#include "paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.h" #include "paddle/fluid/framework/op_proto_maker.h" namespace paddle { @@ -52,43 +53,27 @@ void MainTest(const std::string& activation_type) { } } -TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithTanh) { - MainTest("tanh") -} +TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithTanh){MainTest("tanh")} -TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithRelu) { - MainTest("relu") -} +TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithRelu){MainTest("relu")} -TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithLeakyRelu) { - MainTest("leaky_relu") -} +TEST(FuseSoftplusActivationOneDNNPass, + FuseSoftplusWithLeakyRelu){MainTest("leaky_relu")} -TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithSwish) { - MainTest("swish") -} +TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithSwish){MainTest("swish")} -TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithHardswish) { - MainTest("hardswish") -} +TEST(FuseSoftplusActivationOneDNNPass, + FuseSoftplusWithHardswish){MainTest("hardswish")} -TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithSqrt) { - MainTest("sqrt") -} +TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithSqrt){MainTest("sqrt")} -TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithAbs) { MainTest("abs") } +TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithAbs){MainTest("abs")} -TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithClip) { - MainTest("clip") -} +TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithClip){MainTest("clip")} -TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithGelu) { - MainTest("gelu") -} +TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithGelu){MainTest("gelu")} -TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithRelu6) { - MainTest("relu6") -} +TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithRelu6){MainTest("relu6")} TEST(FuseSoftplusActivationOneDNNPass, FuseSoftplusWithSigmoid) { MainTest("sigmoid") diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc index 06af5eaec13..b849076935a 100644 --- a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc +++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/ir/multi_batch_merge_pass.h" #include + #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/op_proto_maker.h" diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/add_reader_dependency_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/add_reader_dependency_pass.cc index abb1d062c96..b907869b4a3 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/add_reader_dependency_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/add_reader_dependency_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "paddle/fluid/framework/ir/pass.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/fix_op_run_order_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/fix_op_run_order_pass.cc index 772b4c1c915..55b6389768c 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/fix_op_run_order_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/fix_op_run_order_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include + #include "paddle/fluid/framework/details/eager_deletion_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/op_handle_base.h" diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc index 484d09fd444..5189f410e3c 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "paddle/fluid/framework/details/all_reduce_op_handle.h" #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h" diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc index 1b6245928d3..7180c3820c7 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h" + #include #include #include @@ -20,6 +21,7 @@ #include #include #include + #include "paddle/fluid/framework/details/all_reduce_op_handle.h" #include "paddle/fluid/framework/details/broadcast_op_handle.h" #include "paddle/fluid/framework/details/computation_op_handle.h" @@ -495,9 +497,9 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result, "use_dgc=%d, use_grad_merge=%d", is_encoded, is_grad_merge)); - auto append_allreduce_op = [&]( - const std::vector &scopes, - const std::vector &places) -> details::OpHandleBase * { + auto append_allreduce_op = [&](const std::vector &scopes, + const std::vector &places) + -> details::OpHandleBase * { if (is_encoded) { #if defined(PADDLE_WITH_DGC) && \ (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)) @@ -758,13 +760,14 @@ int BalanceVarSSAGraphBuilder::GetOpDeviceID(ir::Node *node) const { "and Parameter@Grad.", node->Name(), OpProtoAndCheckerMaker::OpRoleVarAttrName())); int dev_id = GetVarDeviceID(param_grad[1]); - PADDLE_ENFORCE_NE(dev_id, -1, platform::errors::NotFound( - "Can not find Device ID, for NodeName:%s, " - "NodeType:%s, Param:%s, Param@Grad:%s" - "For this fault, you can consult the " - "Paddle technical personnel for answer ", - node->Name(), node->Op()->Type(), - param_grad[0], param_grad[1])); + PADDLE_ENFORCE_NE( + dev_id, -1, + platform::errors::NotFound("Can not find Device ID, for NodeName:%s, " + "NodeType:%s, Param:%s, Param@Grad:%s" + "For this fault, you can consult the " + "Paddle technical personnel for answer ", + node->Name(), node->Op()->Type(), + param_grad[0], param_grad[1])); return dev_id; } @@ -956,10 +959,11 @@ bool DistSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result, bool insert_op = false; if (OpHaveRole(*node, OpRole::kRPC)) { int op_dev_id = CreateRPCOp(result, node); - PADDLE_ENFORCE_NE(op_dev_id, -1, platform::errors::InvalidArgument( - "Can not schedule the RPC operator to " - "the right place. NodeName:%s.", - node->Name())); + PADDLE_ENFORCE_NE(op_dev_id, -1, + platform::errors::InvalidArgument( + "Can not schedule the RPC operator to " + "the right place. 
NodeName:%s.", + node->Name())); if (node->Op()->Type() == "recv") { auto recv_vars_attr = BOOST_GET_CONST(std::vector, diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h index c76f3001676..75080742077 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h @@ -46,7 +46,7 @@ class NCCLContextMap; class BKCLContextMap; class BKCLCommunicator; #endif -} +} // namespace platform namespace framework { class Scope; diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.cc index 09ef94c0826..c7b6e477fd5 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.h" + #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc index 4a594777805..03d433f4db1 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc @@ -51,11 +51,12 @@ static int BuildFusion(Graph* graph, const std::string& name_scope) { multihead_pattern(); // Create New OpDesc - auto fuse_creater = [&]( - Node* input0, Node* mul0, Node* mul1, Node* mul2, Node* mul0_out, - Node* mul1_out, Node* mul2_out, Node* eltadd0_b, Node* eltadd1_b, - Node* eltadd2_b, Node* eltadd_qk_b, Node* reshape2, - Node* reshape2_qkv_out, Node* scale, Node* scale_out) { + auto fuse_creater = [&](Node* input0, Node* mul0, Node* mul1, Node* mul2, + Node* mul0_out, Node* mul1_out, Node* mul2_out, + Node* eltadd0_b, Node* eltadd1_b, Node* eltadd2_b, + Node* eltadd_qk_b, Node* reshape2, + Node* reshape2_qkv_out, Node* scale, + Node* scale_out) { auto scale_attr = BOOST_GET_CONST(float, scale->Op()->GetAttr("scale")); // auto scale_bias = BOOST_GET_CONST(float, scale->Op()->GetAttr("bias")); // bool after_scale = @@ -756,13 +757,14 @@ int MultiHeadMatmulV2FusePass::BuildFusionV2(Graph* graph, multihead_pattern(); // Create New OpDesc - auto fuse_creater = [&]( - Node* input0, Node* mul0, Node* mul1, Node* mul2, Node* mul0_out, - Node* mul1_out, Node* mul2_out, Node* mul0_w, Node* mul1_w, Node* mul2_w, - Node* eltadd0_b, Node* eltadd1_b, Node* eltadd2_b, Node* eltadd_qk_b, - Node* reshape2, Node* reshape2_qkv_out, Node* scale, Node* scale_out, - Node* softmax_qk, Node* eltadd0, Node* eltadd1, Node* eltadd2, - Node* matmul_qk, Node* reshape2_qkv) { + auto fuse_creater = [&](Node* input0, Node* mul0, Node* mul1, Node* mul2, + Node* mul0_out, Node* mul1_out, Node* mul2_out, + Node* mul0_w, Node* mul1_w, Node* mul2_w, + Node* eltadd0_b, Node* eltadd1_b, Node* eltadd2_b, + Node* eltadd_qk_b, Node* reshape2, + Node* reshape2_qkv_out, Node* scale, Node* scale_out, + Node* softmax_qk, Node* eltadd0, Node* eltadd1, + Node* eltadd2, Node* matmul_qk, Node* reshape2_qkv) { auto scale_attr = BOOST_GET_CONST(float, scale->Op()->GetAttr("scale")); // mul (B * S * Hidden) x (Hidden * 3 * N 
* H) = (B * S * 3 * N * H) @@ -1207,11 +1209,12 @@ int MultiHeadMatmulV3FusePass::BuildFusionV3(Graph* graph, multihead_pattern(); // Create New OpDesc - auto fuse_creater = [&]( - Node* input0, Node* mul0, Node* mul1, Node* mul2, Node* mul0_out, - Node* mul1_out, Node* mul2_out, Node* mul0_w, Node* mul1_w, Node* mul2_w, - Node* eltadd0_b, Node* eltadd1_b, Node* eltadd2_b, Node* eltadd_qk_b, - Node* reshape2, Node* reshape2_qkv_out, Node* matmul_qk) { + auto fuse_creater = [&](Node* input0, Node* mul0, Node* mul1, Node* mul2, + Node* mul0_out, Node* mul1_out, Node* mul2_out, + Node* mul0_w, Node* mul1_w, Node* mul2_w, + Node* eltadd0_b, Node* eltadd1_b, Node* eltadd2_b, + Node* eltadd_qk_b, Node* reshape2, + Node* reshape2_qkv_out, Node* matmul_qk) { auto scale_attr = BOOST_GET_CONST(float, matmul_qk->Op()->GetAttr("alpha")); // mul (B * S * Hidden) x (Hidden * 3 * N * H) = (B * S * 3 * N * H) diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc index b121436ee87..858ebf68b40 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc @@ -9,8 +9,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h" // NOLINT #include + +#include "paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h" // NOLINT #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/framework/ir/node_test.cc b/paddle/fluid/framework/ir/node_test.cc index 9c47df402bd..2d84162e13a 100644 --- a/paddle/fluid/framework/ir/node_test.cc +++ b/paddle/fluid/framework/ir/node_test.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/node.h" + #include "gtest/gtest.h" #include "paddle/fluid/framework/var_desc.h" diff --git a/paddle/fluid/framework/ir/op_compat_sensible_pass.cc b/paddle/fluid/framework/ir/op_compat_sensible_pass.cc index 73a8691f9e2..e309e068563 100644 --- a/paddle/fluid/framework/ir/op_compat_sensible_pass.cc +++ b/paddle/fluid/framework/ir/op_compat_sensible_pass.cc @@ -13,9 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/op_compat_sensible_pass.h" + #include #include #include + #include "paddle/fluid/framework/op_def_api.h" #include "paddle/fluid/framework/op_info.h" diff --git a/paddle/fluid/framework/ir/op_compat_sensible_pass.h b/paddle/fluid/framework/ir/op_compat_sensible_pass.h index e24294a03a2..393a2fb9392 100644 --- a/paddle/fluid/framework/ir/op_compat_sensible_pass.h +++ b/paddle/fluid/framework/ir/op_compat_sensible_pass.h @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include #include + #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/pass.h" diff --git a/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc b/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc index 756d3c2c770..4b106d75f1c 100644 --- a/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc +++ b/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/ir/op_compat_sensible_pass.h" #include "gtest/gtest.h" +#include "paddle/fluid/framework/ir/op_compat_sensible_pass.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/program_desc.h" diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc index 2c10a68188e..85eecbd014e 100644 --- a/paddle/fluid/framework/ir/pass.cc +++ b/paddle/fluid/framework/ir/pass.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/pass.h" #include + #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/op_proto_maker.h" @@ -90,9 +91,10 @@ static void MergePrograms(ProgramDesc *dst, const details::ProgramDescs &srcs, bool reverse = !append; auto create_var_visitor = [dst](const ProgramDesc &src) { - PADDLE_ENFORCE_EQ(src.Size(), 1, platform::errors::Unimplemented( - "MergePrograms can only support to " - "merge program with only one block.")); + PADDLE_ENFORCE_EQ( + src.Size(), 1, + platform::errors::Unimplemented("MergePrograms can only support to " + "merge program with only one block.")); const auto &src_block = src.Block(0); auto *dst_block = dst->MutableBlock(0); for (const auto *src_new_var : src_block.AllVars()) { diff --git a/paddle/fluid/framework/ir/pass_test.cc b/paddle/fluid/framework/ir/pass_test.cc index 616ba7f1a97..8c368a796ed 100644 --- a/paddle/fluid/framework/ir/pass_test.cc +++ b/paddle/fluid/framework/ir/pass_test.cc @@ -84,8 +84,9 @@ TEST(PassTest, TestPassAttrCheck) { } catch (paddle::platform::EnforceNotMet& e) { exception = std::string(e.what()); } - std::string msg = "Invalid type for attritube test_pass_attr, expected: " + - try_type + ", actual: int"; + std::string msg = + "Invalid type for attritube test_pass_attr, expected: " + try_type + + ", actual: int"; ASSERT_TRUE(exception.find(msg) != exception.npos); } @@ -168,8 +169,9 @@ TEST(PassTest, TestPassAttrCheckConvertAllBlocks) { } catch (paddle::platform::EnforceNotMet& e) { exception = std::string(e.what()); } - std::string msg = "Invalid type for attritube test_pass_attr, expected: " + - try_type + ", actual: int"; + std::string msg = + "Invalid type for attritube test_pass_attr, expected: " + try_type + + ", actual: int"; ASSERT_TRUE(exception.find(msg) != exception.npos); } diff --git a/paddle/fluid/framework/ir/pass_test_util.cc b/paddle/fluid/framework/ir/pass_test_util.cc index 4d8965918f8..40dcb3cf1db 100644 --- a/paddle/fluid/framework/ir/pass_test_util.cc +++ b/paddle/fluid/framework/ir/pass_test_util.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "paddle/fluid/framework/ir/pass_test_util.h" + #include #include #include @@ -23,7 +25,6 @@ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/ir/pass.h" -#include "paddle/fluid/framework/ir/pass_test_util.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/op_proto_maker.h" diff --git a/paddle/fluid/framework/ir/pass_tester_helper.h b/paddle/fluid/framework/ir/pass_tester_helper.h index acefde9df68..ad58e4e4a0c 100644 --- a/paddle/fluid/framework/ir/pass_tester_helper.h +++ b/paddle/fluid/framework/ir/pass_tester_helper.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/framework/ir/placement_pass_base.cc b/paddle/fluid/framework/ir/placement_pass_base.cc index 35ba9200607..fd1b54f8c4d 100644 --- a/paddle/fluid/framework/ir/placement_pass_base.cc +++ b/paddle/fluid/framework/ir/placement_pass_base.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/placement_pass_base.h" + #include + #include "paddle/fluid/framework/operator.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc index 6c06b741adb..80e6c2b7967 100644 --- a/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc @@ -43,8 +43,8 @@ struct PrelnSkipLayerNorm : public PatternBase { PATTERN_DECL_NODE(layer_norm); // declare variable node's name PATTERN_DECL_NODE( - elementwise_out); // (elementwise_input_x,elementwise_input_y) -> - // elementwise_out + elementwise_out); // (elementwise_input_x,elementwise_input_y) + // -> elementwise_out PATTERN_DECL_NODE(layer_norm_bias); PATTERN_DECL_NODE(layer_norm_scale); PATTERN_DECL_NODE(layer_norm_out); diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc index a03a6f5b2c7..a2dd846ba52 100644 --- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h" + #include #include "paddle/fluid/framework/op_version_registry.h" @@ -145,9 +146,9 @@ void BuildRepeatedFCReluPattern(PDPattern* pattern, return x->outputs[fc_idx]->outputs[0]; }; - auto var_next_is_fc_act_repeated_n_times = [=]( - Node* x, int repeated_times, const std::string& act_type = "relu", - bool check_in_has_only_one_out = true) -> bool { + auto var_next_is_fc_act_repeated_n_times = + [=](Node* x, int repeated_times, const std::string& act_type = "relu", + bool check_in_has_only_one_out = true) -> bool { for (int i = 0; i < repeated_times; ++i) { if (!var_next_is_fc_act(x, act_type, i == 0 && check_in_has_only_one_out)) { @@ -191,9 +192,9 @@ void BuildRepeatedFCReluPattern(PDPattern* pattern, return nullptr; }; - auto var_before_is_fc_act_repeated_n_times = [=]( - Node* x, int repeated_times, - const std::string& act_type = "relu") -> bool { + auto var_before_is_fc_act_repeated_n_times = [=](Node* x, int repeated_times, + const std::string& act_type = + "relu") -> bool { for (int i = 0; i < repeated_times; ++i) { if (!var_before_is_fc_act(x, act_type, i == repeated_times - 1)) { return false; diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass_tester.cc b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass_tester.cc index f0ff77acf9f..3112b776ae5 100644 --- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass_tester.cc @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h" - #include + #include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/runtime_context_cache_pass.cc b/paddle/fluid/framework/ir/runtime_context_cache_pass.cc index 778e658354f..451e41e767d 100644 --- a/paddle/fluid/framework/ir/runtime_context_cache_pass.cc +++ b/paddle/fluid/framework/ir/runtime_context_cache_pass.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/runtime_context_cache_pass.h" + #include "paddle/fluid/framework/operator.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc index 9fa951920f4..2c0b142c98f 100644 --- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h" + #include #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc index 2b084bd5734..052b0a4bdc1 100644 --- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc @@ -44,8 +44,8 @@ PDNode* BuildSeqPoolConcatPattern(PDPattern* pattern, is_concat_op_with_inputs(x->outputs[0], num_inputs); }; - auto is_seqpool_op_with_pootype_of_nth_input_of_concat = [=]( - Node* x, const std::string& type, int idx) -> bool { + auto is_seqpool_op_with_pootype_of_nth_input_of_concat = + [=](Node* x, const std::string& type, int idx) -> bool { bool this_is_seqpool_op = x && x->IsOp() && x->Op()->Type() == "sequence_pool" && x->Op()->HasAttr("pooltype") && diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc index d3668038518..e56ba9ad1e7 100644 --- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc @@ -12,8 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h" #include + +#include "paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h" #include "paddle/fluid/framework/op_proto_maker.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc index 7200e0ac1d4..916adbbe337 100644 --- a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc @@ -44,11 +44,11 @@ static void GetConcatNodes(ir::Graph* graph, std::vector* concat_nodes) { GraphPatternDetector gpd; auto* pattern = gpd.mutable_pattern(); auto concat_op_node = BuildCVMConcatPattern(pattern); - GraphPatternDetector::handle_t handler = [&]( - const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { - Node* concat_op = subgraph.at(concat_op_node); - concat_nodes->push_back(concat_op); - }; + GraphPatternDetector::handle_t handler = + [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { + Node* concat_op = subgraph.at(concat_op_node); + concat_nodes->push_back(concat_op); + }; gpd(graph, handler); } } // anonymous namespace @@ -148,19 +148,19 @@ void SeqPoolCVMConcatFusePass::ApplyImpl(ir::Graph* graph) const { Node* cvm_input_of_cvm; Node* concat_out_var = concat_node->outputs[0]; - GraphPatternDetector::handle_t handler = [&]( - const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { - Node* seqpool_in_var = subgraph.at(seqpool_in_var_node); - Node* seqpool_op = subgraph.at(seqpool_op_node); - Node* seqpool_out_var = subgraph.at(seqpool_out_var_node); - Node* seqpool_idx_out_var = subgraph.at(seqpool_idx_out_var_node); - Node* cvm_op = subgraph.at(cvm_op_node); - Node* cvm_out_var = subgraph.at(cvm_out_var_node); - cvm_input_of_cvm = subgraph.at(cvm_cvm_in_var_node); - marked_nodes.insert({seqpool_op, seqpool_out_var, seqpool_idx_out_var, - cvm_op, cvm_out_var, concat_node}); - ins_to_concat[cvm_out_var->Name()] = seqpool_in_var; - }; + GraphPatternDetector::handle_t handler = + [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { + Node* seqpool_in_var = subgraph.at(seqpool_in_var_node); + Node* seqpool_op = 
subgraph.at(seqpool_op_node); + Node* seqpool_out_var = subgraph.at(seqpool_out_var_node); + Node* seqpool_idx_out_var = subgraph.at(seqpool_idx_out_var_node); + Node* cvm_op = subgraph.at(cvm_op_node); + Node* cvm_out_var = subgraph.at(cvm_out_var_node); + cvm_input_of_cvm = subgraph.at(cvm_cvm_in_var_node); + marked_nodes.insert({seqpool_op, seqpool_out_var, seqpool_idx_out_var, + cvm_op, cvm_out_var, concat_node}); + ins_to_concat[cvm_out_var->Name()] = seqpool_in_var; + }; gpd(graph, handler); if (!ins_to_concat.empty()) { diff --git a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass_tester.cc b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass_tester.cc index bba640cf148..8d8ebc955d3 100644 --- a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass_tester.cc @@ -12,8 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h" #include + +#include "paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h" #include "paddle/fluid/framework/op_proto_maker.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc index bcd7bedcc43..9007105950b 100644 --- a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc +++ b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/ir/shuffle_channel_detect_pass.h" + #include -#include "paddle/fluid/framework/ir/shuffle_channel_detect_pass.h" #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass_tester.cc b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass_tester.cc index 80f387c4427..908797163d2 100644 --- a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass_tester.cc +++ b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass_tester.cc @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h" - #include + #include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc index bfa14d9296b..6bebe8de9f2 100644 --- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc @@ -43,8 +43,8 @@ struct SkipLayerNorm : public PatternBase { PATTERN_DECL_NODE(layer_norm); // declare variable node's name PATTERN_DECL_NODE( - elementwise_out); // (elementwise_input_x,elementwise_input_y) -> - // elementwise_out + elementwise_out); // (elementwise_input_x,elementwise_input_y) + // -> elementwise_out PATTERN_DECL_NODE(layer_norm_bias); PATTERN_DECL_NODE(layer_norm_scale); PATTERN_DECL_NODE(layer_norm_out); diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc index 29be2c3cb09..c95fd0abd52 100644 --- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/ir/skip_layernorm_fuse_pass.h" - #include + #include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/framework/ir/skip_layernorm_fuse_pass.h" #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc index 7c43b022182..a8c7150d6e3 100644 --- a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc +++ b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc @@ -170,8 +170,9 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern, auto* matmul_xy_op = pattern->NewNode( [=](Node* x) { - return x && x->IsOp() && (x->Op()->Type() == "matmul_v2" || - x->Op()->Type() == "matmul") && + return x && x->IsOp() && + (x->Op()->Type() == "matmul_v2" || + x->Op()->Type() == "matmul") && is_fusion_first_mul_out(x->outputs[0]); }, name_scope + "/matmul_xy_op"); @@ -212,8 +213,9 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern, auto* matmul_squared_x_y_op = pattern->NewNode( [=](Node* x) { - return x && x->IsOp() && (x->Op()->Type() == "matmul_v2" || - x->Op()->Type() == "matmul") && + return x && x->IsOp() && + (x->Op()->Type() == "matmul_v2" || + x->Op()->Type() == "matmul") && is_fusion_mat_squared_x_y_op_out(x->outputs[0]); }, name_scope + "/matmul_squared_x_y_op"); diff --git a/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc b/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc index 94fb6850641..78dafaa1e2f 100644 --- a/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc +++ b/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include + #include #include "paddle/fluid/framework/ir/pass.h" diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc index bda6b903864..6802310383d 100644 --- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc +++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h" + #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/trt_map_matmul_to_mul_pass.cc b/paddle/fluid/framework/ir/trt_map_matmul_to_mul_pass.cc index d3211c08414..a6e3780fd22 100644 --- a/paddle/fluid/framework/ir/trt_map_matmul_to_mul_pass.cc +++ b/paddle/fluid/framework/ir/trt_map_matmul_to_mul_pass.cc @@ -16,9 +16,9 @@ #include #include + #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/op_proto_maker.h" - #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc index 798a038f767..2e3e957fd15 100644 --- a/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc @@ -51,11 +51,12 @@ static int BuildFusion(Graph* graph, const std::string& name_scope) { multihead_pattern(); // Create New OpDesc - auto fuse_creater = [&]( - Node* input0, Node* mul0, Node* mul1, Node* mul2, Node* mul0_out, - Node* mul1_out, Node* mul2_out, Node* eltadd0_b, Node* eltadd1_b, - Node* eltadd2_b, Node* eltadd_qk_b, Node* reshape2, - Node* reshape2_qkv_out, Node* scale, Node* scale_out) { + auto fuse_creater = [&](Node* input0, Node* mul0, Node* mul1, Node* mul2, + Node* mul0_out, Node* mul1_out, Node* mul2_out, + Node* eltadd0_b, Node* eltadd1_b, Node* eltadd2_b, + Node* eltadd_qk_b, Node* reshape2, + Node* reshape2_qkv_out, Node* scale, + Node* scale_out) { auto scale_attr = BOOST_GET_CONST(float, scale->Op()->GetAttr("scale")); // auto scale_bias = BOOST_GET_CONST(float, scale->Op()->GetAttr("bias")); // bool after_scale = @@ -756,13 +757,14 @@ int TrtMultiHeadMatmulV2FusePass::BuildFusionV2(Graph* graph, multihead_pattern(); // Create New OpDesc - auto fuse_creater = [&]( - Node* input0, Node* mul0, Node* mul1, Node* mul2, Node* mul0_out, - Node* mul1_out, Node* mul2_out, Node* mul0_w, Node* mul1_w, Node* mul2_w, - Node* eltadd0_b, Node* eltadd1_b, Node* eltadd2_b, Node* eltadd_qk_b, - Node* reshape2, Node* reshape2_qkv_out, Node* scale, Node* scale_out, - Node* softmax_qk, Node* eltadd0, Node* eltadd1, Node* eltadd2, - Node* matmul_qk, Node* reshape2_qkv) { + auto fuse_creater = [&](Node* input0, Node* mul0, Node* mul1, Node* mul2, + Node* mul0_out, Node* mul1_out, Node* mul2_out, + Node* mul0_w, Node* mul1_w, Node* mul2_w, + Node* eltadd0_b, Node* eltadd1_b, Node* eltadd2_b, + Node* eltadd_qk_b, Node* reshape2, + Node* reshape2_qkv_out, Node* scale, Node* scale_out, + Node* softmax_qk, Node* eltadd0, Node* eltadd1, + Node* eltadd2, Node* matmul_qk, Node* reshape2_qkv) { auto scale_attr = BOOST_GET_CONST(float, scale->Op()->GetAttr("scale")); // mul (B * S * Hidden) x (Hidden * 3 * N * H) = (B * S * 3 * N * H) @@ -1229,11 +1231,12 @@ int TrtMultiHeadMatmulV3FusePass::BuildFusionV3(Graph* graph, multihead_pattern(); // Create New OpDesc - auto fuse_creater = [&]( - 
Node* input0, Node* mul0, Node* mul1, Node* mul2, Node* mul0_out, - Node* mul1_out, Node* mul2_out, Node* mul0_w, Node* mul1_w, Node* mul2_w, - Node* eltadd0_b, Node* eltadd1_b, Node* eltadd2_b, Node* eltadd_qk_b, - Node* reshape2, Node* reshape2_qkv_out, Node* matmul_qk) { + auto fuse_creater = [&](Node* input0, Node* mul0, Node* mul1, Node* mul2, + Node* mul0_out, Node* mul1_out, Node* mul2_out, + Node* mul0_w, Node* mul1_w, Node* mul2_w, + Node* eltadd0_b, Node* eltadd1_b, Node* eltadd2_b, + Node* eltadd_qk_b, Node* reshape2, + Node* reshape2_qkv_out, Node* matmul_qk) { auto scale_attr = BOOST_GET_CONST(float, matmul_qk->Op()->GetAttr("alpha")); // mul (B * S * Hidden) x (Hidden * 3 * N * H) = (B * S * 3 * N * H) diff --git a/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc index 53452d4239a..13883909435 100644 --- a/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc @@ -43,8 +43,8 @@ struct TrtSkipLayerNorm : public PatternBase { PATTERN_DECL_NODE(layer_norm); // declare variable node's name PATTERN_DECL_NODE( - elementwise_out); // (elementwise_input_x,elementwise_input_y) -> - // elementwise_out + elementwise_out); // (elementwise_input_x,elementwise_input_y) + // -> elementwise_out PATTERN_DECL_NODE(layer_norm_bias); PATTERN_DECL_NODE(layer_norm_scale); PATTERN_DECL_NODE(layer_norm_out); diff --git a/paddle/fluid/framework/ir/unsqueeze2_eltwise_fuse_pass_tester.cc b/paddle/fluid/framework/ir/unsqueeze2_eltwise_fuse_pass_tester.cc index 067a37c611a..3ebd61ff575 100644 --- a/paddle/fluid/framework/ir/unsqueeze2_eltwise_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/unsqueeze2_eltwise_fuse_pass_tester.cc @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/ir/unsqueeze2_eltwise_fuse_pass.h" - #include + #include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/framework/ir/unsqueeze2_eltwise_fuse_pass.h" #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/yolo_box_fuse_pass.cc b/paddle/fluid/framework/ir/yolo_box_fuse_pass.cc index 20075a49749..19836b69ae9 100644 --- a/paddle/fluid/framework/ir/yolo_box_fuse_pass.cc +++ b/paddle/fluid/framework/ir/yolo_box_fuse_pass.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/yolo_box_fuse_pass.h" + #include + #include "glog/logging.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/pass.h" diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h index 1c5c12b3d57..dd316a0979c 100644 --- a/paddle/fluid/framework/lod_tensor.h +++ b/paddle/fluid/framework/lod_tensor.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include #include #include diff --git a/paddle/fluid/framework/lod_tensor_array.h b/paddle/fluid/framework/lod_tensor_array.h index 36a5c3c5d60..7aa180ed75c 100644 --- a/paddle/fluid/framework/lod_tensor_array.h +++ b/paddle/fluid/framework/lod_tensor_array.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once #include + #include "paddle/fluid/framework/lod_tensor.h" namespace paddle { diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc index a89baac3e7a..254e70231ea 100644 --- a/paddle/fluid/framework/lod_tensor_test.cc +++ b/paddle/fluid/framework/lod_tensor_test.cc @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/lod_tensor.h" + #include #include -#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/phi/core/lod_utils.h" namespace paddle { diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index dba3b3ff1e6..1c2740c2b2e 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/framework/naive_executor.h" + #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/platform/denormal.h" diff --git a/paddle/fluid/framework/naive_executor_test.cc b/paddle/fluid/framework/naive_executor_test.cc index 2f3c3f3d06e..763e314d226 100644 --- a/paddle/fluid/framework/naive_executor_test.cc +++ b/paddle/fluid/framework/naive_executor_test.cc @@ -13,8 +13,11 @@ // limitations under the License. #include "paddle/fluid/framework/naive_executor.h" + #include + #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/program_desc.h" diff --git a/paddle/fluid/framework/new_executor/data_transfer.cc b/paddle/fluid/framework/new_executor/data_transfer.cc index d0e5565139c..171e15162fb 100644 --- a/paddle/fluid/framework/new_executor/data_transfer.cc +++ b/paddle/fluid/framework/new_executor/data_transfer.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/new_executor/data_transfer.h" + #include "paddle/fluid/framework/convert_utils.h" namespace paddle { @@ -276,9 +277,9 @@ std::shared_ptr TransferDevice(const std::string& var_name, // 2. Construct VariableNameMap VariableNameMap in_name_map = {{"X", {var_name}}}; VariableNameMap out_name_map = {{"Out", {*new_var_name}}}; - int dst_place_type = platform::is_cpu_place(dst_place) - ? 0 - : platform::is_gpu_place(dst_place) ? 1 : -1; + int dst_place_type = platform::is_cpu_place(dst_place) ? 0 + : platform::is_gpu_place(dst_place) ? 1 + : -1; AttributeMap attr_map = {{"dst_place_type", dst_place_type}}; // 3. Create memcpy_d2h_op or memcpy_h2d_op diff --git a/paddle/fluid/framework/new_executor/event_manager.cc b/paddle/fluid/framework/new_executor/event_manager.cc index bca2264b66a..0bfa00494d6 100644 --- a/paddle/fluid/framework/new_executor/event_manager.cc +++ b/paddle/fluid/framework/new_executor/event_manager.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/new_executor/event_manager.h" + #include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { diff --git a/paddle/fluid/framework/new_executor/executor_statistics.cc b/paddle/fluid/framework/new_executor/executor_statistics.cc index fb79712d47d..f6afcf2f24d 100644 --- a/paddle/fluid/framework/new_executor/executor_statistics.cc +++ b/paddle/fluid/framework/new_executor/executor_statistics.cc @@ -13,6 +13,7 @@ // limitations under the License. 
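The data_transfer.cc hunk above illustrates the new treatment of chained conditional operators: each `?`/`:` arm is aligned under the condition instead of nesting further to the right. A compilable sketch of the same shape, with plain booleans standing in for platform::is_cpu_place / platform::is_gpu_place:

// Sketch of the chained ?: alignment from the data_transfer.cc hunk.
int dst_place_type(bool is_cpu, bool is_gpu) {
  return is_cpu   ? 0
         : is_gpu ? 1
                  : -1;  // -1: neither CPU nor GPU
}

int main() { return dst_place_type(true, false) == 0 ? 0 : 1; }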
#include "paddle/fluid/framework/new_executor/executor_statistics.h" + #include #include #include @@ -21,6 +22,7 @@ #include #include #include + #include "glog/logging.h" #include "paddle/fluid/platform/flags.h" #include "paddle/fluid/platform/os_info.h" @@ -520,7 +522,7 @@ void StatisticsEngine::MergeEvents(std::function merger, int StatisticsEngine::MergeInnerthreadEvents( std::vector>* all_evts) { - auto merger = [& priorities = priorities_](size_t idx1, size_t idx2) { + auto merger = [&priorities = priorities_](size_t idx1, size_t idx2) { return priorities[idx1].innerthread_priority <= priorities[idx2].innerthread_priority ? idx1 @@ -541,7 +543,7 @@ int StatisticsEngine::MergeInnerthreadEvents( int StatisticsEngine::MergeInterthreadEvents( std::vector>* all_evts) { - auto merger = [& priorities = priorities_](size_t idx1, size_t idx2) { + auto merger = [&priorities = priorities_](size_t idx1, size_t idx2) { return priorities[idx1].interthread_priority <= priorities[idx2].interthread_priority ? idx1 diff --git a/paddle/fluid/framework/new_executor/executor_statistics.h b/paddle/fluid/framework/new_executor/executor_statistics.h index 530e9455968..ebe9d3a2e79 100644 --- a/paddle/fluid/framework/new_executor/executor_statistics.h +++ b/paddle/fluid/framework/new_executor/executor_statistics.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/platform/profiler/event_node.h" namespace paddle { diff --git a/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc b/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc index 46c85a22dc3..1ae9f4223d3 100644 --- a/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc +++ b/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc @@ -110,7 +110,7 @@ void InterpreterCoreEventGarbageCollector::Free( const platform::DeviceContext* ctx) { event->Record(ctx); event->SetFininshed(); // Only for CPU Event - queue_->AddTask([ container = garbages, event = event ]() { + queue_->AddTask([container = garbages, event = event]() { while (!event->Query()) { #if defined(_WIN32) SleepEx(50, FALSE); @@ -128,7 +128,7 @@ void InterpreterCoreEventGarbageCollector::Free( const platform::DeviceContext* ctx) { event->Record(ctx); event->SetFininshed(); // Only for CPU Event - queue_->AddTask([ container = garbage, event = event ]() { + queue_->AddTask([container = garbage, event = event]() { while (!event->Query()) { #if defined(_WIN32) SleepEx(50, FALSE); diff --git a/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.h b/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.h index 33954713d4e..57963269663 100644 --- a/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.h +++ b/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.h @@ -14,6 +14,7 @@ #pragma once #include + #include "paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h" #include "paddle/fluid/framework/new_executor/workqueue/workqueue.h" diff --git a/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc b/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc index a20cd275398..8e849c79bd2 100644 --- a/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc +++ b/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h" + #include "paddle/fluid/framework/garbage_collector.h" namespace paddle { diff --git a/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h b/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h index 34f95eee731..d0159c0ca83 100644 --- a/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h +++ b/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h @@ -14,6 +14,7 @@ #pragma once #include + #include "paddle/fluid/memory/allocation/spin_lock.h" #include "paddle/fluid/platform/device_event.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index da2fd0c8c61..fe0c7fe0721 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/framework/new_executor/interpretercore.h" + #include + #include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h" #include "paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.h" @@ -585,10 +587,12 @@ void InterpreterCore::ExecuteInstructionList( for (size_t i = 0; i < dependecy_count_.size(); ++i) { if (dependecy_count_[i] == 0) { - async_work_queue_->AddTask(vec_instr.at(i).KernelType(), [ - this, i, atomic_deps = atomic_deps.get(), - atomic_var_ref = atomic_var_ref.get() - ] { RunInstructionAsync(i, atomic_deps, atomic_var_ref); }); + async_work_queue_->AddTask(vec_instr.at(i).KernelType(), + [this, i, atomic_deps = atomic_deps.get(), + atomic_var_ref = atomic_var_ref.get()] { + RunInstructionAsync(i, atomic_deps, + atomic_var_ref); + }); } } @@ -692,10 +696,10 @@ void InterpreterCore::RunInstructionAsync( ready_ops.pop(); auto& instr_node = vec_instruction_.at(instr_id); VLOG(5) << __func__ << " OP id:" << instr_node.Id() - << " name:" << instr_node.OpBase()->Type() - << " type:" << (instr_node.KernelType() == OpFuncType::kQueueSync - ? "kQueueSync" - : "kQueueAsync") + << " name:" << instr_node.OpBase()->Type() << " type:" + << (instr_node.KernelType() == OpFuncType::kQueueSync + ? "kQueueSync" + : "kQueueAsync") << " runs on " << platform::GetCurrentThreadName(); auto* op = instr_node.OpBase(); diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc index f601a4ad28b..0b75964b94e 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/fluid/framework/new_executor/interpretercore_util.h" + #include #include "paddle/fluid/framework/executor_gc_helper.h" @@ -398,9 +399,10 @@ void build_op_func_list(const platform::Place& place, // But some OPs do have such behavior (e.g., cinn_launch OP). Here special // treatment for them. 
if (op_with_kernel->Type() == "cinn_launch") { - VLOG(6) << "OP(" << op_with_kernel->Type() << ") use scope in kernel, " - "so pass a real scope to " - "ExecutionContext"; + VLOG(6) << "OP(" << op_with_kernel->Type() + << ") use scope in kernel, " + "so pass a real scope to " + "ExecutionContext"; runtime_scope = local_scope; } @@ -747,8 +749,9 @@ std::map> get_downstream_map( std::map> build_op_downstream_map( const std::vector& vec_instruction, std::vector>* op_happens_before) { - auto var2min_rw_op = std::map< - int, std::list>(); // # map from variable id to read / write op id. + auto var2min_rw_op = + std::map>(); // # map from variable id to read / + // write op id. auto var2recent_write_op = std::map(); // # map from variable to recent write op. auto op2dependences = @@ -825,8 +828,14 @@ std::map> build_op_downstream_map( // add dependences for random op, make sure that the random op is scheduled // sequentially const std::set random_op_set = { - "bernoulli", "poisson", "multinomial", "gaussian_random", - "truncated_gaussian_random", "uniform_random", "randint", "randperm", + "bernoulli", + "poisson", + "multinomial", + "gaussian_random", + "truncated_gaussian_random", + "uniform_random", + "randint", + "randperm", "exponential", "sampling_id" "dropout", @@ -846,7 +855,10 @@ std::map> build_op_downstream_map( // add dependency for communication op auto is_comm_op = [](std::string op) -> bool { const std::set special_comm_op_set = { - "send", "recv", "send_v2", "recv_v2", + "send", + "recv", + "send_v2", + "recv_v2", }; const std::string communication_op_prefix = "c_"; if (op.find(communication_op_prefix) != std::string::npos || diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.h b/paddle/fluid/framework/new_executor/interpretercore_util.h index 60ac3702f4b..3d5b067c187 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.h +++ b/paddle/fluid/framework/new_executor/interpretercore_util.h @@ -22,10 +22,9 @@ #include #include -#include - #include #include +#include #include #include diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc index c75a7871d63..1a4dd2edf27 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.cc +++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc @@ -12,12 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "paddle/fluid/framework/new_executor/new_executor_defs.h" + #include #include #include #include -#include "paddle/fluid/framework/new_executor/new_executor_defs.h" #include "paddle/phi/core/utils/rw_lock.h" // When in inference scenario, the scopes will not be written by two threads in @@ -385,10 +386,11 @@ InterpretercoreInferShapeContext::GetOutputsVarType( void InterpretercoreInferShapeContext::SetOutputDim(const std::string& name, const DDim& dim) { auto& vars = OutputVars(name); - PADDLE_ENFORCE_EQ(vars.size(), 1UL, platform::errors::InvalidArgument( - "Output(%s) should hold one element, " - "but now it holds %zu elements.", - name, vars.size())); + PADDLE_ENFORCE_EQ( + vars.size(), 1UL, + platform::errors::InvalidArgument("Output(%s) should hold one element, " + "but now it holds %zu elements.", + name, vars.size())); SetDim(vars[0], dim); } @@ -653,8 +655,9 @@ void VariableScope::CheckExist(int id) const { } void VariableScope::CheckExist(const std::string& name) const { - PADDLE_ENFORCE_EQ(HasVar(name), true, platform::errors::NotFound( - "%s not in VariableScope.", name)); + PADDLE_ENFORCE_EQ( + HasVar(name), true, + platform::errors::NotFound("%s not in VariableScope.", name)); } void VariableScope::ClearListener() { @@ -709,8 +712,9 @@ void VariableScopeListener::onClear() {} Instruction::Instruction(size_t id, OpFuncNode&& op_func_node, const platform::DeviceContext& dev_ctx) : id_(id), op_func_node_(op_func_node), dev_ctx_(dev_ctx) { - PADDLE_ENFORCE_GE(id, 0, platform::errors::PreconditionNotMet( - "Required id >= 0, but received id = %d", id)); + PADDLE_ENFORCE_GE(id, 0, + platform::errors::PreconditionNotMet( + "Required id >= 0, but received id = %d", id)); } size_t Instruction::Id() const { return id_; } diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 31315df5701..64332d7fc90 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/fluid/framework/new_executor/standalone_executor.h" + #include "paddle/fluid/framework/new_executor/interpretercore_util.h" #include "paddle/fluid/platform/profiler/event_tracing.h" diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index 23bd777fae1..60d59899549 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include #include #include diff --git a/paddle/fluid/framework/new_executor/stream_analyzer.cc b/paddle/fluid/framework/new_executor/stream_analyzer.cc index fdcd19b0309..6c689c8548b 100644 --- a/paddle/fluid/framework/new_executor/stream_analyzer.cc +++ b/paddle/fluid/framework/new_executor/stream_analyzer.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/framework/new_executor/stream_analyzer.h" + #include namespace paddle { diff --git a/paddle/fluid/framework/new_executor/stream_analyzer.h b/paddle/fluid/framework/new_executor/stream_analyzer.h index 2a276c6f509..8a6552c6883 100644 --- a/paddle/fluid/framework/new_executor/stream_analyzer.h +++ b/paddle/fluid/framework/new_executor/stream_analyzer.h @@ -15,6 +15,7 @@ #pragma once #include #include + #include "paddle/fluid/framework/new_executor/new_executor_defs.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_event.h" diff --git a/paddle/fluid/framework/new_executor/workqueue/event_count.h b/paddle/fluid/framework/new_executor/workqueue/event_count.h index 7a826c39907..7c20e12ff1f 100644 --- a/paddle/fluid/framework/new_executor/workqueue/event_count.h +++ b/paddle/fluid/framework/new_executor/workqueue/event_count.h @@ -54,6 +54,7 @@ #include #include #include + #include "glog/logging.h" namespace paddle { diff --git a/paddle/fluid/framework/new_executor/workqueue/events_waiter.cc b/paddle/fluid/framework/new_executor/workqueue/events_waiter.cc index 346e20d811e..dbe609427ad 100644 --- a/paddle/fluid/framework/new_executor/workqueue/events_waiter.cc +++ b/paddle/fluid/framework/new_executor/workqueue/events_waiter.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/framework/new_executor/workqueue/events_waiter.h" + #include + #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/framework/new_executor/workqueue/events_waiter.h b/paddle/fluid/framework/new_executor/workqueue/events_waiter.h index 9d85f4a2724..9284ffa853a 100644 --- a/paddle/fluid/framework/new_executor/workqueue/events_waiter.h +++ b/paddle/fluid/framework/new_executor/workqueue/events_waiter.h @@ -20,6 +20,7 @@ #include #include #include + #include "paddle/fluid/framework/new_executor/workqueue/event_count.h" #include "paddle/fluid/memory/allocation/spin_lock.h" diff --git a/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h index 559eb6a7490..20aebfba8e8 100644 --- a/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h +++ b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h @@ -12,6 +12,7 @@ #include #include #include + #include "glog/logging.h" #include "paddle/fluid/framework/new_executor/workqueue/event_count.h" #include "paddle/fluid/framework/new_executor/workqueue/run_queue.h" diff --git a/paddle/fluid/framework/new_executor/workqueue/run_queue.h b/paddle/fluid/framework/new_executor/workqueue/run_queue.h index 2fc42cf308a..7644425a484 100644 --- a/paddle/fluid/framework/new_executor/workqueue/run_queue.h +++ b/paddle/fluid/framework/new_executor/workqueue/run_queue.h @@ -42,6 +42,7 @@ #include #include #include + #include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h" #include "paddle/fluid/memory/allocation/spin_lock.h" @@ -76,9 +77,8 @@ class RunQueue { unsigned front = front_.load(std::memory_order_relaxed); Elem* e = &array_[front & kMask]; uint8_t s = e->state.load(std::memory_order_relaxed); - if (s != kEmpty || - !e->state.compare_exchange_strong(s, kBusy, - std::memory_order_acquire)) { + if (s != kEmpty || !e->state.compare_exchange_strong( + s, kBusy, std::memory_order_acquire)) { return w; } front_.store(front + 1 + (kSize << 1), std::memory_order_relaxed); @@ -93,9 +93,8 @@ class RunQueue { unsigned front = 
front_.load(std::memory_order_relaxed); Elem* e = &array_[(front - 1) & kMask]; uint8_t s = e->state.load(std::memory_order_relaxed); - if (s != kReady || - !e->state.compare_exchange_strong(s, kBusy, - std::memory_order_acquire)) { + if (s != kReady || !e->state.compare_exchange_strong( + s, kBusy, std::memory_order_acquire)) { return Work(); } Work w = std::move(e->w); @@ -112,9 +111,8 @@ class RunQueue { unsigned back = back_.load(std::memory_order_relaxed); Elem* e = &array_[(back - 1) & kMask]; uint8_t s = e->state.load(std::memory_order_relaxed); - if (s != kEmpty || - !e->state.compare_exchange_strong(s, kBusy, - std::memory_order_acquire)) { + if (s != kEmpty || !e->state.compare_exchange_strong( + s, kBusy, std::memory_order_acquire)) { return w; } back = ((back - 1) & kMask2) | (back & ~kMask2); @@ -134,9 +132,8 @@ class RunQueue { unsigned back = back_.load(std::memory_order_relaxed); Elem* e = &array_[back & kMask]; uint8_t s = e->state.load(std::memory_order_relaxed); - if (s != kReady || - !e->state.compare_exchange_strong(s, kBusy, - std::memory_order_acquire)) { + if (s != kReady || !e->state.compare_exchange_strong( + s, kBusy, std::memory_order_acquire)) { return Work(); } Work w = std::move(e->w); @@ -163,9 +160,8 @@ class RunQueue { Elem* e = &array_[mid & kMask]; uint8_t s = e->state.load(std::memory_order_relaxed); if (n == 0) { - if (s != kReady || - !e->state.compare_exchange_strong(s, kBusy, - std::memory_order_acquire)) + if (s != kReady || !e->state.compare_exchange_strong( + s, kBusy, std::memory_order_acquire)) continue; start = mid; } else { diff --git a/paddle/fluid/framework/new_executor/workqueue/workqueue.cc b/paddle/fluid/framework/new_executor/workqueue/workqueue.cc index 0f0de8ef9b0..b06c540b756 100644 --- a/paddle/fluid/framework/new_executor/workqueue/workqueue.cc +++ b/paddle/fluid/framework/new_executor/workqueue/workqueue.cc @@ -5,6 +5,7 @@ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
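The run_queue.h hunks reflow the slot-state CAS so the call breaks inside the argument list rather than before it. Behaviorally it is the usual claim-a-slot pattern: read the state, then proceed only if a compare-exchange moves it from the expected value to kBusy. A reduced, runnable sketch of that pattern on a single slot:

#include <atomic>
#include <cassert>
#include <cstdint>

enum : uint8_t { kEmpty = 0, kBusy = 1, kReady = 2 };

// Try to claim an empty slot, as in RunQueue::PushFront: fail fast if the
// observed state is wrong, or if another thread wins the compare-exchange.
bool try_claim(std::atomic<uint8_t>& state) {
  uint8_t s = state.load(std::memory_order_relaxed);
  if (s != kEmpty ||
      !state.compare_exchange_strong(s, kBusy, std::memory_order_acquire)) {
    return false;
  }
  return true;  // we own the slot; publish kReady once writing is done
}

int main() {
  std::atomic<uint8_t> slot{kEmpty};
  assert(try_claim(slot));   // first claim succeeds
  assert(!try_claim(slot));  // slot is now kBusy, second claim fails
  return 0;
}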
#include "paddle/fluid/framework/new_executor/workqueue/workqueue.h" + #include "paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h" #include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h" #include "paddle/fluid/platform/enforce.h" @@ -64,11 +65,8 @@ class WorkQueueImpl : public WorkQueue { platform::TracerEventType::UserDefined, 10 /*level*/); if (tracker_ != nullptr) { - fn = [ - task = std::move(fn), raii = CounterGuard(tracker_) - ]() mutable { - task(); - }; + fn = [task = std::move(fn), + raii = CounterGuard(tracker_)]() mutable { task(); }; } queue_->AddTask(std::move(fn)); } @@ -158,11 +156,8 @@ void WorkQueueGroupImpl::AddTask(size_t queue_idx, std::function fn) { 10 /*level*/); assert(queue_idx < queues_.size()); if (queues_options_.at(queue_idx).track_task) { - fn = [ - task = std::move(fn), raii = CounterGuard(tracker_) - ]() mutable { - task(); - }; + fn = [task = std::move(fn), + raii = CounterGuard(tracker_)]() mutable { task(); }; } queues_[queue_idx]->AddTask(std::move(fn)); } diff --git a/paddle/fluid/framework/new_executor/workqueue/workqueue.h b/paddle/fluid/framework/new_executor/workqueue/workqueue.h index 2c2576528fe..1a1900c5687 100644 --- a/paddle/fluid/framework/new_executor/workqueue/workqueue.h +++ b/paddle/fluid/framework/new_executor/workqueue/workqueue.h @@ -20,6 +20,7 @@ #include #include #include + #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -118,10 +119,10 @@ class WorkQueue { std::bind(std::forward(f), std::forward(args)...); std::promise prom; std::future res = prom.get_future(); - AddTask([ - t = std::move(task), - p = FakeCopyable>(std::move(prom)) - ]() mutable { p.Get().set_value(t()); }); + AddTask([t = std::move(task), p = FakeCopyable>( + std::move(prom))]() mutable { + p.Get().set_value(t()); + }); return res; } @@ -158,10 +159,9 @@ class WorkQueueGroup { std::bind(std::forward(f), std::forward(args)...); std::promise prom; std::future res = prom.get_future(); - AddTask(queue_idx, [ - t = std::move(task), - p = FakeCopyable>(std::move(prom)) - ]() mutable { p.Get().set_value(t()); }); + AddTask(queue_idx, [t = std::move(task), + p = FakeCopyable>(std::move( + prom))]() mutable { p.Get().set_value(t()); }); return res; } diff --git a/paddle/fluid/framework/new_executor/workqueue/workqueue_test.cc b/paddle/fluid/framework/new_executor/workqueue/workqueue_test.cc index 857eaead5b6..3e38d0dbbf9 100644 --- a/paddle/fluid/framework/new_executor/workqueue/workqueue_test.cc +++ b/paddle/fluid/framework/new_executor/workqueue/workqueue_test.cc @@ -13,8 +13,10 @@ // limitations under the License. 
#include "paddle/fluid/framework/new_executor/workqueue/workqueue.h" + #include #include + #include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h" @@ -37,10 +39,10 @@ TEST(WorkQueueUtils, TestEventsWaiter) { TEST(WorkQueue, TestSingleThreadedWorkQueue) { VLOG(1) << "In Test"; - using paddle::framework::WorkQueueOptions; - using paddle::framework::WorkQueue; using paddle::framework::CreateSingleThreadedWorkQueue; using paddle::framework::EventsWaiter; + using paddle::framework::WorkQueue; + using paddle::framework::WorkQueueOptions; std::atomic finished{false}; std::atomic counter{0}; constexpr unsigned kLoopNum = 1000000; @@ -83,10 +85,10 @@ TEST(WorkQueue, TestSingleThreadedWorkQueue) { TEST(WorkQueue, TestMultiThreadedWorkQueue) { VLOG(1) << "In Test"; - using paddle::framework::WorkQueueOptions; - using paddle::framework::WorkQueue; using paddle::framework::CreateMultiThreadedWorkQueue; using paddle::framework::EventsWaiter; + using paddle::framework::WorkQueue; + using paddle::framework::WorkQueueOptions; std::atomic finished{false}; std::atomic counter{0}; constexpr unsigned kExternalLoopNum = 100; @@ -136,10 +138,10 @@ TEST(WorkQueue, TestMultiThreadedWorkQueue) { } TEST(WorkQueue, TestWorkQueueGroup) { - using paddle::framework::WorkQueueOptions; - using paddle::framework::WorkQueueGroup; using paddle::framework::CreateWorkQueueGroup; using paddle::framework::EventsWaiter; + using paddle::framework::WorkQueueGroup; + using paddle::framework::WorkQueueOptions; std::atomic finished{false}; std::atomic counter{0}; constexpr unsigned kExternalLoopNum = 100; diff --git a/paddle/fluid/framework/new_executor/workqueue/workqueue_utils.cc b/paddle/fluid/framework/new_executor/workqueue/workqueue_utils.cc index 82dcbbd509d..152f89d9ef0 100644 --- a/paddle/fluid/framework/new_executor/workqueue/workqueue_utils.cc +++ b/paddle/fluid/framework/new_executor/workqueue/workqueue_utils.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h" + #include #include diff --git a/paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h b/paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h index b6e6ede8c33..380746c05d6 100644 --- a/paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h +++ b/paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h @@ -21,6 +21,7 @@ #include #include #include + #include "paddle/fluid/framework/new_executor/workqueue/events_waiter.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/no_need_buffer_vars_inference.cc b/paddle/fluid/framework/no_need_buffer_vars_inference.cc index 25f64838c6d..665c9b811fa 100644 --- a/paddle/fluid/framework/no_need_buffer_vars_inference.cc +++ b/paddle/fluid/framework/no_need_buffer_vars_inference.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" + #include + #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/imperative/saved_variable_wrapper_list.h" diff --git a/paddle/fluid/framework/no_need_buffer_vars_inference_test.cc b/paddle/fluid/framework/no_need_buffer_vars_inference_test.cc index a92d52fd2e9..a2c7df763a7 100644 --- a/paddle/fluid/framework/no_need_buffer_vars_inference_test.cc +++ b/paddle/fluid/framework/no_need_buffer_vars_inference_test.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/framework/no_need_buffer_vars_inference.h" + #include "gtest/gtest.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/imperative/layer.h" diff --git a/paddle/fluid/framework/op_def_api.cc b/paddle/fluid/framework/op_def_api.cc index 73f1409ae69..b62f17987e6 100644 --- a/paddle/fluid/framework/op_def_api.cc +++ b/paddle/fluid/framework/op_def_api.cc @@ -17,6 +17,7 @@ #define _LINUX #endif #include "paddle/fluid/framework/op_def_api.h" + #include #include #include @@ -28,6 +29,7 @@ #endif #include #include + #include "glog/logging.h" #include "paddle/fluid/framework/op_def.pb.h" diff --git a/paddle/fluid/framework/op_def_api.h b/paddle/fluid/framework/op_def_api.h index 1ef2254d0da..754b76663df 100644 --- a/paddle/fluid/framework/op_def_api.h +++ b/paddle/fluid/framework/op_def_api.h @@ -21,5 +21,5 @@ namespace framework { const proto::OpDef& GetOpDef(const std::string& op_name); bool HasOpDef(const std::string& op_name); -} -} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 87d3a048d0b..db2a411da00 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -495,8 +495,9 @@ bool OpDesc::HasProtoAttr(const std::string &name) const { proto::AttrType OpDesc::GetAttrType(const std::string &name) const { auto it = attrs_.find(name); - PADDLE_ENFORCE_NE(it, attrs_.end(), platform::errors::NotFound( - "Attribute %s is not found.", name)); + PADDLE_ENFORCE_NE( + it, attrs_.end(), + platform::errors::NotFound("Attribute %s is not found.", name)); return static_cast(it->second.which() - 1); } @@ -599,8 +600,9 @@ void OpDesc::SetAttrMap( Attribute OpDesc::GetAttr(const std::string &name) const { auto it = attrs_.find(name); - PADDLE_ENFORCE_NE(it, attrs_.end(), platform::errors::NotFound( - "Attribute %s is not found.", name)); + PADDLE_ENFORCE_NE( + it, attrs_.end(), + platform::errors::NotFound("Attribute %s is not found.", name)); return it->second; } @@ -854,10 +856,11 @@ bool CompileTimeInferShapeContext::HasInput(const std::string &name) const { if (length == 0) { return false; } - PADDLE_ENFORCE_EQ(length, 1UL, platform::errors::InvalidArgument( - "Input(%s) should have only one value, " - "but it has %d values now.", - name, length)); + PADDLE_ENFORCE_EQ( + length, 1UL, + platform::errors::InvalidArgument("Input(%s) should have only one value, " + "but it has %d values now.", + name, length)); return block_.HasVarRecursive(input_names[0]); } @@ -870,10 +873,11 @@ bool CompileTimeInferShapeContext::HasOutput(const std::string &name) const { if (length == 0) { return false; } - PADDLE_ENFORCE_EQ(length, 1UL, platform::errors::InvalidArgument( - "Output(%s) should have only one value, " - "but it has %d values now.", - name, length)); + PADDLE_ENFORCE_EQ(length, 1UL, + platform::errors::InvalidArgument( + "Output(%s) should have only one value, " + "but it has %d values now.", + name, length)); return block_.HasVarRecursive(output_names[0]); } diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index 903ee73b2c0..51aeed2e5d7 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once #include + #include "glog/logging.h" #include "paddle/fluid/framework/attribute.h" namespace paddle { diff --git a/paddle/fluid/framework/op_registry_test.cc b/paddle/fluid/framework/op_registry_test.cc index 889b6b0c86b..8b77b1d260c 100644 --- a/paddle/fluid/framework/op_registry_test.cc +++ b/paddle/fluid/framework/op_registry_test.cc @@ -12,11 +12,11 @@ See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/op_registry.h" + #include #include -#include "paddle/fluid/framework/op_registry.h" - namespace pd = paddle::framework; namespace paddle { @@ -58,8 +58,9 @@ class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { AddInput("input", "input of cosine op").AsDuplicable(); AddOutput("output", "output of cosine op").AsIntermediate(); auto my_checker = [](int i) { - PADDLE_ENFORCE_EQ(i % 2, 0, platform::errors::InvalidArgument( - "'test_attr' must be even!")); + PADDLE_ENFORCE_EQ( + i % 2, 0, + platform::errors::InvalidArgument("'test_attr' must be even!")); }; AddAttr("test_attr", "a simple test attribute") .AddCustomChecker(my_checker); diff --git a/paddle/fluid/framework/op_version_proto.h b/paddle/fluid/framework/op_version_proto.h index 9b70bb93bb9..022531d53de 100644 --- a/paddle/fluid/framework/op_version_proto.h +++ b/paddle/fluid/framework/op_version_proto.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include #include "paddle/fluid/framework/framework.pb.h" diff --git a/paddle/fluid/framework/op_version_registry_test.cc b/paddle/fluid/framework/op_version_registry_test.cc index e66d0dc5a1f..8f83631c272 100644 --- a/paddle/fluid/framework/op_version_registry_test.cc +++ b/paddle/fluid/framework/op_version_registry_test.cc @@ -12,10 +12,10 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include - #include "paddle/fluid/framework/op_version_registry.h" +#include + namespace paddle { namespace framework { namespace compatible { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 69f14d7903c..7395a8e0da8 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -12,6 +12,7 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include + #include #include @@ -1205,10 +1206,11 @@ bool OperatorWithKernel::SupportsMKLDNN( const proto::VarType::Type data_type) const { auto op_kernel_iter = OperatorWithKernel::AllOpKernels().find(type_); if (op_kernel_iter == OperatorWithKernel::AllOpKernels().end()) { - VLOG(6) << "Warning: " << type_ << " don't find its MKLDNN Kernel in Fluid " - "Registered Kernels. And We don't " - "search its kernels in phi lib, " - "SupportsMKLDNN() return false."; + VLOG(6) << "Warning: " << type_ + << " don't find its MKLDNN Kernel in Fluid " + "Registered Kernels. 
And We don't " + "search its kernels in phi lib, " + "SupportsMKLDNN() return false."; return false; } auto& op_kernels = op_kernel_iter->second; @@ -1440,7 +1442,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, #if defined(PADDLE_WITH_XPU_KP) && (!is_xpu_unsupport || use_phi_xpu_kp) #endif - ) { + ) { run_phi_kernel_ = true; } else { auto& all_op_kernels = AllOpKernels(); @@ -1464,7 +1466,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, #if defined(PADDLE_WITH_XPU_KP) || (is_xpu_unsupport && !is_xpu_kp_support) #endif - ) { + ) { auto pt_cpu_kernel_key = FallBackToCpu(*kernel_type_.get(), pt_kernel_key, *this); pt_kernel_.reset( @@ -2238,8 +2240,9 @@ phi::KernelSignature OperatorWithKernel::GetExpectedPhiKernelArgs( if (arg_map_fn) { arg_map_fn_.reset(new phi::ArgumentMappingFn(*arg_map_fn)); } else { - auto func = [this]( - const phi::ArgumentMappingContext& ctx) -> phi::KernelSignature { + auto func = + [this]( + const phi::ArgumentMappingContext& ctx) -> phi::KernelSignature { return phi::DefaultKernelSignatureMap::Instance().Get(type_); }; arg_map_fn_.reset(new phi::ArgumentMappingFn(func)); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 2efa2e4bd8a..dc13287b5aa 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -27,6 +27,7 @@ limitations under the License. */ #include "glog/logging.h" // For VLOG #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_kernel_type.h" @@ -38,12 +39,10 @@ limitations under the License. */ #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/variant.h" -#include "paddle/utils/flat_hash_map.h" - -#include "paddle/fluid/framework/convert_utils.h" #include "paddle/phi/core/compat/arg_map_context.h" #include "paddle/phi/core/compat/op_utils.h" #include "paddle/phi/core/kernel_factory.h" +#include "paddle/utils/flat_hash_map.h" namespace paddle { namespace framework { @@ -610,12 +609,12 @@ class OperatorWithKernel : public OperatorBase { /* member functions for adapting to phi lib */ /** In the Tensor calculation library, the new Kernel adopts a clearer and - * more streamlined design. The arguments of the Kernel and the input and - * output arguments registered in the original OpMaker do not match in some - * cases, so we use map to record the arguments required by the kernel. - * When selecting Kernel during Op execution, select the arguments of the - * original Op according to the GetExpectedPhiKernelArgs returned arguments. - */ + * more streamlined design. The arguments of the Kernel and the input and + * output arguments registered in the original OpMaker do not match in some + * cases, so we use map to record the arguments required by the kernel. + * When selecting Kernel during Op execution, select the arguments of the + * original Op according to the GetExpectedPhiKernelArgs returned arguments. 
+ */ phi::KernelSignature GetExpectedPhiKernelArgs( const ExecutionContext& ctx) const; diff --git a/paddle/fluid/framework/operator_exception_test.cc b/paddle/fluid/framework/operator_exception_test.cc index 7b513996fb4..0f635d170de 100644 --- a/paddle/fluid/framework/operator_exception_test.cc +++ b/paddle/fluid/framework/operator_exception_test.cc @@ -12,12 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/operator.h" #include #include #include #include + #include "gtest/gtest.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/framework/operator_kernel_configs.h b/paddle/fluid/framework/operator_kernel_configs.h index ab812a30981..57d377f1389 100644 --- a/paddle/fluid/framework/operator_kernel_configs.h +++ b/paddle/fluid/framework/operator_kernel_configs.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include + #include "glog/logging.h" namespace paddle { diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc index 24e09bcd463..3dda60de12a 100644 --- a/paddle/fluid/framework/operator_test.cc +++ b/paddle/fluid/framework/operator_test.cc @@ -11,11 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "gtest/gtest.h" +#include "paddle/fluid/framework/operator.h" +#include "gtest/gtest.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/errors.h" #include "paddle/fluid/platform/init.h" diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc index 295510cdb1c..a2bdd2bc4c1 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc @@ -334,7 +334,7 @@ std::unique_ptr CreateNewSubGraph(const GraphNodeSet& cluster, } GraphNodeSet need_feed_vars; - std::unordered_set param_vars, output_vars; + std::unordered_set param_vars, output_vars; // the subgraph is independently, so here we only need link // to the node in new subgraph, and discard the link to // out-graph. @@ -386,18 +386,18 @@ std::unique_ptr CreateNewSubGraph(const GraphNodeSet& cluster, subgraph.get()); // Save lists of input variables, internal variables and output variables // of the cluster as attributes of the subgraph for convenience. 
- auto collect_names_fn = []( - const GraphNodeSet& nodes, - const std::unordered_set& ignore_names) { - auto result = std::make_unique>(); - for (auto* node : nodes) { - if (!node->Var() || ignore_names.count(node->Name())) { - continue; - } - result->emplace_back(node->Name()); - } - return result; - }; + auto collect_names_fn = + [](const GraphNodeSet& nodes, + const std::unordered_set& ignore_names) { + auto result = std::make_unique>(); + for (auto* node : nodes) { + if (!node->Var() || ignore_names.count(node->Name())) { + continue; + } + result->emplace_back(node->Name()); + } + return result; + }; subgraph->Set>( kInternalVars, collect_names_fn(cluster_internals, {}).release()); subgraph->Set>( diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc index d593aadc02c..e9c517af2c3 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc @@ -19,7 +19,6 @@ limitations under the License. */ #include #include "gtest/gtest.h" - #include "paddle/fluid/framework/details/build_strategy.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/node.h" diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc index 9b5ce876c25..585f9edce86 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc @@ -100,7 +100,7 @@ size_t CinnCacheKeyByStructure::HashGraph(const ir::Graph& graph) { // graph.Nodes() return unordered_set, here using set to avoid the same graph // may return different result - std::set node_set(compare), + std::set node_set(compare), output_set(compare); node_set.insert(graph.Nodes().begin(), graph.Nodes().end()); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc index 1ebeecbff95..24e65599018 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+// clang-format off #include #include @@ -21,6 +22,7 @@ #include "paddle/fluid/framework/paddle2cinn/cinn_cache_key.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/phi/core/ddim.h" +// clang-format on namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 12f60354206..2a6a51d73f2 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -51,14 +51,14 @@ namespace paddle { namespace framework { namespace paddle2cinn { -using ir::Graph; -using ir::Node; -using inference::analysis::Dot; using ::cinn::auto_schedule::AutoTuner; using ::cinn::common::Target; using ::cinn::frontend::Optimize; using ::cinn::hlir::framework::BuildScope; using ::cinn::hlir::framework::GraphCompiler; +using inference::analysis::Dot; +using ir::Graph; +using ir::Node; CinnCompiler* CinnCompiler::GetInstance() { static CinnCompiler* instance = new CinnCompiler(); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h index a38e8b4c5f6..91c55976764 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h @@ -20,6 +20,7 @@ #include #include #include + #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/paddle2cinn/cinn_cache_key.h" diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc index 255e318c9fa..5a84a97ee8d 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc @@ -44,8 +44,8 @@ DECLARE_string(deny_cinn_ops); namespace paddle { namespace framework { namespace paddle2cinn { -using ir::Graph; using ::cinn::common::Target; +using ir::Graph; namespace { template > diff --git a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc index 31bf8d9b726..4e362057c91 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +// clang-format off #include "paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h" #include @@ -30,6 +31,7 @@ limitations under the License. */ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" +// clang-format on namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h index 526eb65a56e..4155147da4b 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h +++ b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +// clang-format off #include #include #include @@ -26,6 +27,7 @@ limitations under the License. 
*/ #include "cinn/frontend/net_builder.h" #include "cinn/frontend/op_mapper_registry.h" +// clang-format on namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc index c0e1ca8f0d1..8a6f92a6f45 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc @@ -12,18 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +// clang-format off #include "gtest/gtest.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h" +// clang-format on namespace paddle { namespace framework { namespace paddle2cinn { +using ::cinn::frontend::NetBuilder; using ir::Graph; using ir::Node; -using ::cinn::frontend::NetBuilder; using CinnTensor = ::cinn::hlir::framework::Tensor; using OpMapperContext = CinnGraphSymbolization::OpMapperContext; using CinnOpDesc = CinnGraphSymbolization::CinnOpDesc; diff --git a/paddle/fluid/framework/paddle2cinn/transform_desc.h b/paddle/fluid/framework/paddle2cinn/transform_desc.h index 76a4f812730..6f0931b6d03 100644 --- a/paddle/fluid/framework/paddle2cinn/transform_desc.h +++ b/paddle/fluid/framework/paddle2cinn/transform_desc.h @@ -14,6 +14,8 @@ #pragma once +// The headers cant be sorted by clang-format or compilint error occurs. +// clang-format off #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/program_desc.h" @@ -24,6 +26,7 @@ #include "cinn/frontend/paddle/cpp/op_desc.h" #include "cinn/frontend/paddle/cpp/program_desc.h" #include "cinn/frontend/paddle/cpp/var_desc.h" +// clang-format on namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/paddle2cinn/transform_desc_test.cc b/paddle/fluid/framework/paddle2cinn/transform_desc_test.cc index ba324295cad..ae9f51c3f67 100644 --- a/paddle/fluid/framework/paddle2cinn/transform_desc_test.cc +++ b/paddle/fluid/framework/paddle2cinn/transform_desc_test.cc @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +// clang-format off #include #include "gtest/gtest.h" #include "paddle/fluid/framework/paddle2cinn/transform_desc.h" +// clang-format on namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/paddle2cinn/transform_type.cc b/paddle/fluid/framework/paddle2cinn/transform_type.cc index 0e348084d25..60502edd99a 100644 --- a/paddle/fluid/framework/paddle2cinn/transform_type.cc +++ b/paddle/fluid/framework/paddle2cinn/transform_type.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/framework/paddle2cinn/transform_type.h" + #include "cinn/common/type.h" #include "cinn/runtime/cinn_runtime.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/paddle2cinn/transform_type.h b/paddle/fluid/framework/paddle2cinn/transform_type.h index e44960abbd9..f0b08ba1e00 100644 --- a/paddle/fluid/framework/paddle2cinn/transform_type.h +++ b/paddle/fluid/framework/paddle2cinn/transform_type.h @@ -19,7 +19,7 @@ struct cinn_type_t; namespace cinn::common { struct Type; -} // ::cinn::common +} // namespace cinn::common namespace paddle::framework::paddle2cinn { diff --git a/paddle/fluid/framework/paddle2cinn/transform_type_test.cc b/paddle/fluid/framework/paddle2cinn/transform_type_test.cc index 6c5d360d34c..4456642b3e9 100644 --- a/paddle/fluid/framework/paddle2cinn/transform_type_test.cc +++ b/paddle/fluid/framework/paddle2cinn/transform_type_test.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/paddle2cinn/transform_type.h" + #include "cinn/common/type.h" #include "cinn/runtime/cinn_runtime.h" #include "gtest/gtest.h" diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index b088a535a12..00d48098a13 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -666,8 +666,9 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, ir::Graph *graph) : member_(new ParallelExecutorPrivate(places, scope)) { PADDLE_ENFORCE_EQ(places.size() > 0 && !platform::is_npu_place(places[0]), - true, platform::errors::Unavailable( - "NPU is not supported in ParallelExecutor.")); + true, + platform::errors::Unavailable( + "NPU is not supported in ParallelExecutor.")); InitP2P(places); ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_), member_->places_.size()); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 18d0ee78ffb..3dc9fbcfbf3 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -42,9 +42,9 @@ namespace framework { class ParallelExecutorPrivate; -using details::VariableInfo; using details::BuildStrategy; using details::ExecutionStrategy; +using details::VariableInfo; namespace p = paddle::platform; using DeviceType = paddle::platform::DeviceType; diff --git a/paddle/fluid/framework/phi_utils.cc b/paddle/fluid/framework/phi_utils.cc index 3eda00006f9..19f7b024b27 100644 --- a/paddle/fluid/framework/phi_utils.cc +++ b/paddle/fluid/framework/phi_utils.cc @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/phi_utils.h" + #include #include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/phi_utils.h" - #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/selected_rows_utils.h" diff --git a/paddle/fluid/framework/phi_utils.h b/paddle/fluid/framework/phi_utils.h index 785ede5c601..535672f2e12 100644 --- a/paddle/fluid/framework/phi_utils.h +++ b/paddle/fluid/framework/phi_utils.h @@ -21,11 +21,10 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/op_kernel_type.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/place.h" - -#include "paddle/fluid/framework/operator.h" #include "paddle/phi/api/lib/utils/tensor_utils.h" #include "paddle/phi/common/backend.h" #include "paddle/phi/core/compat/arg_map_context.h" diff --git a/paddle/fluid/framework/phi_utils_test.cc b/paddle/fluid/framework/phi_utils_test.cc index cbcdf24c9f3..02eb23f8ac1 100644 --- a/paddle/fluid/framework/phi_utils_test.cc +++ b/paddle/fluid/framework/phi_utils_test.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/phi_utils.h" + #include "gtest/gtest.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/selected_rows_utils.h" diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc index 4a31adcca65..88738255af7 100644 --- a/paddle/fluid/framework/program_desc.cc +++ b/paddle/fluid/framework/program_desc.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/program_desc.h" + #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/version.h" diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h index 4ceb0c5c824..7e1c12f4ac5 100644 --- a/paddle/fluid/framework/program_desc.h +++ b/paddle/fluid/framework/program_desc.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include #include #include diff --git a/paddle/fluid/framework/program_processing.cc b/paddle/fluid/framework/program_processing.cc index 3bcf6f8f385..95b28b79dcf 100644 --- a/paddle/fluid/framework/program_processing.cc +++ b/paddle/fluid/framework/program_processing.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/program_processing.h" + #include "paddle/fluid/framework/block_desc.h" namespace paddle { diff --git a/paddle/fluid/framework/prune.cc b/paddle/fluid/framework/prune.cc index 4c95f01ae56..fbeedcc311a 100644 --- a/paddle/fluid/framework/prune.cc +++ b/paddle/fluid/framework/prune.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_proto_maker.h" namespace paddle { diff --git a/paddle/fluid/framework/prune_test.cc b/paddle/fluid/framework/prune_test.cc index 64b30878150..5fbfda716b4 100644 --- a/paddle/fluid/framework/prune_test.cc +++ b/paddle/fluid/framework/prune_test.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/prune.h" #include + #include #include "paddle/fluid/framework/block_desc.h" diff --git a/paddle/fluid/framework/ps_gpu_trainer.cc b/paddle/fluid/framework/ps_gpu_trainer.cc index aec40a5a7eb..c86bfbc43bf 100644 --- a/paddle/fluid/framework/ps_gpu_trainer.cc +++ b/paddle/fluid/framework/ps_gpu_trainer.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include + #include #include #include diff --git a/paddle/fluid/framework/pull_dense_worker.cc b/paddle/fluid/framework/pull_dense_worker.cc index a12079a135d..7a0fe65182d 100644 --- a/paddle/fluid/framework/pull_dense_worker.cc +++ b/paddle/fluid/framework/pull_dense_worker.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/device_worker.h" namespace phi { diff --git a/paddle/fluid/framework/reader.cc b/paddle/fluid/framework/reader.cc index b418339bf32..27940f726dc 100644 --- a/paddle/fluid/framework/reader.cc +++ b/paddle/fluid/framework/reader.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/reader.h" + #include namespace paddle { diff --git a/paddle/fluid/framework/save_load_util.cc b/paddle/fluid/framework/save_load_util.cc index 44488fca01c..284965fdfe9 100644 --- a/paddle/fluid/framework/save_load_util.cc +++ b/paddle/fluid/framework/save_load_util.cc @@ -342,8 +342,9 @@ bool LoadTensorFromDisk( uint32_t version; fin.read(reinterpret_cast(&version), sizeof(version)); CheckInStreamState(fin, sizeof(version)); - PADDLE_ENFORCE_EQ(version, 0U, platform::errors::InvalidArgument( - "Only version 0 tensor is supported.")); + PADDLE_ENFORCE_EQ(version, 0U, + platform::errors::InvalidArgument( + "Only version 0 tensor is supported.")); proto::VarType::TensorDesc desc; { // int32_t size diff --git a/paddle/fluid/framework/save_load_util_test.cc b/paddle/fluid/framework/save_load_util_test.cc index 10a34d7ce91..623f0f27bda 100644 --- a/paddle/fluid/framework/save_load_util_test.cc +++ b/paddle/fluid/framework/save_load_util_test.cc @@ -11,11 +11,12 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/save_load_util.h" + #include #include #include "gtest/gtest.h" -#include "paddle/fluid/framework/save_load_util.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/scope_guard.h b/paddle/fluid/framework/scope_guard.h index 83387842e94..9c741f7bfc5 100644 --- a/paddle/fluid/framework/scope_guard.h +++ b/paddle/fluid/framework/scope_guard.h @@ -16,6 +16,7 @@ #include #include + #include "paddle/fluid/platform/macros.h" namespace paddle { @@ -41,12 +42,12 @@ class ScopeGuard { #define _PADDLE_CONCAT_TOKEN(x, y) x##y #define PADDLE_CONCAT_TOKEN(x, y) _PADDLE_CONCAT_TOKEN(x, y) -#define DEFINE_PADDLE_SCOPE_GUARD(...) \ - auto PADDLE_CONCAT_TOKEN(__scope_guard_func, __LINE__) = __VA_ARGS__; \ - ::paddle::framework::ScopeGuard::type> \ - PADDLE_CONCAT_TOKEN(__scope_guard, __LINE__)( \ - PADDLE_CONCAT_TOKEN(__scope_guard_func, __LINE__)) +#define DEFINE_PADDLE_SCOPE_GUARD(...) \ + auto PADDLE_CONCAT_TOKEN(__scope_guard_func, __LINE__) = __VA_ARGS__; \ + ::paddle::framework::ScopeGuard::type> \ + PADDLE_CONCAT_TOKEN(__scope_guard, __LINE__)( \ + PADDLE_CONCAT_TOKEN(__scope_guard_func, __LINE__)) } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/scope_guard_test.cc b/paddle/fluid/framework/scope_guard_test.cc index d7a7a6168a3..793b3a1652a 100644 --- a/paddle/fluid/framework/scope_guard_test.cc +++ b/paddle/fluid/framework/scope_guard_test.cc @@ -13,6 +13,7 @@ // limitations under the License. 
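The scope_guard.h hunk above illustrates another purely mechanical change: the continuation backslashes of multi-line macros are re-padded so they line up in a single column. A small self-contained sketch of the style (the macro itself is illustrative):

  #include <iostream>

  #define WARN_IF_NULL(ptr)                \
    do {                                   \
      if ((ptr) == nullptr) {              \
        std::cerr << #ptr << " is null\n"; \
      }                                    \
    } while (0)
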
#include "paddle/fluid/framework/scope_guard.h" + #include "gtest/gtest.h" namespace paddle { diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc index 1f821720d64..7bb8550926d 100644 --- a/paddle/fluid/framework/section_worker.cc +++ b/paddle/fluid/framework/section_worker.cc @@ -12,6 +12,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_ASCEND_CL) #include + #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/framework/selected_rows_utils.h b/paddle/fluid/framework/selected_rows_utils.h index 8606295c451..9ecff5719fb 100644 --- a/paddle/fluid/framework/selected_rows_utils.h +++ b/paddle/fluid/framework/selected_rows_utils.h @@ -21,10 +21,9 @@ limitations under the License. */ #include #include -#include "paddle/phi/core/selected_rows.h" - #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/core/selected_rows.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/selected_rows_utils_test.cc b/paddle/fluid/framework/selected_rows_utils_test.cc index f23510c721e..db2c6c1f991 100644 --- a/paddle/fluid/framework/selected_rows_utils_test.cc +++ b/paddle/fluid/framework/selected_rows_utils_test.cc @@ -9,11 +9,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/selected_rows_utils.h" + #include + #include // NOLINT #include "gtest/gtest.h" -#include "paddle/fluid/framework/selected_rows_utils.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/string_array.cc b/paddle/fluid/framework/string_array.cc old mode 100755 new mode 100644 index 3071e6bf4cf..f6aee9b82f2 --- a/paddle/fluid/framework/string_array.cc +++ b/paddle/fluid/framework/string_array.cc @@ -12,12 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/string_array.h" + #include #include #include "glog/logging.h" -#include "paddle/fluid/framework/string_array.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 57eddf782f0..7ad9839d79d 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -23,15 +23,14 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/ddim.h" -#include "paddle/phi/core/stream.h" - -#include "paddle/fluid/framework/mixed_vector.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/stream.h" namespace paddle { diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h index f5e230773fb..946b119ecb3 100644 --- a/paddle/fluid/framework/tensor_impl.h +++ b/paddle/fluid/framework/tensor_impl.h @@ -24,12 +24,13 @@ namespace framework { inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) { int rank = src.dims().size(); PADDLE_ENFORCE_GE( - rank, 2, platform::errors::InvalidArgument( - "'ReshapeToMatrix()' is only used for flatten high rank " - "tensors to matrixs. The dimensions of Tensor must be " - "greater or equal than 2. " - "But received dimensions of Tensor is %d", - rank)); + rank, 2, + platform::errors::InvalidArgument( + "'ReshapeToMatrix()' is only used for flatten high rank " + "tensors to matrixs. The dimensions of Tensor must be " + "greater or equal than 2. " + "But received dimensions of Tensor is %d", + rank)); if (rank == 2) { return src; } diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index 3e104807535..05dd41eb6ff 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/tensor.h" #include + #include namespace framework = paddle::framework; diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 1159280762f..1e25acb2c4e 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/tensor_util.h" + #include #include #include @@ -21,10 +23,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/profiler/event_tracing.h" - #include "paddle/phi/core/dense_tensor.h" #ifdef PADDLE_WITH_MKLDNN @@ -1249,10 +1249,12 @@ void TensorFromStream(std::istream& is, Tensor* tensor, // proto buffer int32_t size = -1; is.read(reinterpret_cast(&size), sizeof(size)); - PADDLE_ENFORCE_EQ(is.good(), true, platform::errors::Unavailable( - "Cannot read tensor desc size")); - PADDLE_ENFORCE_GE(size, 0, platform::errors::InvalidArgument( - "Tensor desc size should >= 0")); + PADDLE_ENFORCE_EQ( + is.good(), true, + platform::errors::Unavailable("Cannot read tensor desc size")); + PADDLE_ENFORCE_GE( + size, 0, + platform::errors::InvalidArgument("Tensor desc size should >= 0")); std::unique_ptr buf(new char[size]); is.read(reinterpret_cast(buf.get()), size); PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index 5e6e1227b1a..2511fdf27ce 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/tensor_util.h" #include + #include namespace paddle { @@ -254,64 +255,61 @@ TEST(TensorToVector, Tensor) { #endif } -TEST(TensorToVector, Tensor_bool) { - { - paddle::framework::Tensor src; - bool* src_ptr = - src.mutable_data({3, 3}, paddle::platform::CPUPlace()); - for (int i = 0; i < 3 * 3; ++i) { - src_ptr[i] = static_cast(i % 2); - } +TEST(TensorToVector, Tensor_bool){{paddle::framework::Tensor src; +bool* src_ptr = src.mutable_data({3, 3}, paddle::platform::CPUPlace()); +for (int i = 0; i < 3 * 3; ++i) { + src_ptr[i] = static_cast(i % 2); +} - paddle::platform::CPUPlace place; - std::vector dst; - paddle::framework::TensorToVector(src, &dst); +paddle::platform::CPUPlace place; +std::vector dst; +paddle::framework::TensorToVector(src, &dst); - for (int i = 0; i < 3 * 3; ++i) { - EXPECT_EQ(src_ptr[i], dst[i]); - } - } +for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_ptr[i], dst[i]); +} +} // namespace framework #ifdef PADDLE_WITH_CUDA - { - std::vector src_vec = { - false, true, false, true, false, true, false, true, false, - }; - paddle::framework::Tensor gpu_tensor; - paddle::platform::CUDAPlace place; - paddle::platform::CUDADeviceContext gpu_ctx(place); - gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(place, gpu_ctx.stream()) - .get()); - gpu_ctx.PartialInitWithAllocator(); - paddle::framework::TensorFromVector(src_vec, gpu_ctx, &gpu_tensor); - - std::vector dst; - paddle::framework::TensorToVector(gpu_tensor, gpu_ctx, &dst); - - for (int i = 0; i < 3 * 3; ++i) { - EXPECT_EQ(src_vec[i], dst[i]); - } +{ + std::vector src_vec = { + false, true, false, true, false, true, false, true, false, + }; + paddle::framework::Tensor gpu_tensor; + paddle::platform::CUDAPlace place; + paddle::platform::CUDADeviceContext gpu_ctx(place); + gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, gpu_ctx.stream()) + .get()); + gpu_ctx.PartialInitWithAllocator(); + paddle::framework::TensorFromVector(src_vec, gpu_ctx, &gpu_tensor); + + std::vector dst; + paddle::framework::TensorToVector(gpu_tensor, gpu_ctx, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_vec[i], dst[i]); } +} #endif #ifdef PADDLE_WITH_ASCEND_CL - { - std::vector src_vec = { 
- false, true, false, true, false, true, false, true, false, - }; - paddle::framework::Tensor npu_tensor; - paddle::platform::NPUPlace place(0); - paddle::platform::NPUDeviceContext npu_ctx(place); - paddle::framework::TensorFromVector(src_vec, npu_ctx, &npu_tensor); - - std::vector dst; - paddle::framework::TensorToVector(npu_tensor, npu_ctx, &dst); - - for (int i = 0; i < 3 * 3; ++i) { - EXPECT_EQ(src_vec[i], dst[i]); - } +{ + std::vector src_vec = { + false, true, false, true, false, true, false, true, false, + }; + paddle::framework::Tensor npu_tensor; + paddle::platform::NPUPlace place(0); + paddle::platform::NPUDeviceContext npu_ctx(place); + paddle::framework::TensorFromVector(src_vec, npu_ctx, &npu_tensor); + + std::vector dst; + paddle::framework::TensorToVector(npu_tensor, npu_ctx, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_vec[i], dst[i]); } -#endif } +#endif +} // namespace paddle TEST(TensorFromDLPack, Tensor) { { diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc index 33533b1d10f..b704ac4329d 100644 --- a/paddle/fluid/framework/threadpool.cc +++ b/paddle/fluid/framework/threadpool.cc @@ -43,8 +43,9 @@ void ThreadPool::Init() { num_threads = FLAGS_dist_threadpool_size; VLOG(1) << "set dist_threadpool_size to " << num_threads; } - PADDLE_ENFORCE_GT(num_threads, 0, platform::errors::InvalidArgument( - "The number of threads is 0.")); + PADDLE_ENFORCE_GT( + num_threads, 0, + platform::errors::InvalidArgument("The number of threads is 0.")); threadpool_.reset(new ThreadPool(num_threads)); } } diff --git a/paddle/fluid/framework/threadpool_test.cc b/paddle/fluid/framework/threadpool_test.cc index 1278a0f0643..0b6e12967fe 100644 --- a/paddle/fluid/framework/threadpool_test.cc +++ b/paddle/fluid/framework/threadpool_test.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/threadpool.h" + #include + #include namespace framework = paddle::framework; diff --git a/paddle/fluid/framework/trainer.cc b/paddle/fluid/framework/trainer.cc index b033f9a99d6..dc48a8f8d8f 100644 --- a/paddle/fluid/framework/trainer.cc +++ b/paddle/fluid/framework/trainer.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/trainer.h" + #include "io/fs.h" namespace paddle { diff --git a/paddle/fluid/framework/trainer_factory.cc b/paddle/fluid/framework/trainer_factory.cc index 1f1122d32f5..48ea9143d62 100644 --- a/paddle/fluid/framework/trainer_factory.cc +++ b/paddle/fluid/framework/trainer_factory.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/trainer_factory.h" #include + #include #include diff --git a/paddle/fluid/framework/trainer_test.cc b/paddle/fluid/framework/trainer_test.cc index f689679d486..1f4a162f906 100644 --- a/paddle/fluid/framework/trainer_test.cc +++ b/paddle/fluid/framework/trainer_test.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/trainer.h" + #include namespace paddle { @@ -23,5 +24,5 @@ TEST() { // create dataset // train for a while } -} -} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index 0937d96ad4c..5feedb2c3d6 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -21,6 +21,7 @@ limitations under the License. 
*/ #include #include #include + #include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/platform/variant.h" #include "paddle/utils/small_vector.h" diff --git a/paddle/fluid/framework/unused_var_check.cc b/paddle/fluid/framework/unused_var_check.cc index 2f03dc41ce0..43c44ff525f 100644 --- a/paddle/fluid/framework/unused_var_check.cc +++ b/paddle/fluid/framework/unused_var_check.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/unused_var_check.h" #include + #include #include "gflags/gflags.h" diff --git a/paddle/fluid/framework/unused_var_check.h b/paddle/fluid/framework/unused_var_check.h index 95f6917fbcd..cc4977e439c 100644 --- a/paddle/fluid/framework/unused_var_check.h +++ b/paddle/fluid/framework/unused_var_check.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include #include diff --git a/paddle/fluid/framework/var_desc.cc b/paddle/fluid/framework/var_desc.cc index 0a24efd003b..3a3edc9b4c6 100644 --- a/paddle/fluid/framework/var_desc.cc +++ b/paddle/fluid/framework/var_desc.cc @@ -318,18 +318,20 @@ void VarDesc::SetAttr(const std::string &name, const Attribute &v) { bool valid = attr_type == proto::AttrType::INT || attr_type == proto::AttrType::STRING || attr_type == proto::AttrType::INTS; - PADDLE_ENFORCE_EQ(valid, true, platform::errors::InvalidArgument( - "The value for attr (%s) must be " - "one of list or int or string.", - name)); + PADDLE_ENFORCE_EQ( + valid, true, + platform::errors::InvalidArgument("The value for attr (%s) must be " + "one of list or int or string.", + name)); this->attrs_[name] = v; } Attribute VarDesc::GetAttr(const std::string &name) const { auto it = attrs_.find(name); - PADDLE_ENFORCE_NE(it, attrs_.end(), platform::errors::NotFound( - "Attribute %s is not found.", name)); + PADDLE_ENFORCE_NE( + it, attrs_.end(), + platform::errors::NotFound("Attribute %s is not found.", name)); return it->second; } diff --git a/paddle/fluid/framework/var_type_inference_test.cc b/paddle/fluid/framework/var_type_inference_test.cc index 5483ef01c08..ce489a57a01 100644 --- a/paddle/fluid/framework/var_type_inference_test.cc +++ b/paddle/fluid/framework/var_type_inference_test.cc @@ -245,9 +245,12 @@ TEST(InferVarType, multiple_api) { ASSERT_ANY_THROW(infer.SetDataTypes(&ctx, "test2_a_out", {})); ASSERT_EQ(0u, infer.GetShape(&ctx, "test2_a_out").size()); - infer.SetShape(&ctx, "test2_a_out", { - 1, 3, 3, - }); + infer.SetShape(&ctx, "test2_a_out", + { + 1, + 3, + 3, + }); ASSERT_EQ(3u, infer.GetShape(&ctx, "test2_a_out").size()); ASSERT_EQ(0, infer.GetLoDLevel(&ctx, "test2_a_out")); diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc index ec664b4513f..345928666bd 100644 --- a/paddle/fluid/framework/var_type_traits.cc +++ b/paddle/fluid/framework/var_type_traits.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/framework/var_type_traits.h" + #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/scope.h" @@ -25,6 +26,7 @@ #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif #include + #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/cudnn_rnn_cache.h" #endif diff --git a/paddle/fluid/framework/var_type_traits_test.cc b/paddle/fluid/framework/var_type_traits_test.cc index 00ae5154f83..4a81f66948d 100644 --- a/paddle/fluid/framework/var_type_traits_test.cc +++ b/paddle/fluid/framework/var_type_traits_test.cc @@ -12,13 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/var_type_traits.h" + #include #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows_utils.h" -#include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" #ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_NCCL) diff --git a/paddle/fluid/framework/version.cc b/paddle/fluid/framework/version.cc index 92042e47259..c01bef79cdc 100644 --- a/paddle/fluid/framework/version.cc +++ b/paddle/fluid/framework/version.cc @@ -24,7 +24,7 @@ bool IsProgramVersionSupported(int64_t version) { * new version. The compatibility judgment cannot be made only * by the version number. Please do not use this interface, * it may be discarded because backward compatibility. - */ + */ return true; } @@ -33,7 +33,7 @@ bool IsTensorVersionSupported(uint32_t version) { * new version. The compatibility judgment cannot be made only * by the version number. Please do not use this interface, * it may be discarded because backward compatibility. - */ + */ return true; } diff --git a/paddle/fluid/framework/version_test.cc b/paddle/fluid/framework/version_test.cc index ec5a340ee6e..7c52209981f 100644 --- a/paddle/fluid/framework/version_test.cc +++ b/paddle/fluid/framework/version_test.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/version.h" + #include "gtest/gtest.h" namespace paddle { diff --git a/paddle/fluid/imperative/all_reduce.cc b/paddle/fluid/imperative/all_reduce.cc index 436e22f00c3..f6484d5cdda 100644 --- a/paddle/fluid/imperative/all_reduce.cc +++ b/paddle/fluid/imperative/all_reduce.cc @@ -15,6 +15,7 @@ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/imperative/all_reduce.h" + #include "paddle/fluid/framework/convert_utils.h" #ifdef PADDLE_WITH_NCCL diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index 3f6863d642c..ff6e297ba80 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -13,8 +13,10 @@ // limitations under the License. 
#include "paddle/fluid/imperative/amp_auto_cast.h" + #include #include + #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/imperative/type_defs.h" @@ -302,9 +304,8 @@ static inline framework::proto::VarType::Type GetPromoteType( // dtype of input(X) if (op_type == "moving_average_abs_max_scale") { for (const auto& pair : ins) { - if (pair.first == "X" && - GetDataType(pair.second.front()) == - framework::proto::VarType::FP16) { + if (pair.first == "X" && GetDataType(pair.second.front()) == + framework::proto::VarType::FP16) { dst_type = framework::proto::VarType::FP16; } } diff --git a/paddle/fluid/imperative/basic_engine.h b/paddle/fluid/imperative/basic_engine.h index 49761a8df0b..fcc30b2590a 100644 --- a/paddle/fluid/imperative/basic_engine.h +++ b/paddle/fluid/imperative/basic_engine.h @@ -19,6 +19,7 @@ #include #include #include + #include "paddle/fluid/imperative/engine.h" #include "paddle/fluid/imperative/gradient_accumulator.h" diff --git a/paddle/fluid/imperative/bkcl_context.cc b/paddle/fluid/imperative/bkcl_context.cc index 11abbfe7cf6..9990fde95ce 100644 --- a/paddle/fluid/imperative/bkcl_context.cc +++ b/paddle/fluid/imperative/bkcl_context.cc @@ -14,13 +14,14 @@ #if defined(PADDLE_WITH_XPU_BKCL) +#include "paddle/fluid/imperative/bkcl_context.h" + #include #include #include #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/imperative/bkcl_context.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" #include "paddle/fluid/platform/device_context.h" @@ -46,10 +47,11 @@ static void AllReduce(const framework::Tensor &src, framework::Tensor *dst, auto bkcl_dtype = platform::ToBKCLDataType(framework::TransToProtoVarType(src.dtype())); - PADDLE_ENFORCE_EQ(bkcl_all_reduce(comm->comm(), src_ptr, dst_ptr, src.numel(), - bkcl_dtype, BKCL_ADD, stream), - BKCL_SUCCESS, platform::errors::PreconditionNotMet( - "BKCL all reduce failed")); + PADDLE_ENFORCE_EQ( + bkcl_all_reduce(comm->comm(), src_ptr, dst_ptr, src.numel(), bkcl_dtype, + BKCL_ADD, stream), + BKCL_SUCCESS, + platform::errors::PreconditionNotMet("BKCL all reduce failed")); } /* Baidu Kunlun Communication Library(BKCL) is designed for multi Baidu Kunlun diff --git a/paddle/fluid/imperative/cncl_context.cc b/paddle/fluid/imperative/cncl_context.cc index 779b748c2d2..19f22e74029 100644 --- a/paddle/fluid/imperative/cncl_context.cc +++ b/paddle/fluid/imperative/cncl_context.cc @@ -18,14 +18,12 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/variable.h" - -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/gen_comm_id_helper.h" -#include "paddle/fluid/platform/place.h" - #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/mlu/cncl_helper.h" #include "paddle/fluid/platform/device/mlu/mlu_info.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/gen_comm_id_helper.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace framework { @@ -184,8 +182,9 @@ paddle::platform::DeviceContext *CNCLParallelContext::GetDeviceContext( } void CNCLParallelContext::WaitCompute(int ring_id) { - PADDLE_ENFORCE_GE(ring_id, 0, platform::errors::OutOfRange( - "ring id must >= 0, but got %d", ring_id)); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::OutOfRange("ring id must >= 0, but got %d", ring_id)); PADDLE_ENFORCE_LT(ring_id, compute_events_.size(), platform::errors::OutOfRange( "ring id must < compute events size," @@ -205,8 +204,9 @@ void CNCLParallelContext::WaitCompute(int ring_id) { } void CNCLParallelContext::WaitComm(int ring_id) { - PADDLE_ENFORCE_GE(ring_id, 0, platform::errors::OutOfRange( - "ring id must >= 0, but got %d", ring_id)); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::OutOfRange("ring id must >= 0, but got %d", ring_id)); PADDLE_ENFORCE_LT(ring_id, comm_events_.size(), platform::errors::OutOfRange( "ring id must < comm events size," diff --git a/paddle/fluid/imperative/data_loader.cc b/paddle/fluid/imperative/data_loader.cc index c43149c9b56..66eed298106 100644 --- a/paddle/fluid/imperative/data_loader.cc +++ b/paddle/fluid/imperative/data_loader.cc @@ -19,6 +19,7 @@ #include #include #include + #include #include "glog/logging.h" diff --git a/paddle/fluid/imperative/data_loader.h b/paddle/fluid/imperative/data_loader.h index fdfa117eafe..e66a3b9edc3 100644 --- a/paddle/fluid/imperative/data_loader.h +++ b/paddle/fluid/imperative/data_loader.h @@ -17,6 +17,7 @@ #ifndef _WIN32 #include + #include #include diff --git a/paddle/fluid/imperative/execution_context.h b/paddle/fluid/imperative/execution_context.h index 124c31df733..fe426a76b32 100644 --- a/paddle/fluid/imperative/execution_context.h +++ b/paddle/fluid/imperative/execution_context.h @@ -16,6 +16,7 @@ #include #include + #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/framework/variable.h" diff --git a/paddle/fluid/imperative/flags.cc b/paddle/fluid/imperative/flags.cc index c2d668eccda..df424b32fca 100644 --- a/paddle/fluid/imperative/flags.cc +++ b/paddle/fluid/imperative/flags.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/imperative/flags.h" + #include "paddle/fluid/platform/flags.h" PADDLE_DEFINE_EXPORTED_uint64(dygraph_debug, 0, diff --git a/paddle/fluid/imperative/gloo_context.cc b/paddle/fluid/imperative/gloo_context.cc index dd34b8b619f..c5bcab4daa9 100644 --- a/paddle/fluid/imperative/gloo_context.cc +++ b/paddle/fluid/imperative/gloo_context.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/imperative/gloo_context.h" + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #include "paddle/fluid/framework/tensor_util.h" diff --git a/paddle/fluid/imperative/gloo_context.h b/paddle/fluid/imperative/gloo_context.h index 23e4e02945b..5e0973e7e99 100644 --- a/paddle/fluid/imperative/gloo_context.h +++ b/paddle/fluid/imperative/gloo_context.h @@ -16,6 +16,7 @@ #include #include #include + #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/variable.h" diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 499cf4d8ad6..36e6f551dc6 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -874,8 +874,9 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var, } PADDLE_ENFORCE_EQ(var_info.var->Var().IsType(), - true, platform::errors::PermissionDenied( - "Gradient var must be LoDTensor")); + true, + platform::errors::PermissionDenied( + "Gradient var must be LoDTensor")); if (CurCnt() == 0) { MoveOrCopyVar(dst_var->MutableVar(), var_info.var->MutableVar(), var_info.unchange_input); @@ -896,9 +897,10 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var, PADDLE_ENFORCE_EQ( var_info.var->Var().IsType() || var_info.var->Var().IsType(), - true, platform::errors::PermissionDenied("The type of Gradient " - "var must be LoDTensor " - "or SelectedRows")); + true, + platform::errors::PermissionDenied("The type of Gradient " + "var must be LoDTensor " + "or SelectedRows")); if (CurCnt() == 0) { MoveOrCopyVar(dst_var->MutableVar(), var_info.var->MutableVar(), var_info.unchange_input); diff --git a/paddle/fluid/imperative/gradient_accumulator.h b/paddle/fluid/imperative/gradient_accumulator.h index 03f6775defc..382623b6276 100644 --- a/paddle/fluid/imperative/gradient_accumulator.h +++ b/paddle/fluid/imperative/gradient_accumulator.h @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/imperative/hooks.h" #include "paddle/fluid/imperative/layer.h" diff --git a/paddle/fluid/imperative/hccl_context.cc b/paddle/fluid/imperative/hccl_context.cc index 31d988753f2..8fb434cbc2a 100644 --- a/paddle/fluid/imperative/hccl_context.cc +++ b/paddle/fluid/imperative/hccl_context.cc @@ -13,18 +13,16 @@ // limitations under the License. 
#include "paddle/fluid/imperative/hccl_context.h" -#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/variable.h" - +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/npu/hccl_helper.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/gen_comm_id_helper.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/device/npu/hccl_helper.h" - namespace paddle { namespace framework { class Variable; @@ -193,8 +191,9 @@ paddle::platform::DeviceContext *HCCLParallelContext::GetDeviceContext( } void HCCLParallelContext::WaitCompute(int ring_id) { - PADDLE_ENFORCE_GE(ring_id, 0, platform::errors::OutOfRange( - "ring id must >= 0, but got %d", ring_id)); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::OutOfRange("ring id must >= 0, but got %d", ring_id)); PADDLE_ENFORCE_LT(ring_id, compute_events_.size(), platform::errors::OutOfRange( "ring id must < compute events size," @@ -214,8 +213,9 @@ void HCCLParallelContext::WaitCompute(int ring_id) { } void HCCLParallelContext::WaitComm(int ring_id) { - PADDLE_ENFORCE_GE(ring_id, 0, platform::errors::OutOfRange( - "ring id must >= 0, but got %d", ring_id)); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::OutOfRange("ring id must >= 0, but got %d", ring_id)); PADDLE_ENFORCE_LT(ring_id, comm_events_.size(), platform::errors::OutOfRange( "ring id must < comm events size," diff --git a/paddle/fluid/imperative/infer_var_type_context.h b/paddle/fluid/imperative/infer_var_type_context.h index 297ec840db4..079e180c2a7 100644 --- a/paddle/fluid/imperative/infer_var_type_context.h +++ b/paddle/fluid/imperative/infer_var_type_context.h @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/framework/var_type_inference.h" #include "paddle/fluid/imperative/type_defs.h" diff --git a/paddle/fluid/imperative/jit/program_desc_tracer.cc b/paddle/fluid/imperative/jit/program_desc_tracer.cc index 35ff262fe3d..e0f52beb6e5 100644 --- a/paddle/fluid/imperative/jit/program_desc_tracer.cc +++ b/paddle/fluid/imperative/jit/program_desc_tracer.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/imperative/jit/program_desc_tracer.h" + #include "paddle/fluid/framework/convert_utils.h" namespace paddle { diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 76f64ab73a6..7357db4e200 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -16,7 +16,6 @@ #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/framework/convert_utils.h" - #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/imperative/infer_var_type_context.h" #include "paddle/fluid/imperative/op_base.h" @@ -284,9 +283,10 @@ std::shared_ptr VarBase::NewVarBase(const platform::Place& dst_place, PADDLE_ENFORCE_EQ( Var().IsInitialized() && (Var().IsType() || Var().IsType()), - true, platform::errors::InvalidArgument( - "Variable is not initialized or Variable's type is not " - "LoDTensor or SelectedRows when getting numpy tensor")); + true, + platform::errors::InvalidArgument( + "Variable is not initialized or Variable's type is not " + "LoDTensor or SelectedRows when getting numpy tensor")); if (Var().IsType()) { auto& src_tensor = Var().Get(); diff --git a/paddle/fluid/imperative/layout_autotune.cc b/paddle/fluid/imperative/layout_autotune.cc index ed0526eaad3..e936505b2ae 100644 --- a/paddle/fluid/imperative/layout_autotune.cc +++ b/paddle/fluid/imperative/layout_autotune.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/imperative/layout_autotune.h" + #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/imperative/layout_transformer.h" #include "paddle/phi/backends/gpu/gpu_info.h" @@ -119,8 +120,9 @@ paddle::imperative::NameVarMap AutoTuneLayout( LayoutAutoTune::Instance().SetDesiredLayout(DataLayout::NHWC); VLOG(3) << "Tune the layout from " << BOOST_GET_CONST(std::string, (*attrs)["data_format"]) - << " to " << paddle::framework::DataLayoutToString( - LayoutAutoTune::Instance().GetDesiredLayout()); + << " to " + << paddle::framework::DataLayoutToString( + LayoutAutoTune::Instance().GetDesiredLayout()); } else { LayoutAutoTune::Instance().DisableLayoutAutoTune(); return ins; diff --git a/paddle/fluid/imperative/layout_autotune.h b/paddle/fluid/imperative/layout_autotune.h index df3772b826d..2da368910e6 100644 --- a/paddle/fluid/imperative/layout_autotune.h +++ b/paddle/fluid/imperative/layout_autotune.h @@ -14,8 +14,10 @@ #pragma once #include + #include #include + #include "paddle/fluid/framework/type_defs.h" #include "paddle/phi/common/layout.h" diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index e9d987cc704..4a0dcb1b3bb 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -22,6 +22,7 @@ #ifdef PADDLE_WITH_NCCL #include + #include "paddle/fluid/platform/dynload/nccl.h" #endif @@ -159,8 +160,9 @@ paddle::platform::DeviceContext *NCCLParallelContext::GetDeviceContext( } void NCCLParallelContext::WaitCompute(int ring_id) { - PADDLE_ENFORCE_GE(ring_id, 0, platform::errors::OutOfRange( - "ring id must >= 0, but got %d", ring_id)); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::OutOfRange("ring id must >= 0, but got %d", ring_id)); PADDLE_ENFORCE_LT(ring_id, compute_events_.size(), platform::errors::OutOfRange( "ring id must < compute events size," @@ -185,8 +187,9 @@ void NCCLParallelContext::WaitCompute(int ring_id) { } void NCCLParallelContext::WaitComm(int ring_id) { - PADDLE_ENFORCE_GE(ring_id, 0, platform::errors::OutOfRange( - "ring id must >= 0, but got 
%d", ring_id)); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::OutOfRange("ring id must >= 0, but got %d", ring_id)); PADDLE_ENFORCE_LT(ring_id, comm_events_.size(), platform::errors::OutOfRange( "ring id must < comm events size," diff --git a/paddle/fluid/imperative/op_base.h b/paddle/fluid/imperative/op_base.h index b8a616ae67d..ba0221a1729 100644 --- a/paddle/fluid/imperative/op_base.h +++ b/paddle/fluid/imperative/op_base.h @@ -20,6 +20,7 @@ #include #include #include + #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/imperative/saved_variable_wrapper_list.h" #include "paddle/fluid/imperative/type_defs.h" diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc index f2f64d92a23..a4baca6f257 100644 --- a/paddle/fluid/imperative/partial_grad_engine.cc +++ b/paddle/fluid/imperative/partial_grad_engine.cc @@ -24,6 +24,7 @@ #include #include #include + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/imperative/gradient_accumulator.h" #include "paddle/fluid/imperative/layer.h" diff --git a/paddle/fluid/imperative/partial_grad_engine.h b/paddle/fluid/imperative/partial_grad_engine.h index b5da39f8d42..4ec6cdb3fcd 100644 --- a/paddle/fluid/imperative/partial_grad_engine.h +++ b/paddle/fluid/imperative/partial_grad_engine.h @@ -16,6 +16,7 @@ #include #include + #include "paddle/fluid/imperative/engine.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index cfd3813d60d..ac997557863 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -258,7 +258,7 @@ PreparedOp PrepareImpl( #if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) && !is_xpu_unsupport #endif - ) { + ) { VLOG(6) << "Dynamic mode PrepareImpl - kernel name: " << pt_kernel_name << " | kernel key: " << pt_kernel_key << " | kernel: " << phi_kernel; @@ -306,7 +306,7 @@ PreparedOp PrepareImpl( #if defined(PADDLE_WITH_XPU_KP) || (is_xpu_unsupport && !is_xpu_kp_support) #endif - ) { + ) { if (has_phi_kernel) { auto pt_cpu_kernel_key = FallBackToCpu(expected_kernel_key, pt_kernel_key, op); diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index ccc8d64517f..0c2d70dfe3c 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -19,6 +19,7 @@ #include #include "paddle/fluid/eager/eager_tensor.h" +#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/op_kernel_type.h" #include "paddle/fluid/framework/operator.h" @@ -28,8 +29,6 @@ #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/imperative/var_helper.h" - -#include "paddle/fluid/framework/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_context.h" #include "paddle/phi/core/selected_rows.h" diff --git a/paddle/fluid/imperative/profiler.cc b/paddle/fluid/imperative/profiler.cc index 48af63056c5..097f62fe422 100644 --- a/paddle/fluid/imperative/profiler.cc +++ b/paddle/fluid/imperative/profiler.cc @@ -18,7 +18,9 @@ #include "gperftools/profiler.h" #endif #include + #include // NOLINT + #include "paddle/fluid/platform/flags.h" PADDLE_DEFINE_EXPORTED_string( diff --git a/paddle/fluid/imperative/py_layer_fwd.h b/paddle/fluid/imperative/py_layer_fwd.h 
index 2d7d3192038..f5951a52d71 100644 --- a/paddle/fluid/imperative/py_layer_fwd.h +++ b/paddle/fluid/imperative/py_layer_fwd.h @@ -16,12 +16,12 @@ #include #include -#include "paddle/fluid/imperative/layer.h" -#include "paddle/fluid/imperative/prepared_operator.h" -#include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/imperative/prepared_operator.h" +#include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/operators/py_layer_op.h" namespace paddle { diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index c7fd2215eb4..47d7b6366f7 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -18,13 +18,10 @@ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/imperative/layer.h" -#include "paddle/fluid/string/string_helper.h" - +#include "paddle/fluid/imperative/parallel_context.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" - -#include "paddle/fluid/imperative/parallel_context.h" - +#include "paddle/fluid/string/string_helper.h" #include "paddle/phi/core/dense_tensor.h" namespace paddle { namespace imperative { @@ -452,8 +449,9 @@ void Reducer::InitializeDenseGroups( "Tensor %s is not initialized.", var_name)); const auto size = lod_tensor->numel(); PADDLE_ENFORCE_GT( - size, 0, platform::errors::PreconditionNotMet( - "The number of tensor %s's elements is 0.", var_name)); + size, 0, + platform::errors::PreconditionNotMet( + "The number of tensor %s's elements is 0.", var_name)); all_length += size; p_group->length_.push_back(size); diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h index 9fac4b41cbd..852d8cf076a 100644 --- a/paddle/fluid/imperative/reducer.h +++ b/paddle/fluid/imperative/reducer.h @@ -14,6 +14,7 @@ #pragma once #include + #include #include #include diff --git a/paddle/fluid/imperative/tests/bkcl_context_test.cc b/paddle/fluid/imperative/tests/bkcl_context_test.cc index 580d86b1696..b4d299ba829 100644 --- a/paddle/fluid/imperative/tests/bkcl_context_test.cc +++ b/paddle/fluid/imperative/tests/bkcl_context_test.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include // NOLINT - #include "paddle/fluid/imperative/bkcl_context.h" +#include // NOLINT + #include "gtest/gtest.h" namespace imperative = paddle::imperative; diff --git a/paddle/fluid/imperative/tests/cncl_context_test.cc b/paddle/fluid/imperative/tests/cncl_context_test.cc index 1d5ee8e7fc8..1019d4eacdc 100644 --- a/paddle/fluid/imperative/tests/cncl_context_test.cc +++ b/paddle/fluid/imperative/tests/cncl_context_test.cc @@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/imperative/cncl_context.h" + #include // NOLINT +#include "gtest/gtest.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/imperative/cncl_context.h" #include "paddle/fluid/platform/gen_comm_id_helper.h" -#include "gtest/gtest.h" - namespace imperative = paddle::imperative; namespace platform = paddle::platform; namespace framework = paddle::framework; diff --git a/paddle/fluid/imperative/tests/heter_ccl_context_test.cc b/paddle/fluid/imperative/tests/heter_ccl_context_test.cc index 91f38f82ed0..67059916d03 100644 --- a/paddle/fluid/imperative/tests/heter_ccl_context_test.cc +++ b/paddle/fluid/imperative/tests/heter_ccl_context_test.cc @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/imperative/heter_ccl_context.h" + #include #include // NOLINT +#include "gtest/gtest.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/imperative/heter_ccl_context.h" - -#include "gtest/gtest.h" namespace imperative = paddle::imperative; namespace platform = paddle::platform; diff --git a/paddle/fluid/imperative/tests/nccl_context_test.cc b/paddle/fluid/imperative/tests/nccl_context_test.cc index 9ee083626c5..48479e1412b 100644 --- a/paddle/fluid/imperative/tests/nccl_context_test.cc +++ b/paddle/fluid/imperative/tests/nccl_context_test.cc @@ -12,15 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/imperative/nccl_context.h" + #include // NOLINT +#include "gtest/gtest.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/imperative/nccl_context.h" #include "paddle/fluid/platform/gen_comm_id_helper.h" -#include "gtest/gtest.h" - namespace imperative = paddle::imperative; namespace platform = paddle::platform; namespace framework = paddle::framework; diff --git a/paddle/fluid/imperative/tests/test_eager.cc b/paddle/fluid/imperative/tests/test_eager.cc index 3def103ae9a..1d6ec733075 100644 --- a/paddle/fluid/imperative/tests/test_eager.cc +++ b/paddle/fluid/imperative/tests/test_eager.cc @@ -88,8 +88,9 @@ TEST(test_var_helper, eager_var_helper) { egr_tensor, framework::OpKernelType(framework::proto::VarType::FP32, platform::CPUPlace())); SetCachedValue( - egr_tensor, framework::OpKernelType(framework::proto::VarType::FP32, - platform::CPUPlace()), + egr_tensor, + framework::OpKernelType(framework::proto::VarType::FP32, + platform::CPUPlace()), egr_tensor2); ASSERT_ANY_THROW(GetPlace(egr_tensor2)); ASSERT_ANY_THROW(SetType( diff --git a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc index 88b18a4c176..d2e768d6ef1 100644 --- a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc +++ b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc @@ -384,7 +384,7 @@ static void TestGradientAccumulatorTestUnchangeInput( for (auto use_tensor2 : use_tensors) { /** g_accum1 && g_accum2: has not been initialized * test accumulate on this graph - */ + */ auto g_var1 = std::make_shared("g_var1"); g_var1->SetOverridedStopGradient(false); auto g_accum1 = CreateAccumulator(g_var1, sort_gradient); @@ -437,7 +437,7 @@ static void TestGradientAccumulatorTestUnchangeInput( /** g_accum3 && g_accum4: has been initialized * test accumulate on 
previous graph - */ + */ auto var3 = create_var(use_tensor1); auto var_wrapper3_3 = std::make_shared("tmp1_3"); auto var_wrapper4_3 = std::make_shared("tmp2_3"); diff --git a/paddle/fluid/imperative/tests/test_group.cc b/paddle/fluid/imperative/tests/test_group.cc index 5e674af1a08..0025103c531 100644 --- a/paddle/fluid/imperative/tests/test_group.cc +++ b/paddle/fluid/imperative/tests/test_group.cc @@ -14,8 +14,8 @@ #include #include -#include "gtest/gtest.h" +#include "gtest/gtest.h" #include "paddle/fluid/imperative/reducer.h" namespace paddle { diff --git a/paddle/fluid/imperative/tests/test_prepare_op.cc b/paddle/fluid/imperative/tests/test_prepare_op.cc index 4cda3f32fdf..cfda7a0cac4 100644 --- a/paddle/fluid/imperative/tests/test_prepare_op.cc +++ b/paddle/fluid/imperative/tests/test_prepare_op.cc @@ -17,9 +17,11 @@ // #include + #include #include #include + #include "gtest/gtest.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/imperative/prepared_operator.h" diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 350263bc545..2295ea4bf67 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/fluid/imperative/tracer.h" + #include #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/imperative/amp_auto_cast.h" #include "paddle/fluid/imperative/execution_context.h" diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 4e671d52457..b9048c48470 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -21,6 +21,7 @@ #include #include #include + #include "ThreadPool.h" #include "paddle/fluid/framework/garbage_collector.h" #include "paddle/fluid/imperative/amp_auto_cast.h" diff --git a/paddle/fluid/imperative/var_helper.h b/paddle/fluid/imperative/var_helper.h index 9ce456b1103..91788e73fa5 100644 --- a/paddle/fluid/imperative/var_helper.h +++ b/paddle/fluid/imperative/var_helper.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/variable.h" namespace egr { diff --git a/paddle/fluid/inference/analysis/analysis_pass.h b/paddle/fluid/inference/analysis/analysis_pass.h index 14a1c3eea34..a95498d82d0 100644 --- a/paddle/fluid/inference/analysis/analysis_pass.h +++ b/paddle/fluid/inference/analysis/analysis_pass.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include #include diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index be7d6ab8680..2b56f8e00d6 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "paddle/fluid/inference/analysis/analyzer.h" + #include #include + #include "paddle/fluid/inference/analysis/passes/passes.h" #include "paddle/fluid/string/pretty_log.h" diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index 4db54706285..95a985158e6 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -37,6 +37,7 @@ limitations under the License. 
*/ #include #include + #include "gflags/gflags.h" #include "paddle/fluid/inference/analysis/analysis_pass.h" #include "paddle/fluid/inference/analysis/flags.h" diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 3f96fd69e4e..84fcd4e3c39 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/inference/analysis/analyzer.h" - #include #include + +#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" diff --git a/paddle/fluid/inference/analysis/dot.h b/paddle/fluid/inference/analysis/dot.h index 6d883f55870..619e3461d3e 100644 --- a/paddle/fluid/inference/analysis/dot.h +++ b/paddle/fluid/inference/analysis/dot.h @@ -20,6 +20,7 @@ #pragma once #include + #include #include #include diff --git a/paddle/fluid/inference/analysis/dot_tester.cc b/paddle/fluid/inference/analysis/dot_tester.cc index c785a312bf9..0b669093a1f 100644 --- a/paddle/fluid/inference/analysis/dot_tester.cc +++ b/paddle/fluid/inference/analysis/dot_tester.cc @@ -12,11 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/inference/analysis/dot.h" - #include + #include +#include "paddle/fluid/inference/analysis/dot.h" + namespace paddle { namespace inference { namespace analysis { diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index 88ae61ff1fc..f9520165161 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include #include #include @@ -72,8 +73,9 @@ struct DataTypeNamer { template const std::string &repr() const { auto x = std::type_index(typeid(T)); - PADDLE_ENFORCE_GT(dic_.count(x), 0, platform::errors::PreconditionNotMet( - "unknown type for representation")); + PADDLE_ENFORCE_GT(dic_.count(x), 0, + platform::errors::PreconditionNotMet( + "unknown type for representation")); return dic_.at(x); } diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index c5c60564b0f..6c74d7b738c 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/inference/analysis/ir_pass_manager.h" + #include #include #include @@ -20,6 +21,7 @@ #include #include #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/scope.h" @@ -29,8 +31,8 @@ namespace paddle { namespace inference { namespace analysis { -using string::PrettyLogEndl; using string::PrettyLog; +using string::PrettyLogEndl; using string::Style; IRPassManager::IRPassManager(Argument *argument) { diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.h b/paddle/fluid/inference/analysis/ir_pass_manager.h index 823dc8907ea..9f9a5fc3471 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.h +++ b/paddle/fluid/inference/analysis/ir_pass_manager.h @@ -27,6 +27,7 @@ #include #include #include + #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/program_desc.h" diff --git a/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc index 8f789139af9..b2a07722829 100644 --- a/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc @@ -11,19 +11,19 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#include -#include -#include +#include "paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h" +#include #include #include +#include +#include #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/subgraph_detector.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/ir_passes/dlnne_reg_py.h" -#include "paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h" #include "paddle/fluid/string/pretty_log.h" namespace paddle { @@ -52,18 +52,39 @@ using framework::ir::Node; void analysis::DlnneSubgraphPass::ApplyImpl(framework::ir::Graph *graph) const { static std::unordered_set teller_set{ - "mul", "matmul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", - "hard_swish", "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", - "elementwise_add", "elementwise_mul", "dropout", "prelu", - "conv2d_transpose", "leaky_relu", + "mul", + "matmul", + "conv2d", + "pool2d", + "relu", + "softmax", + "sigmoid", + "hard_swish", + "depthwise_conv2d", + "batch_norm", + "concat", + "tanh", + "pad", + "elementwise_add", + "elementwise_mul", + "dropout", + "prelu", + "conv2d_transpose", + "leaky_relu", // "fc", - "shuffle_channel", "swish", "split", + "shuffle_channel", + "swish", + "split", // "instance_norm", "gelu", // "layer_norm", // "scale", // "stack", - "relu6", "reshape2", "transpose2", "concat", "slice", + "relu6", + "reshape2", + "transpose2", + "concat", + "slice", }; framework::ir::FusePassBase::Init("dlnne_subgraph_pass", graph); diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc index 083fc899119..b5ddacd440e 100644 --- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc @@ -12,7 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.h" + #include +#include +#include #include #include #include @@ -21,28 +25,22 @@ #include #include -#include -#include - +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/subgraph_detector.h" #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/inference/lite/engine.h" #include "paddle/fluid/inference/lite/op_teller.h" #include "paddle/fluid/inference/utils/singleton.h" - -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" -#include "paddle/fluid/framework/ir/subgraph_detector.h" -#include "paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.h" #include "paddle/fluid/string/pretty_log.h" -#include "paddle/fluid/inference/lite/engine.h" - namespace paddle { namespace inference { namespace analysis { -using framework::ir::Node; using framework::ir::Agent; -using framework::ir::SubGraphFuser; using framework::ir::Graph; +using framework::ir::Node; +using framework::ir::SubGraphFuser; namespace lite { diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.h index e79a64f0f72..198a86c185b 100644 --- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.h +++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.h @@ -14,10 +14,12 @@ #pragma once #include + #include #include #include #include + #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h" diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass_tester.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass_tester.cc index 90ad7ec0b44..8c88e2869cc 100644 --- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass_tester.cc +++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass_tester.cc @@ -12,8 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.h" #include + +#include "paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.h" #include "paddle/fluid/inference/io.h" #include "paddle/fluid/inference/lite/op_teller.h" @@ -29,7 +30,7 @@ void AppendLiteSubBlocks(const std::vector& subgraph_ops, framework::ProgramDesc* engine_program, framework::ProgramDesc* host_program, const int32_t host_sub_id); -} +} // namespace lite TEST(LiteSubgraphPass, basic) { framework::ProgramDesc host_program; diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc index 34192965297..05bda4e75c9 100644 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc @@ -13,8 +13,10 @@ // limitations under the License. 
#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h" + #include #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/inference/analysis/ir_pass_manager.h" diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h index 2c2113c06d9..fca431b5d77 100644 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/inference/analysis/analysis_pass.h" namespace paddle { diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc index 321716b1c8a..fca5e256342 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h" + #include #include + #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/inference/io.h" diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h index adbde0433fa..e7ef23e791e 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/analysis/analysis_pass.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc index 0f3633ca6fa..999fb4ad8d7 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h" + #include "paddle/fluid/framework/ir/graph_to_program_pass.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/program_desc.h" diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h index 613eb04497e..5b20667d62a 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/inference/analysis/analysis_pass.h" namespace paddle { diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc index 614eea24a0e..a0c7a94cd1b 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h" + #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/lod_tensor.h" diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc index 3fa417c2ea6..70620e8692c 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc @@ -61,7 +61,8 @@ void MemoryOptimizePass::CollectLifeCycle( auto reads = op_node->inputs; auto writes = op_node->outputs; - std::vector requires(reads.begin(), reads.end()); + std::vector + requires(reads.begin(), reads.end()); requires.insert(requires.end(), writes.begin(), writes.end()); // Disable reuse of feed variables. diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h index 8ca5ffa2581..5dcd8b1059e 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h @@ -35,16 +35,15 @@ namespace inference { namespace analysis { /* Memory optimization. -* We will perform the following operation: -* 1. Collect all var's lifetime. -* 2. Make reuse plan: the vars can be reused if there is no overlap(on lifetime) -* between -* them. -* The final plan is a mapping table in which the key represents the original -* name of var and the value in the table represents the current name of var. -* 3. Perform reuse plan: Replace all var's name in the model according to the -* mapping table. -*/ + * We will perform the following operation: + * 1. Collect all var's lifetime. + * 2. Make reuse plan: the vars can be reused if there is no overlap(on + * lifetime) between them. The final plan is a mapping table in which the key + * represents the original name of var and the value in the table represents the + * current name of var. + * 3. Perform reuse plan: Replace all var's name in the model according to the + * mapping table. + */ class MemoryOptimizePass : public AnalysisPass { public: using space_table_t = std::unordered_map; diff --git a/paddle/fluid/inference/analysis/passes/passes.cc b/paddle/fluid/inference/analysis/passes/passes.cc index ca0b25c29d4..19aab1a948d 100644 --- a/paddle/fluid/inference/analysis/passes/passes.cc +++ b/paddle/fluid/inference/analysis/passes/passes.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/inference/analysis/passes/passes.h" + #include "paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.h" #include "paddle/fluid/inference/analysis/passes/inference_op_replace_pass.h" #include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h" diff --git a/paddle/fluid/inference/analysis/passes/passes.h b/paddle/fluid/inference/analysis/passes/passes.h index 8a13091d083..b3b240c280c 100644 --- a/paddle/fluid/inference/analysis/passes/passes.h +++ b/paddle/fluid/inference/analysis/passes/passes.h @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/inference/analysis/analysis_pass.h" namespace paddle { diff --git a/paddle/fluid/inference/analysis/ut_helper.h b/paddle/fluid/inference/analysis/ut_helper.h index 56565c8f3f7..6c7690a4779 100644 --- a/paddle/fluid/inference/analysis/ut_helper.h +++ b/paddle/fluid/inference/analysis/ut_helper.h @@ -14,8 +14,10 @@ limitations under the License. 
*/ #pragma once #include + #include #include + #include "gflags/gflags.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/inference/analysis/helper.h" diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 5bb26d8f080..c23397a0828 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -15,6 +15,7 @@ #include #include #include + #include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/inference/api/paddle_pass_builder.h" #include "paddle/fluid/inference/utils/table_printer.h" @@ -1105,8 +1106,9 @@ LiteNNAdapterConfig &LiteNNAdapterConfig::SetModelCacheBuffers( platform::errors::InvalidArgument( "model_cache_buffer should not be empty.")); PADDLE_ENFORCE_EQ(nnadapter_model_cache_buffers.count(model_cache_token), - false, platform::errors::InvalidArgument( - "model_cache_token has already been set.")); + false, + platform::errors::InvalidArgument( + "model_cache_token has already been set.")); nnadapter_model_cache_buffers[model_cache_token] = model_cache_buffer; return *this; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index b40377855bd..5f9051ff2fd 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -83,9 +83,9 @@ namespace paddle { using inference::Singleton; #if PADDLE_WITH_TENSORRT -using inference::tensorrt::TRTInt8Calibrator; using inference::tensorrt::TRTCalibratorEngine; using inference::tensorrt::TRTCalibratorEngineManager; +using inference::tensorrt::TRTInt8Calibrator; #endif int AnalysisPredictor::clone_num_ = 1; @@ -1027,8 +1027,9 @@ void AnalysisPredictor::OptimizeInferenceProgram() { } template <> -std::unique_ptr CreatePaddlePredictor< - AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) { +std::unique_ptr +CreatePaddlePredictor( + const AnalysisConfig &config) { // TODO(NHZlX): Should add the link to the doc of // paddle_infer::CreatePredictor if (config.glog_info_disabled()) { diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index e96526730fd..1cfdaf1a558 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -41,7 +41,7 @@ using float16 = paddle::platform::float16; namespace experimental { class InternalUtils; }; -} +} // namespace paddle_infer /// /// \file analysis_predictor.h /// @@ -55,10 +55,10 @@ class InternalUtils; namespace paddle { -using inference::analysis::Argument; -using inference::analysis::Analyzer; -using framework::proto::ProgramDesc; using framework::NaiveExecutor; +using framework::proto::ProgramDesc; +using inference::analysis::Analyzer; +using inference::analysis::Argument; /// /// \class AnalysisPredictor diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index e8a1384166a..f16054565a7 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -18,7 +18,9 @@ #endif #include #include + #include // NOLINT + #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/inference/api/helper.h" diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc index e2befadf0a8..9e4633774a2 100644 --- 
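analysis_predictor.h also picks up the namespace-trailer convention: a bare } that closes a namespace gains a // namespace <name> comment, which makes long files easier to scan. In sketch form (the patch adds only the outer trailer here; the inner one is shown for symmetry):

namespace paddle_infer {
namespace experimental {
class InternalUtils;
}  // namespace experimental
}  // namespace paddle_infer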
a/paddle/fluid/inference/api/api.cc +++ b/paddle/fluid/inference/api/api.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "gflags/gflags.h" #include "paddle/fluid/framework/commit.h" #include "paddle/fluid/framework/lod_tensor.h" diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 1c4369af646..38960aecb70 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -12,13 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/inference/api/api_impl.h" + #include + #include #include #include #include "paddle/fluid/framework/feed_fetch_method.h" -#include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/place.h" @@ -348,8 +350,9 @@ bool NativePaddlePredictor::GetFetch(std::vector *outputs, } template <> -std::unique_ptr CreatePaddlePredictor< - NativeConfig, PaddleEngineKind::kNative>(const NativeConfig &config) { +std::unique_ptr +CreatePaddlePredictor( + const NativeConfig &config) { // TODO(NHZlX): Should add the link to the doc of // paddle_infer::CreatePredictor VLOG(3) << "create NativePaddlePredictor"; diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h index b91eff4573e..d503d258139 100644 --- a/paddle/fluid/inference/api/api_impl.h +++ b/paddle/fluid/inference/api/api_impl.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include #include #include diff --git a/paddle/fluid/inference/api/api_tester.cc b/paddle/fluid/inference/api/api_tester.cc index 46724fa6b1a..1faf46fad2b 100644 --- a/paddle/fluid/inference/api/api_tester.cc +++ b/paddle/fluid/inference/api/api_tester.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include #include diff --git a/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc index ef5c08cd041..f9ac07a8304 100644 --- a/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc @@ -17,7 +17,9 @@ limitations under the License. */ */ #include // use glog instead of CHECK to avoid importing other paddle header files. + #include + #include "gflags/gflags.h" #include "utils.h" // NOLINT diff --git a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc index 9edb4ecbfd2..551b66fcaf7 100644 --- a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc @@ -17,6 +17,7 @@ limitations under the License. */ */ #include // use glog instead of CHECK to avoid importing other paddle header files. 
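The api_impl.cc hunk above shows the new break strategy for over-long declarations: rather than splitting the template argument list, the formatter puts the return type on its own line and keeps the full template-id together. Written out in full (the angle-bracketed arguments are reconstructed from the surrounding CreatePaddlePredictor declarations, not spelled out in the hunk):

template <>
std::unique_ptr<PaddlePredictor>
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
    const NativeConfig &config) {
  // function body unchanged by the patch
}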
+ #include "gflags/gflags.h" #include "utils.h" // NOLINT diff --git a/paddle/fluid/inference/api/demo_ci/utils.h b/paddle/fluid/inference/api/demo_ci/utils.h index b4f40194aa9..dfba4b8ebf6 100644 --- a/paddle/fluid/inference/api/demo_ci/utils.h +++ b/paddle/fluid/inference/api/demo_ci/utils.h @@ -14,11 +14,13 @@ #pragma once #include + #include #include #include #include #include + #include "paddle/include/paddle_inference_api.h" namespace paddle { diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc index 818444fbcb6..352efc1e63d 100644 --- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc @@ -17,6 +17,7 @@ limitations under the License. */ */ #include + #include "gflags/gflags.h" #include "utils.h" // NOLINT diff --git a/paddle/fluid/inference/api/demo_ci/windows_mobilenet.cc b/paddle/fluid/inference/api/demo_ci/windows_mobilenet.cc index 8d0538f8fa5..b1f770066e7 100644 --- a/paddle/fluid/inference/api/demo_ci/windows_mobilenet.cc +++ b/paddle/fluid/inference/api/demo_ci/windows_mobilenet.cc @@ -13,14 +13,15 @@ // limitations under the License. #include + #include #include #include #include #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/include/paddle_inference_api.h" DEFINE_string(modeldir, "", "Directory of the inference model."); diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index bb966dc5c6c..661d9def406 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -340,8 +340,9 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, #ifdef PADDLE_WITH_MKLDNN if (tensor->layout() == paddle::framework::DataLayout::kMKLDNN) paddle::framework::innerTransDataLayoutFromMKLDNN( - tensor->layout(), paddle::platform::MKLDNNDeviceContext::tls() - .get_cur_paddle_data_layout(), + tensor->layout(), + paddle::platform::MKLDNNDeviceContext::tls() + .get_cur_paddle_data_layout(), *tensor, &out, paddle::platform::CPUPlace(), true); else std::memcpy(static_cast(data), t_data, ele_num * sizeof(T)); @@ -852,8 +853,9 @@ void InternalUtils::CopyToCpuWithIoStream(paddle_infer::Tensor *t, T *data, #ifdef PADDLE_WITH_MKLDNN if (tensor->layout() == paddle::framework::DataLayout::kMKLDNN) paddle::framework::innerTransDataLayoutFromMKLDNN( - tensor->layout(), paddle::platform::MKLDNNDeviceContext::tls() - .get_cur_paddle_data_layout(), + tensor->layout(), + paddle::platform::MKLDNNDeviceContext::tls() + .get_cur_paddle_data_layout(), *tensor, &out, paddle::platform::CPUPlace(), true); else std::memcpy(static_cast(data), t_data, ele_num * sizeof(T)); diff --git a/paddle/fluid/inference/api/helper.cc b/paddle/fluid/inference/api/helper.cc index b9e0e90a403..3454c5c8fd1 100644 --- a/paddle/fluid/inference/api/helper.cc +++ b/paddle/fluid/inference/api/helper.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/inference/api/helper.h" + #include "paddle/fluid/framework/custom_operator.h" #include "paddle/fluid/framework/operator.h" #include "paddle/phi/api/ext/op_meta_info.h" diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index acc52ac0468..1c58b004e6d 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -15,6 +15,7 @@ #pragma once #include + #include #if !defined(_WIN32) #include @@ -377,8 +378,9 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid, double batch_latency, int epoch = 1, const framework::proto::VarType::Type data_type = framework::proto::VarType::FP32) { - PADDLE_ENFORCE_GT(batch_size, 0, platform::errors::InvalidArgument( - "Non-positive batch size.")); + PADDLE_ENFORCE_GT( + batch_size, 0, + platform::errors::InvalidArgument("Non-positive batch size.")); double sample_latency = batch_latency / batch_size; LOG(INFO) << "====== threads: " << num_threads << ", thread id: " << tid << " ======"; diff --git a/paddle/fluid/inference/api/infer_context.h b/paddle/fluid/inference/api/infer_context.h index b7a8bf637d8..c2a23a7ca2c 100644 --- a/paddle/fluid/inference/api/infer_context.h +++ b/paddle/fluid/inference/api/infer_context.h @@ -25,21 +25,21 @@ class InferCPUContext : public phi::CPUContext { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class InferGPUContext : public phi::GPUContext { public: - using phi::GPUContext::SetStream; - using phi::GPUContext::SetEigenDevice; using phi::GPUContext::SetBlasHandle; using phi::GPUContext::SetBlasTensorCoreHandle; using phi::GPUContext::SetBlasTF32Handle; using phi::GPUContext::SetDnnHandle; + using phi::GPUContext::SetEigenDevice; using phi::GPUContext::SetSolverHandle; using phi::GPUContext::SetSparseHandle; + using phi::GPUContext::SetStream; // using phi::GPUContext::SetDnnWorkspaceHandle; using phi::GPUContext::SetComputeCapability; + using phi::GPUContext::SetDriverVersion; + using phi::GPUContext::SetMaxGridDimSize; + using phi::GPUContext::SetMaxThreadsPerBlock; using phi::GPUContext::SetMaxThreadsPerMultiProcessor; using phi::GPUContext::SetMultiProcessors; - using phi::GPUContext::SetMaxThreadsPerBlock; - using phi::GPUContext::SetMaxGridDimSize; - using phi::GPUContext::SetDriverVersion; using phi::GPUContext::SetRuntimeVersion; }; #endif diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc index 4dc80a1d753..73096973c38 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc @@ -13,12 +13,14 @@ // limitations under the License. 
#include "paddle/fluid/inference/api/mkldnn_quantizer.h" + #include #include #include #include #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" @@ -33,10 +35,10 @@ namespace paddle { -using platform::CPUPlace; using framework::LoDTensor; using framework::Variable; using framework::ir::Graph; +using platform::CPUPlace; using ConstEigenVectorArrayMap = Eigen::Map>; using EigenMatrixDoubleArray = @@ -57,8 +59,9 @@ static void check_var(const Variable* var, const std::string& var_name) { } static void check_tensor(const LoDTensor& tensor) { - PADDLE_ENFORCE_GT(tensor.dims().size(), 0, platform::errors::InvalidArgument( - "Tensor dimension is empty.")); + PADDLE_ENFORCE_GT( + tensor.dims().size(), 0, + platform::errors::InvalidArgument("Tensor dimension is empty.")); } void AnalysisPredictor::MkldnnQuantizer::CalculateScalesForRNNWeights( @@ -531,8 +534,9 @@ AnalysisPredictor::MkldnnQuantizer::Histogram( PADDLE_ENFORCE_GE(max_val, min_val, platform::errors::InvalidArgument( "MkldnnQuantizer: To calculate Histogram, max_val (" + - std::to_string(max_val) + ") must be greater or equal" - "to min_val (" + + std::to_string(max_val) + + ") must be greater or equal" + "to min_val (" + std::to_string(min_val) + ").")); ConstEigenVectorArrayMap eigen_tensor{var_tensor.data(), var_tensor.numel(), 1}; @@ -570,7 +574,8 @@ void AnalysisPredictor::MkldnnQuantizer::PrepareArgument() const { auto* builder = predictor_.config_.pass_builder(); builder->SetPasses({ - "cpu_quantize_pass", "cpu_quantize_squash_pass", + "cpu_quantize_pass", + "cpu_quantize_squash_pass", "int8_scale_calculation_mkldnn_pass", }); if (predictor_.config_.ir_debug_) builder->TurnOnDebug(); diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.h b/paddle/fluid/inference/api/mkldnn_quantizer.h index 5e7aa39de52..811f2941a7d 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer.h +++ b/paddle/fluid/inference/api/mkldnn_quantizer.h @@ -20,6 +20,7 @@ #include #include #include + #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/api/analysis_predictor.h" diff --git a/paddle/fluid/inference/api/mkldnn_quantizer_tester.cc b/paddle/fluid/inference/api/mkldnn_quantizer_tester.cc index 2bee4763d4f..05077f8ba34 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer_tester.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer_tester.cc @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/inference/api/mkldnn_quantizer.h" #include + #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/inference/api/analysis_predictor.h" +#include "paddle/fluid/inference/api/mkldnn_quantizer.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" DEFINE_string(dirname, "", "dirname to tests."); diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.h b/paddle/fluid/inference/api/onnxruntime_predictor.h index d01756e4b96..294a83a4335 100644 --- a/paddle/fluid/inference/api/onnxruntime_predictor.h +++ b/paddle/fluid/inference/api/onnxruntime_predictor.h @@ -18,6 +18,9 @@ #include #include #include + +#include "onnxruntime_c_api.h" // NOLINT +#include "onnxruntime_cxx_api.h" // NOLINT #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/op_compatible_info.h" #include "paddle/fluid/inference/analysis/analyzer.h" @@ -27,9 +30,6 @@ #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/fluid/string/printf.h" - -#include "onnxruntime_c_api.h" // NOLINT -#include "onnxruntime_cxx_api.h" // NOLINT #include "paddle2onnx/converter.h" #ifdef PADDLE_WITH_TESTING diff --git a/paddle/fluid/inference/api/onnxruntime_predictor_tester.cc b/paddle/fluid/inference/api/onnxruntime_predictor_tester.cc index 4a702edacc9..ff8528c0850 100644 --- a/paddle/fluid/inference/api/onnxruntime_predictor_tester.cc +++ b/paddle/fluid/inference/api/onnxruntime_predictor_tester.cc @@ -12,16 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/inference/api/onnxruntime_predictor.h" - #include #include + #include #include // NOLINT #include + #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/onnxruntime_predictor.h" #include "paddle/fluid/inference/api/paddle_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index ab2265bff24..489c32bc59d 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -912,11 +912,18 @@ struct PD_INFER_DECL AnalysisConfig { bool thread_local_stream_{false}; bool use_gpu_fp16_{false}; std::unordered_set gpu_fp16_disabled_op_types_{ - "conv2d_fusion", "conv2d", "roll", "strided_slice", "depthwise_conv2d", - "unfold", "generate_proposals_v2", "nearest_interp_v2", + "conv2d_fusion", + "conv2d", + "roll", + "strided_slice", + "depthwise_conv2d", + "unfold", + "generate_proposals_v2", + "nearest_interp_v2", "bilinear_interp_v2" "yolo_box", - "multiclass_nms3", "matrix_nms"}; + "multiclass_nms3", + "matrix_nms"}; bool use_cudnn_{false}; diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index 711998e9956..78af756c24b 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -27,6 +27,7 @@ #include #include #include + #include "crypto/cipher.h" #include "paddle_infer_declare.h" // NOLINT #include "paddle_tensor.h" // NOLINT @@ -391,12 +392,14 @@ PD_INFER_DECL std::unique_ptr CreatePaddlePredictor( const ConfigT& config); template <> -PD_INFER_DECL std::unique_ptr CreatePaddlePredictor< - NativeConfig, 
PaddleEngineKind::kNative>(const NativeConfig& config); +PD_INFER_DECL std::unique_ptr +CreatePaddlePredictor( + const NativeConfig& config); template <> -PD_INFER_DECL std::unique_ptr CreatePaddlePredictor< - AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig& config); +PD_INFER_DECL std::unique_ptr +CreatePaddlePredictor( + const AnalysisConfig& config); template <> PD_INFER_DECL std::unique_ptr diff --git a/paddle/fluid/inference/api/paddle_infer_contrib.cc b/paddle/fluid/inference/api/paddle_infer_contrib.cc index d27f20a93b3..e785e91a671 100644 --- a/paddle/fluid/inference/api/paddle_infer_contrib.cc +++ b/paddle/fluid/inference/api/paddle_infer_contrib.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/inference/api/paddle_infer_contrib.h" + #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 04e77faf2e3..9e5b76db4ac 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -20,6 +20,7 @@ #include #endif #include + #include #include diff --git a/paddle/fluid/inference/api/resource_manager.h b/paddle/fluid/inference/api/resource_manager.h index c41968dc585..24e76598e40 100644 --- a/paddle/fluid/inference/api/resource_manager.h +++ b/paddle/fluid/inference/api/resource_manager.h @@ -15,6 +15,7 @@ #include #include + #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/backends/cpu/forwards.h" diff --git a/paddle/fluid/inference/capi/c_api.cc b/paddle/fluid/inference/capi/c_api.cc index 07493c742c4..f2a9838f4bc 100644 --- a/paddle/fluid/inference/capi/c_api.cc +++ b/paddle/fluid/inference/capi/c_api.cc @@ -14,6 +14,7 @@ #include #include + #include "paddle/fluid/inference/capi/c_api_internal.h" #include "paddle/fluid/inference/capi/paddle_c_api.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/inference/capi/c_api_internal.h b/paddle/fluid/inference/capi/c_api_internal.h index 7e69b721076..11728fb9878 100644 --- a/paddle/fluid/inference/capi/c_api_internal.h +++ b/paddle/fluid/inference/capi/c_api_internal.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/inference/api/paddle_api.h" #include "paddle/fluid/inference/capi/paddle_c_api.h" diff --git a/paddle/fluid/inference/capi/pd_config.cc b/paddle/fluid/inference/capi/pd_config.cc index 9bb52ba5780..2bacc94c0d1 100644 --- a/paddle/fluid/inference/capi/pd_config.cc +++ b/paddle/fluid/inference/capi/pd_config.cc @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/inference/capi/c_api_internal.h" #include "paddle/fluid/inference/capi/paddle_c_api.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/inference/capi/pd_predictor.cc b/paddle/fluid/inference/capi/pd_predictor.cc index 12d7f78e169..e88fbfc5a86 100644 --- a/paddle/fluid/inference/capi/pd_predictor.cc +++ b/paddle/fluid/inference/capi/pd_predictor.cc @@ -19,6 +19,7 @@ #include #include #include + #include "paddle/fluid/inference/api/paddle_api.h" #include "paddle/fluid/inference/capi/c_api_internal.h" #include "paddle/fluid/inference/capi/paddle_c_api.h" diff --git a/paddle/fluid/inference/capi/pd_tensor.cc b/paddle/fluid/inference/capi/pd_tensor.cc index 9b1eedd7c5a..199db92d1b0 100644 --- a/paddle/fluid/inference/capi/pd_tensor.cc 
+++ b/paddle/fluid/inference/capi/pd_tensor.cc @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/inference/capi/c_api_internal.h" #include "paddle/fluid/inference/capi/paddle_c_api.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/inference/capi_exp/lod_demo.cc b/paddle/fluid/inference/capi_exp/lod_demo.cc index 2b049e992e7..c67d6f870bd 100644 --- a/paddle/fluid/inference/capi_exp/lod_demo.cc +++ b/paddle/fluid/inference/capi_exp/lod_demo.cc @@ -27,8 +27,10 @@ #include #include #include + #include #include + #include "paddle/fluid/inference/capi_exp/pd_inference_api.h" int main(int argc, char *argv[]) { diff --git a/paddle/fluid/inference/capi_exp/pd_config.cc b/paddle/fluid/inference/capi_exp/pd_config.cc index d290f44d2ee..4e1c5a2a0dd 100644 --- a/paddle/fluid/inference/capi_exp/pd_config.cc +++ b/paddle/fluid/inference/capi_exp/pd_config.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/inference/capi_exp/pd_config.h" + #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/capi_exp/pd_types.h" #include "paddle/fluid/inference/capi_exp/utils_internal.h" diff --git a/paddle/fluid/inference/capi_exp/pd_predictor.cc b/paddle/fluid/inference/capi_exp/pd_predictor.cc index 5ca58b0e413..c85dfdf522e 100644 --- a/paddle/fluid/inference/capi_exp/pd_predictor.cc +++ b/paddle/fluid/inference/capi_exp/pd_predictor.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/inference/capi_exp/pd_predictor.h" + #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/capi_exp/pd_types.h" #include "paddle/fluid/inference/capi_exp/pd_utils.h" diff --git a/paddle/fluid/inference/capi_exp/pd_tensor.cc b/paddle/fluid/inference/capi_exp/pd_tensor.cc index 9c661dea6f2..520cfa813f4 100644 --- a/paddle/fluid/inference/capi_exp/pd_tensor.cc +++ b/paddle/fluid/inference/capi_exp/pd_tensor.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/inference/capi_exp/pd_tensor.h" + #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/capi_exp/pd_types.h" #include "paddle/fluid/inference/capi_exp/pd_utils.h" diff --git a/paddle/fluid/inference/capi_exp/pd_utils.cc b/paddle/fluid/inference/capi_exp/pd_utils.cc index efca350fbaf..7942a860c4e 100644 --- a/paddle/fluid/inference/capi_exp/pd_utils.cc +++ b/paddle/fluid/inference/capi_exp/pd_utils.cc @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/inference/capi_exp/pd_utils.h" + #include #include "paddle/fluid/inference/api/paddle_inference_api.h" -#include "paddle/fluid/inference/capi_exp/pd_utils.h" #include "paddle/fluid/inference/capi_exp/utils_internal.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Config.cpp b/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Config.cpp index 593ba3cb51d..efea093fa24 100644 --- a/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Config.cpp +++ b/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Config.cpp @@ -13,9 +13,10 @@ // limitations under the License. 
#include "com_baidu_paddle_inference_Config.h" + #include -#include "jni_convert_util.h" // NOLINT +#include "jni_convert_util.h" // NOLINT #include "pd_inference_api.h" // NOLINT JNIEXPORT void JNICALL Java_com_baidu_paddle_inference_Config_cppConfigDestroy( diff --git a/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Predictor.cpp b/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Predictor.cpp index 7eff03690ae..0912c2ad57a 100644 --- a/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Predictor.cpp +++ b/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Predictor.cpp @@ -13,7 +13,9 @@ // limitations under the License. #include "com_baidu_paddle_inference_Predictor.h" + #include + #include "jni_convert_util.h" // NOLINT #include "pd_inference_api.h" // NOLINT diff --git a/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Tensor.cpp b/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Tensor.cpp index b9be4a73ac2..a90ae165ebd 100644 --- a/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Tensor.cpp +++ b/paddle/fluid/inference/experimental/javaapi/native/com_baidu_paddle_inference_Tensor.cpp @@ -13,7 +13,9 @@ // limitations under the License. #include "com_baidu_paddle_inference_Tensor.h" + #include + #include "pd_inference_api.h" // NOLINT JNIEXPORT void JNICALL Java_com_baidu_paddle_inference_Tensor_cppTensorDestroy( diff --git a/paddle/fluid/inference/experimental/javaapi/native/jni_convert_util.h b/paddle/fluid/inference/experimental/javaapi/native/jni_convert_util.h index 0026ec2f410..c363559298f 100644 --- a/paddle/fluid/inference/experimental/javaapi/native/jni_convert_util.h +++ b/paddle/fluid/inference/experimental/javaapi/native/jni_convert_util.h @@ -17,6 +17,7 @@ #include #include + #include #include @@ -54,8 +55,8 @@ inline jstring cpp_string_to_jstring(JNIEnv *env, std::string str) { reinterpret_cast(data)); jstring encoding = env->NewStringUTF("UTF-8"); - jstring res = (jstring)( - env->NewObject(strClass, strClassInitMethodID, bytes, encoding)); + jstring res = (jstring)(env->NewObject(strClass, strClassInitMethodID, bytes, + encoding)); env->DeleteLocalRef(strClass); env->DeleteLocalRef(encoding); diff --git a/paddle/fluid/inference/io.h b/paddle/fluid/inference/io.h index 317ef9d93ac..1106ad261ec 100644 --- a/paddle/fluid/inference/io.h +++ b/paddle/fluid/inference/io.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" diff --git a/paddle/fluid/inference/lite/engine.cc b/paddle/fluid/inference/lite/engine.cc index cd78cfecd86..8f8f68b170b 100644 --- a/paddle/fluid/inference/lite/engine.cc +++ b/paddle/fluid/inference/lite/engine.cc @@ -25,6 +25,7 @@ #endif #include "paddle/fluid/inference/lite/engine.h" + #include namespace paddle { diff --git a/paddle/fluid/inference/lite/op_teller.cc b/paddle/fluid/inference/lite/op_teller.cc index 3a162c3fde1..3d2ed0a5c98 100644 --- a/paddle/fluid/inference/lite/op_teller.cc +++ b/paddle/fluid/inference/lite/op_teller.cc @@ -12,12 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "paddle/fluid/inference/lite/op_teller.h" + #include #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/inference/lite/engine.h" -#include "paddle/fluid/inference/lite/op_teller.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/lite/op_teller.h b/paddle/fluid/inference/lite/op_teller.h index b9391a98a2e..1a969f1293d 100644 --- a/paddle/fluid/inference/lite/op_teller.h +++ b/paddle/fluid/inference/lite/op_teller.h @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/framework/op_desc.h" namespace paddle { diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc index eeaa1282903..f70455f18eb 100644 --- a/paddle/fluid/inference/lite/tensor_utils.cc +++ b/paddle/fluid/inference/lite/tensor_utils.cc @@ -13,9 +13,11 @@ // limitations under the License. #include "paddle/fluid/inference/lite/tensor_utils.h" + #include #include #include + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/inference/lite/engine.h" @@ -26,9 +28,9 @@ namespace inference { namespace lite { namespace utils { -using paddle::lite_api::TargetType; -using paddle::lite_api::PrecisionType; using paddle::lite_api::DataLayoutType; +using paddle::lite_api::PrecisionType; +using paddle::lite_api::TargetType; template void SetLoD(DstLoD* dst, const SrcLoD& src) { diff --git a/paddle/fluid/inference/lite/test_engine_lite.cc b/paddle/fluid/inference/lite/test_engine_lite.cc index 85f7d3ee363..dee83f70ba2 100644 --- a/paddle/fluid/inference/lite/test_engine_lite.cc +++ b/paddle/fluid/inference/lite/test_engine_lite.cc @@ -14,14 +14,12 @@ #include -#include "paddle/fluid/inference/utils/singleton.h" - #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" - #include "paddle/fluid/inference/lite/engine.h" +#include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/operators/lite/ut_helper.h" namespace paddle { @@ -29,9 +27,9 @@ namespace inference { namespace lite { using inference::lite::AddTensorToBlockDesc; -using paddle::inference::lite::AddFetchListToBlockDesc; using inference::lite::CreateTensor; using inference::lite::serialize_params; +using paddle::inference::lite::AddFetchListToBlockDesc; void make_fake_model(std::string* model, std::string* param) { framework::ProgramDesc program; diff --git a/paddle/fluid/inference/lite/test_tensor_utils.cc b/paddle/fluid/inference/lite/test_tensor_utils.cc index b0c7c7448a5..09a6cda62b3 100644 --- a/paddle/fluid/inference/lite/test_tensor_utils.cc +++ b/paddle/fluid/inference/lite/test_tensor_utils.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include + #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/inference/lite/tensor_utils.h" @@ -21,9 +22,9 @@ namespace inference { namespace lite { namespace utils { -using paddle::lite_api::TargetType; -using paddle::lite_api::PrecisionType; using paddle::lite_api::DataLayoutType; +using paddle::lite_api::PrecisionType; +using paddle::lite_api::TargetType; TEST(LiteEngineOp, GetNativePlace) { ::testing::FLAGS_gtest_death_test_style = "threadsafe"; diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc index b86351e394b..2ef8ec16c76 100644 --- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include #include "glog/logging.h" diff --git a/paddle/fluid/inference/tensorrt/convert/deformable_conv_op.cc b/paddle/fluid/inference/tensorrt/convert/deformable_conv_op.cc index 2bbe6ea3d2f..df6c601500c 100644 --- a/paddle/fluid/inference/tensorrt/convert/deformable_conv_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/deformable_conv_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.h" diff --git a/paddle/fluid/inference/tensorrt/convert/flatten_contiguous_range_op.cc b/paddle/fluid/inference/tensorrt/convert/flatten_contiguous_range_op.cc index e08f50833ed..c293282b761 100644 --- a/paddle/fluid/inference/tensorrt/convert/flatten_contiguous_range_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/flatten_contiguous_range_op.cc @@ -50,10 +50,11 @@ class FlattenContiguousRangeOpConverter : public OpConverter { for (int i = 0, j = 0; i < dims; ++i) { if (start_axis <= i + 1 && i + 1 <= stop_axis) { int dim_i = input_dim.d[i]; - PADDLE_ENFORCE_GT(dim_i, 0, platform::errors::InvalidArgument( - "flatten_contiguous_range input dim " - "should be > 0, but got %d.", - dim_i)); + PADDLE_ENFORCE_GT(dim_i, 0, + platform::errors::InvalidArgument( + "flatten_contiguous_range input dim " + "should be > 0, but got %d.", + dim_i)); dim_prod *= dim_i; if (i + 1 == stop_axis) { flatten_dim.d[j++] = dim_prod; diff --git a/paddle/fluid/inference/tensorrt/convert/group_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/group_norm_op.cc index 910a807d362..2a62f9009e2 100644 --- a/paddle/fluid/inference/tensorrt/convert/group_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/group_norm_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" namespace paddle { diff --git a/paddle/fluid/inference/tensorrt/convert/io_converter.cc b/paddle/fluid/inference/tensorrt/convert/io_converter.cc index b468518fa5a..02e9610ea1e 100644 --- a/paddle/fluid/inference/tensorrt/convert/io_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/io_converter.cc @@ -13,15 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/inference/tensorrt/convert/io_converter.h" + #include + #include "paddle/fluid/platform/enforce.h" namespace paddle { namespace inference { namespace tensorrt { -using platform::is_gpu_place; using platform::is_cpu_place; +using platform::is_gpu_place; class DefaultIOConverter : public EngineIOConverter { public: @@ -49,8 +51,9 @@ class DefaultIOConverter : public EngineIOConverter { out, in.data(), size, cudaMemcpyHostToDevice, *stream_)); } else if (is_gpu_place(place)) { PADDLE_ENFORCE_EQ( - 0, cudaMemcpyAsync(out, in.data(), size, - cudaMemcpyDeviceToDevice, *stream_), + 0, + cudaMemcpyAsync(out, in.data(), size, cudaMemcpyDeviceToDevice, + *stream_), platform::errors::External( "cudaMemcpyAsync(cudaMemcpyDeviceToDevice) error.")); } else { @@ -78,14 +81,16 @@ class DefaultIOConverter : public EngineIOConverter { "But out's memory_size = %u, max_size = %u.", size, max_size)); if (is_cpu_place(place)) { - PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out->data(), in, size, - cudaMemcpyDeviceToHost, *stream_), + PADDLE_ENFORCE_EQ(0, + cudaMemcpyAsync(out->data(), in, size, + cudaMemcpyDeviceToHost, *stream_), platform::errors::External( "cudaMemcpyAsync(cudaMemcpyDeviceToHost) error.")); } else if (is_gpu_place(place)) { PADDLE_ENFORCE_EQ( - 0, cudaMemcpyAsync(out->data(), in, size, - cudaMemcpyDeviceToDevice, *stream_), + 0, + cudaMemcpyAsync(out->data(), in, size, + cudaMemcpyDeviceToDevice, *stream_), platform::errors::External( "cudaMemcpyAsync(cudaMemcpyDeviceToDevice) error.")); } else { diff --git a/paddle/fluid/inference/tensorrt/convert/io_converter.h b/paddle/fluid/inference/tensorrt/convert/io_converter.h index 58c178028b8..3ff78a6dc7a 100644 --- a/paddle/fluid/inference/tensorrt/convert/io_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/io_converter.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/inference/utils/singleton.h" diff --git a/paddle/fluid/inference/tensorrt/convert/multiclass_nms3_op.cc b/paddle/fluid/inference/tensorrt/convert/multiclass_nms3_op.cc index a968ea2a2c4..ae392675339 100644 --- a/paddle/fluid/inference/tensorrt/convert/multiclass_nms3_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multiclass_nms3_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" namespace paddle { diff --git a/paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc b/paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc index b0d67a5bf90..d630f7e9967 100644 --- a/paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" namespace paddle { diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index 0a99b12edc2..077ba32ba89 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include #include #include + #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/scope.h" @@ -268,14 +269,16 @@ class OpConverter { } } engine->DeclareInput( - input, FluidDataType2TRT( - var->Proto()->type().lod_tensor().tensor().data_type()), + input, + FluidDataType2TRT( + var->Proto()->type().lod_tensor().tensor().data_type()), Vec2TRT_Dims(input_shape, input, true)); #endif } else { engine->DeclareInput( - input, FluidDataType2TRT( - var->Proto()->type().lod_tensor().tensor().data_type()), + input, + FluidDataType2TRT( + var->Proto()->type().lod_tensor().tensor().data_type()), Vec2TRT_Dims(var_shape, input)); } } diff --git a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc index 46e6c18bfb8..66acee964cd 100644 --- a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" namespace paddle { diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc index 1ad82df4173..7a034f2c166 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc @@ -13,6 +13,7 @@ limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" diff --git a/paddle/fluid/inference/tensorrt/convert/test_batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/test_batch_norm_op.cc index 92e34e48bdb..caa9e9ee289 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_batch_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_batch_norm_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" diff --git a/paddle/fluid/inference/tensorrt/convert/test_concat_op.cc b/paddle/fluid/inference/tensorrt/convert/test_concat_op.cc index 6c876964297..b1319312adf 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_concat_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_concat_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" diff --git a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc index a856d141444..0b9f4a5fd84 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
diff --git a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc
index a856d141444..0b9f4a5fd84 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include
+
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
diff --git a/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc
index cf377396087..2d77b9b32db 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc
@@ -12,6 +12,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
diff --git a/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc
index 9c6ea51fe5a..5221843db19 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include
+
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
diff --git a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
index 8134d389469..4647521dd32 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include
+
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
diff --git a/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc
index 8f91309a0a0..a2fe32b75f3 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/inference/tensorrt/convert/io_converter.h"
diff --git a/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc
index f17e00de0ee..f7984dd0ab7 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include
+
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
diff --git a/paddle/fluid/inference/tensorrt/convert/test_mish_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mish_op.cc
index c84c30255fa..d2dbb7fb592 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_mish_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_mish_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include
+
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
diff --git a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
index 86cb7543d42..35b8fe1ee6a 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
diff --git a/paddle/fluid/inference/tensorrt/convert/test_nearest_interp_v2_op.cc b/paddle/fluid/inference/tensorrt/convert/test_nearest_interp_v2_op.cc
index f5ab6a99249..96b14c4e40c 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_nearest_interp_v2_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_nearest_interp_v2_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include
+
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
index 9bfae64fe80..9a4d4db3435 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
@@ -12,11 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-
 #include  // NOLINT

 #include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"

 namespace paddle {
 namespace inference {
diff --git a/paddle/fluid/inference/tensorrt/convert/test_pad_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pad_op.cc
index ba35d7ddbb2..a8e36f827d8 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_pad_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_pad_op.cc
@@ -13,6 +13,7 @@ limitations under the License. */
 #include
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
diff --git a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
index 36f13262a73..b917aa865d2 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
@@ -12,7 +12,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include
+
 #include
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
diff --git a/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc
index f2541ff7c0b..d71cf051972 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include
+
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
diff --git a/paddle/fluid/inference/tensorrt/convert/test_shuffle_channel_op.cc b/paddle/fluid/inference/tensorrt/convert/test_shuffle_channel_op.cc
index 3ebb51afdf4..b5e640ea244 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_shuffle_channel_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_shuffle_channel_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include
+
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
diff --git a/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc
index 9cd5e811415..babe682ab4e 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc
@@ -12,6 +12,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
diff --git a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc
index 3b6a4a80044..1d23aeedc5a 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include
+
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
diff --git a/paddle/fluid/inference/tensorrt/convert/test_swish_op.cc b/paddle/fluid/inference/tensorrt/convert/test_swish_op.cc
index 7a5a886affe..94ca6f0ed46 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_swish_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_swish_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include
+
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
diff --git a/paddle/fluid/inference/tensorrt/convert/unary_op.cc b/paddle/fluid/inference/tensorrt/convert/unary_op.cc
index aa3d38ebe20..72d5cb2aeb4 100644
--- a/paddle/fluid/inference/tensorrt/convert/unary_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/unary_op.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include
+
 #include
+
 #include "glog/logging.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
diff --git a/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc b/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc
index 17d217dff43..f5ab63daa88 100644
--- a/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include
+
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h"
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index 00a6b2ffbf9..7f308fd3a04 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include
 #include
+
 #include
 #include "cuda_runtime_api.h"  // NOLINT
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index 598d751ad5f..b28fe827156 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once

 #include
+
 #include
 #include
 #include  // NOLINT
@@ -151,7 +152,7 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector& shape, std::string input,
     return dims;
   }
 }
-}  // NOLINT
+}  // namespace

 class TRTInt8Calibrator;
diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h
index b8051d86104..e283000cdac 100644
--- a/paddle/fluid/inference/tensorrt/helper.h
+++ b/paddle/fluid/inference/tensorrt/helper.h
@@ -17,9 +17,11 @@
 #include
 #include
 #include
+
 #include
 #include
 #include
+
 #include "paddle/fluid/platform/dynload/tensorrt.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
index 79a5e7d7a6a..dc7c77bc66a 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -13,7 +13,9 @@
 // limitations under the License.

 #include "paddle/fluid/inference/tensorrt/op_teller.h"
+
 #include
+
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/data_layout.h"
diff --git a/paddle/fluid/inference/tensorrt/op_teller.h b/paddle/fluid/inference/tensorrt/op_teller.h
index 0a0cbeae51b..40f1a0055c7 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.h
+++ b/paddle/fluid/inference/tensorrt/op_teller.h
@@ -17,6 +17,7 @@
 #include
 #include
 #include
+
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
diff --git a/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu
index e5584f26580..a339f880ac3 100644
--- a/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu
@@ -14,6 +14,7 @@
 #include
 #include
+
 #include
 #include
diff --git a/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu
index 6128f8f0e41..7ea664ded66 100644
--- a/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu
@@ -14,6 +14,7 @@ limitations under the License. */
 #include
 #include
+
 #include
 #include
@@ -88,9 +89,10 @@ DeformableConvPlugin::DeformableConvPlugin(
   dilations_.insert(dilations_.end(), dilations.cbegin(), dilations.cend());
   PADDLE_ENFORCE_EQ(data_type_ == nvinfer1::DataType::kFLOAT ||
                         data_type_ == nvinfer1::DataType::kHALF,
-                    true, platform::errors::InvalidArgument(
-                              "The DeformableConv TRT Plugin's input type "
-                              "should be float or half."));
+                    true,
+                    platform::errors::InvalidArgument(
+                        "The DeformableConv TRT Plugin's input type "
+                        "should be float or half."));
   PADDLE_ENFORCE_EQ(
       paddings_.size(), strides_.size(),
       platform::errors::InvalidArgument(
@@ -124,9 +126,10 @@ DeformableConvPlugin::DeformableConvPlugin(
   output_dim_.insert(output_dim_.end(), output_dim.cbegin(), output_dim.cend());
   PADDLE_ENFORCE_EQ(data_type_ == nvinfer1::DataType::kFLOAT ||
                         data_type_ == nvinfer1::DataType::kHALF,
-                    true, platform::errors::InvalidArgument(
-                              "The DeformableConv TRT Plugin's input type "
-                              "should be float or half."));
+                    true,
+                    platform::errors::InvalidArgument(
+                        "The DeformableConv TRT Plugin's input type "
+                        "should be float or half."));
   PADDLE_ENFORCE_EQ(
       paddings_.size(), strides_.size(),
       platform::errors::InvalidArgument(
@@ -363,13 +366,11 @@ __global__ void ModulatedDeformableIm2colGpuKernel(
     const float* data_im_ptr =
         data_im + (b_col * num_channels + c_im) * height * width;
     const float* data_offset_ptr =
-        data_offset +
-        (b_col * deformable_group + deformable_group_index) * 2 * kernel_h *
-            kernel_w * height_col * width_col;
+        data_offset + (b_col * deformable_group + deformable_group_index) * 2 *
+                          kernel_h * kernel_w * height_col * width_col;
     const float* data_mask_ptr =
-        data_mask +
-        (b_col * deformable_group + deformable_group_index) * kernel_h *
-            kernel_w * height_col * width_col;
+        data_mask + (b_col * deformable_group + deformable_group_index) *
+                        kernel_h * kernel_w * height_col * width_col;

     for (int i = 0; i < kernel_h; ++i) {
       for (int j = 0; j < kernel_w; ++j) {
@@ -432,13 +433,11 @@ __global__ void ModulatedDeformableIm2colGpuKernel(
     const half* data_im_ptr =
         data_im + (b_col * num_channels + c_im) * height * width;
     const half* data_offset_ptr =
-        data_offset +
-        (b_col * deformable_group + deformable_group_index) * 2 * kernel_h *
-            kernel_w * height_col * width_col;
+        data_offset + (b_col * deformable_group + deformable_group_index) * 2 *
+                          kernel_h * kernel_w * height_col * width_col;
     const half* data_mask_ptr =
-        data_mask +
-        (b_col * deformable_group + deformable_group_index) * kernel_h *
-            kernel_w * height_col * width_col;
+        data_mask + (b_col * deformable_group + deformable_group_index) *
+                        kernel_h * kernel_w * height_col * width_col;

     for (int i = 0; i < kernel_h; ++i) {
       for (int j = 0; j < kernel_w; ++j) {
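[Editor's note] The pointer-offset hunks above only move line breaks inside one long expression; the parenthesization, and therefore the computed address, is unchanged. A small self-contained check in plain C++ (sizes and names are illustrative, not the kernel's real ones):

    #include <cassert>

    int main() {
      static float data_offset[4096];  // stand-in buffer, large enough below
      const int b_col = 1, deformable_group = 2, deformable_group_index = 1;
      const int kernel_h = 3, kernel_w = 3, height_col = 4, width_col = 4;
      // Pre-format break positions:
      const float* before =
          data_offset +
          (b_col * deformable_group + deformable_group_index) * 2 * kernel_h *
              kernel_w * height_col * width_col;
      // Post-format break positions:
      const float* after = data_offset + (b_col * deformable_group +
                                          deformable_group_index) *
                                             2 * kernel_h * kernel_w *
                                             height_col * width_col;
      assert(before == after);  // same address: the reformat is whitespace-only
      return 0;
    }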
diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu
index 1070a88cee7..5f4abee2838 100644
--- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include
+
 #include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h"

 namespace paddle {
@@ -67,14 +68,16 @@ __global__ void elementwise_kernel(const size_t total, const T *x_data,
 nvinfer1::Dims ElementWisePlugin::getOutputDimensions(
     int index, const nvinfer1::Dims *input_dims, int num_inputs) TRT_NOEXCEPT {
-  PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
-                                  "There is only one output in TRT elementwise "
-                                  "op plugin, but got output index: %d.",
-                                  index));
-  PADDLE_ENFORCE_EQ(num_inputs, 2, platform::errors::InvalidArgument(
-                                       "There are 2 inputs in TRT elementwise "
-                                       "op plugin, but got input number: %d.",
-                                       num_inputs));
+  PADDLE_ENFORCE_EQ(index, 0,
+                    platform::errors::InvalidArgument(
+                        "There is only one output in TRT elementwise "
+                        "op plugin, but got output index: %d.",
+                        index));
+  PADDLE_ENFORCE_EQ(
+      num_inputs, 2,
+      platform::errors::InvalidArgument("There are 2 inputs in TRT elementwise "
+                                        "op plugin, but got input number: %d.",
+                                        num_inputs));
   PADDLE_ENFORCE_NOT_NULL(
       input_dims,
       platform::errors::InvalidArgument(
diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h
index aa1ab5389a5..51fc1bebd90 100644
--- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include
 #include
+
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"

 namespace paddle {
diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
index 82f4420a2a0..6c7530cdc1f 100644
--- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
@@ -13,9 +13,11 @@
 // limitations under the License.
 #include
+
 #include
 #include  // NOLINT
 #include
+
 #include "glog/logging.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
@@ -253,10 +255,11 @@ nvinfer1::DataType EmbEltwiseLayernormPluginDynamic::getOutputDataType(
     int index, const nvinfer1::DataType *input_types,
     int nb_inputs) const TRT_NOEXCEPT {
   PADDLE_ENFORCE_EQ(
-      index, 0, platform::errors::InvalidArgument(
-                    "The EmbEltwiseLayernorm Plugin only has one input, so the "
-                    "index value should be 0, but get %d.",
-                    index));
+      index, 0,
+      platform::errors::InvalidArgument(
+          "The EmbEltwiseLayernorm Plugin only has one input, so the "
+          "index value should be 0, but get %d.",
+          index));
   if (with_fp16_)
     return nvinfer1::DataType::kHALF;
   else
diff --git a/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h
index 841fb2f6fe3..f27b66b03f5 100644
--- a/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h
@@ -15,9 +15,11 @@
 #pragma once

 #include
+
 #include
 #include
 #include
+
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
diff --git a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu
index 08b259e0f95..cba1bb04c36 100644
--- a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu
@@ -15,6 +15,7 @@
 #include
 #include
 #include
+
 #include "paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h"
 #include "paddle/fluid/platform/float16.h"
@@ -112,15 +113,15 @@ int GeluPlugin::enqueue(int batch_size, const void* const* inputs,
     VLOG(1) << "TRT Plugin DataType selected. Gelu-->fp32";
     const float* input = static_cast(inputs[0]);
     float* output = static_cast(outputs[0]);
-    gelu_kernel<<>>(
-        kA, num, input, output);
+    gelu_kernel
+        <<>>(kA, num, input, output);
   } else if (type == nvinfer1::DataType::kHALF) {
     VLOG(1) << "TRT Plugin DataType selected. Gelu-->fp16";
     const half* input = static_cast(inputs[0]);
     half* output = static_cast(outputs[0]);
-    no_exact_gelu_kernel<<>>(
-        kAT, kBT, kCT, num, input, output);
+    no_exact_gelu_kernel
+        <<>>(kAT, kBT, kCT, num, input,
+                output);
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
         "The Gelu TRT Plugin's input type should be float or half."));
@@ -170,10 +171,11 @@ bool GeluPluginDynamic::supportsFormatCombination(
 nvinfer1::DataType GeluPluginDynamic::getOutputDataType(
     int index, const nvinfer1::DataType* input_types,
     int nb_inputs) const TRT_NOEXCEPT {
-  PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
-                                  "The Gelu Plugin only has one input, so the "
-                                  "index value should be 0, but get %d.",
-                                  index));
+  PADDLE_ENFORCE_EQ(index, 0,
+                    platform::errors::InvalidArgument(
+                        "The Gelu Plugin only has one input, so the "
+                        "index value should be 0, but get %d.",
+                        index));
   return input_types[0];
 }
@@ -192,15 +194,15 @@ int GeluPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
     VLOG(1) << "TRT Plugin DataType selected. Gelu-->fp32";
     const float* input = static_cast(inputs[0]);
     float* output = static_cast(outputs[0]);
-    gelu_kernel<<>>(
-        kA, num, input, output);
+    gelu_kernel
+        <<>>(kA, num, input, output);
   } else if (input_type == nvinfer1::DataType::kHALF) {
     VLOG(1) << "TRT Plugin DataType selected. Gelu-->fp16";
Gelu-->fp16"; const half* input = static_cast(inputs[0]); half* output = static_cast(outputs[0]); - no_exact_gelu_kernel<<>>( - kAT, kBT, kCT, num, input, output); + no_exact_gelu_kernel + <<>>(kAT, kBT, kCT, num, input, + output); } else { PADDLE_THROW(platform::errors::InvalidArgument( "The Gelu TRT Plugin's input type should be float or half.")); diff --git a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h index 7efdd2798b2..8436ccad78a 100644 --- a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h @@ -14,9 +14,11 @@ #pragma once #include + #include #include #include + #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" namespace paddle { diff --git a/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.cu index 9872b1ff8d9..05ed76bd3c9 100644 --- a/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.cu @@ -14,6 +14,7 @@ #include #include + #include "paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h" namespace paddle { diff --git a/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h index 475c908c13b..b1e693799bd 100644 --- a/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h @@ -14,9 +14,11 @@ #pragma once #include + #include #include #include + #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" diff --git a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu index 03686aefc13..9acd688f707 100644 --- a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu @@ -13,8 +13,10 @@ // limitations under the License. #include + #include #include + #include "glog/logging.h" #include "paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" diff --git a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu index 67d44184a76..16e2a284d4b 100644 --- a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu @@ -13,8 +13,10 @@ // limitations under the License. 
 #include
+
 #include
 #include
+
 #include "glog/logging.h"
 #include "paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h"
 #include "paddle/phi/kernels/layer_norm_kernel.h"
diff --git a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h
index 9e8ce302833..42dfa2b8aa0 100644
--- a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h
@@ -17,6 +17,7 @@
 #include
 #include
 #include
+
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
diff --git a/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.h b/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.h
index be8f1c418fc..9ca6ff29240 100644
--- a/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.h
@@ -14,9 +14,9 @@ limitations under the License. */
 #pragma once

 #include
-
 #include
 #include
+
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
 #include "paddle/fluid/platform/dynload/cublasLt.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.cu
index 6e268e7b0b3..f655d23e628 100644
--- a/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.cu
@@ -13,6 +13,7 @@
 // limitations under the License.

 #include
+
 #include "glog/logging.h"
 #include "paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h"
@@ -38,11 +39,12 @@ bool MishPlugin::supportsFormat(
 nvinfer1::Dims MishPlugin::getOutputDimensions(int index,
                                                const nvinfer1::Dims* in_dims,
                                                int nb_inputs) TRT_NOEXCEPT {
-  PADDLE_ENFORCE_EQ(nb_inputs, 1, platform::errors::InvalidArgument(
-                                      "We expect [number of inputs] == 1"
-                                      "in TRT Mish op plugin, but got "
-                                      "[number of inputs] = %d.",
-                                      nb_inputs));
+  PADDLE_ENFORCE_EQ(
+      nb_inputs, 1,
+      platform::errors::InvalidArgument("We expect [number of inputs] == 1"
+                                        "in TRT Mish op plugin, but got "
+                                        "[number of inputs] = %d.",
+                                        nb_inputs));
   PADDLE_ENFORCE_LT(index, this->getNbOutputs(),
                     platform::errors::InvalidArgument(
                         "We expect [index] < [number of outputs]"
@@ -123,14 +125,14 @@ int MishPlugin::enqueue(int batchSize, const void* const* inputs,
     VLOG(1) << "TRT Plugin DataType selected. Mish-->fp32";
     const float* input = static_cast(inputs[0]);
     float* output = static_cast(outputs[0]);
-    mish_kernel<<>>(threshold_, num,
-                       input, output);
+    mish_kernel
+        <<>>(threshold_, num, input, output);
   } else if (type == nvinfer1::DataType::kHALF) {
     VLOG(1) << "TRT Plugin DataType selected. Mish-->fp16";
Mish-->fp16"; const half* input = static_cast(inputs[0]); half* output = static_cast(outputs[0]); - mish_kernel<<>>(threshold_, num, - input, output); + mish_kernel + <<>>(threshold_, num, input, output); } else { PADDLE_THROW(platform::errors::InvalidArgument( "The Mish TRT Plugin's input type should be float or half.")); @@ -192,10 +194,11 @@ bool MishPluginDynamic::supportsFormatCombination( nvinfer1::DataType MishPluginDynamic::getOutputDataType( int index, const nvinfer1::DataType* input_types, int nb_inputs) const TRT_NOEXCEPT { - PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( - "The Mish Plugin only has one input, so the " - "index value should be 0, but get %d.", - index)); + PADDLE_ENFORCE_EQ(index, 0, + platform::errors::InvalidArgument( + "The Mish Plugin only has one input, so the " + "index value should be 0, but get %d.", + index)); return input_types[0]; } @@ -214,14 +217,14 @@ int MishPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, VLOG(1) << "TRT Plugin DataType selected. Mish-->fp32"; const float* input = static_cast(inputs[0]); float* output = static_cast(outputs[0]); - mish_kernel<<>>(threshold_, num, - input, output); + mish_kernel + <<>>(threshold_, num, input, output); } else if (input_type == nvinfer1::DataType::kHALF) { VLOG(1) << "TRT Plugin DataType selected. Mish-->fp16"; const half* input = static_cast(inputs[0]); half* output = static_cast(outputs[0]); - mish_kernel<<>>(threshold_, num, - input, output); + mish_kernel + <<>>(threshold_, num, input, output); } else { PADDLE_THROW(platform::errors::InvalidArgument( "The Mish TRT Plugin's input type should be float or half.")); diff --git a/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h index 75390666ea0..fdef7b93f32 100644 --- a/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h @@ -14,8 +14,10 @@ #pragma once #include + #include #include + #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu index 5596a89a083..40cb2b88e71 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu @@ -70,10 +70,11 @@ nvinfer1::Dims Pool3DPlugin::getOutputDimensions( "The Pool3D Plugin only has one input, so the nbInputs " "value should be 1, but get %d.", nbInputs)); - PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( - "The Pool3D Plugin only has one input, so " - "the index value should be 0, but get %d.", - index)); + PADDLE_ENFORCE_EQ(index, 0, + platform::errors::InvalidArgument( + "The Pool3D Plugin only has one input, so " + "the index value should be 0, but get %d.", + index)); PADDLE_ENFORCE_EQ(inputDims[0].nbDims, 4, platform::errors::InvalidArgument( "The Pool3D Plugin only has four Dimensions, so the " diff --git a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h index 7c9a8625d70..d54ce067e5e 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h @@ -14,9 +14,11 @@ #pragma once #include + #include #include #include + #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" 
 namespace paddle {
diff --git a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu
index 9bfe98d759d..80f7e349dac 100644
--- a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu
@@ -240,10 +240,11 @@ bool PoolPluginDynamic::supportsFormatCombination(
 nvinfer1::DataType PoolPluginDynamic::getOutputDataType(
     int index, const nvinfer1::DataType *input_types,
     int nb_inputs) const TRT_NOEXCEPT {
-  PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
-                                  "The Pool Plugin only has one input, so the "
-                                  "index value should be 0, but get %d.",
-                                  index));
+  PADDLE_ENFORCE_EQ(index, 0,
+                    platform::errors::InvalidArgument(
+                        "The Pool Plugin only has one input, so the "
+                        "index value should be 0, but get %d.",
+                        index));
   PADDLE_ENFORCE_EQ((input_types[0] == nvinfer1::DataType::kFLOAT), true,
                     platform::errors::InvalidArgument(
                         "The input type should be half or float"));
diff --git a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h
index d1bf2cd02e8..155d69cc457 100644
--- a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h
@@ -14,9 +14,11 @@
 #pragma once

 #include
+
 #include
 #include
 #include
+
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"

 namespace paddle {
diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
index 1ea2b8b5f6e..72c1d546e9a 100644
--- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
@@ -144,10 +144,11 @@ bool PReluPluginDynamic::supportsFormatCombination(
 nvinfer1::DataType PReluPluginDynamic::getOutputDataType(
     int index, const nvinfer1::DataType *input_types,
     int nb_inputs) const TRT_NOEXCEPT {
-  PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
-                                  "The PRelu Plugin only has one input, so the "
-                                  "index value should be 0, but get %d.",
-                                  index));
+  PADDLE_ENFORCE_EQ(index, 0,
+                    platform::errors::InvalidArgument(
+                        "The PRelu Plugin only has one input, so the "
+                        "index value should be 0, but get %d.",
+                        index));
   PADDLE_ENFORCE_EQ((input_types[0] == nvinfer1::DataType::kFLOAT), true,
                     platform::errors::InvalidArgument(
                         "The input type should be half or float"));
diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
index e0a77de6f54..0025e1ee5b4 100644
--- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
@@ -17,9 +17,9 @@
 #include
 #include
 #include
+
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
-
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
index e2f1aab9b64..d3da5d7225d 100644
--- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
@@ -13,9 +13,11 @@
 // limitations under the License.
 #include
+
 #include
 #include  // NOLINT
 #include
+
 #include "glog/logging.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
@@ -103,8 +105,8 @@ inline void TransposeQKV(const int batch, const int seq_len,
                     platform::errors::InvalidArgument(
                         "head_num (%d) * head_size (%d) should <= %d",
                         head_num, head_size, 1024));
-  TransposeQkvKernel<<>>(head_size, input,
-                             output);
+  TransposeQkvKernel
+      <<>>(head_size, input, output);
 }
 }
@@ -142,8 +144,8 @@ inline void TransposeQKV(const int batch, const int seq_len,
                     platform::errors::InvalidArgument(
                         "head_num (%d) * head_size (%d) should <= %d",
                         head_num, head_size, 1024));
-  TransposeQkvKernel<<>>(head_size, input,
-                             output);
+  TransposeQkvKernel
+      <<>>(head_size, input, output);
 }
 }
@@ -218,10 +220,11 @@ nvinfer1::DataType QkvToContextPluginDynamic::getOutputDataType(
     int index, const nvinfer1::DataType *input_types,
     int nb_inputs) const TRT_NOEXCEPT {
   PADDLE_ENFORCE_EQ(
-      index, 0, platform::errors::InvalidArgument(
-                    "The EmbEltwiseLayernorm Plugin only has one input, so the "
-                    "index value should be 0, but get %d.",
-                    index));
+      index, 0,
+      platform::errors::InvalidArgument(
+          "The EmbEltwiseLayernorm Plugin only has one input, so the "
+          "index value should be 0, but get %d.",
+          index));
   return input_types[0];
 }
diff --git a/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.h b/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.h
index 896cd05eef1..71b576610e2 100644
--- a/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.h
@@ -14,8 +14,8 @@ limitations under the License. */
 #pragma once

 #include
-
 #include
+
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.h b/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.h
index 6679f2f0819..89fda3dd775 100644
--- a/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.h
@@ -14,8 +14,8 @@ limitations under the License. */
 #pragma once

 #include
-
 #include
+
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu
index 7dc31fb4471..7eded9e823e 100644
--- a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu
@@ -14,6 +14,7 @@
 #include
 #include
+
 #include
 #include "paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h"
@@ -281,13 +282,12 @@ int RoiAlignPluginDynamic::enqueue_impl(
         width, pooled_height_, pooled_width_, sampling_ratio_,
         rois_num / batch, aligned_, static_cast(outputs[0]));
   } else {
-    GPUROIAlignOpt<
-        T, OutT,
-        false><<>>(
-        output_size, static_cast(inputs[0]),
-        static_cast(inputs[1]), spatial_scale_, channels, height,
-        width, pooled_height_, pooled_width_, sampling_ratio_, rois_num / batch,
-        aligned_, static_cast(outputs[0]));
+    GPUROIAlignOpt
+        <<>>(
+            output_size, static_cast(inputs[0]),
+            static_cast(inputs[1]), spatial_scale_, channels, height,
+            width, pooled_height_, pooled_width_, sampling_ratio_,
+            rois_num / batch, aligned_, static_cast(outputs[0]));
   }

   return cudaGetLastError() != cudaSuccess;
diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu
index fb14749f3d1..e1527f85088 100644
--- a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu
@@ -14,9 +14,11 @@
 #include
 #include
+
 #include
 #include  // NOLINT
 #include
+
 #include "glog/logging.h"
 #include "paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h"
 #include "paddle/fluid/operators/math/bert_encoder_functor.h"
@@ -105,8 +107,9 @@ nvinfer1::DataType SkipLayerNormPluginDynamic::getOutputDataType(
                         index));
   PADDLE_ENFORCE_EQ((input_types[0] == nvinfer1::DataType::kFLOAT ||
                      input_types[0] == nvinfer1::DataType::kHALF),
-                    true, platform::errors::InvalidArgument(
-                              "The input type should be half or float"));
+                    true,
+                    platform::errors::InvalidArgument(
+                        "The input type should be half or float"));
   return input_types[0];
 }
diff --git a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu
index 0a6d24f9072..ad426204d5a 100644
--- a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu
@@ -14,9 +14,11 @@
 #include
 #include
+
 #include
 #include  // NOLINT
 #include
+
 #include "glog/logging.h"
 #include "paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h"
@@ -301,14 +303,16 @@ bool SlicePluginDynamic::supportsFormatCombination(
 nvinfer1::DataType SlicePluginDynamic::getOutputDataType(
     int index, const nvinfer1::DataType *input_types,
     int nb_inputs) const TRT_NOEXCEPT {
-  PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
-                                  "The Slice Plugin only has one input, so the "
-                                  "index value should be 0, but get %d.",
-                                  index));
+  PADDLE_ENFORCE_EQ(index, 0,
+                    platform::errors::InvalidArgument(
+                        "The Slice Plugin only has one input, so the "
+                        "index value should be 0, but get %d.",
+                        index));
   PADDLE_ENFORCE_EQ((input_types[0] == nvinfer1::DataType::kFLOAT ||
                      input_types[0] == nvinfer1::DataType::kHALF),
-                    true, platform::errors::InvalidArgument(
-                              "The input type should be half or float"));
+                    true,
+                    platform::errors::InvalidArgument(
+                        "The input type should be half or float"));
"The input type should be half or float")); return input_types[0]; } diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu index ec4fcca6d74..1cfc9fade7b 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -13,7 +13,9 @@ // limitations under the License. #include + #include + #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" namespace paddle { diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h index 7a41fe1d1ee..49f028493ee 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h @@ -15,9 +15,11 @@ #pragma once #include + #include #include #include + #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" namespace paddle { diff --git a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu index 74a6c3cdf3e..1c6dae78b38 100644 --- a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu @@ -15,6 +15,7 @@ #include #include #include + #include "paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h" namespace paddle { @@ -128,8 +129,9 @@ bool StackPluginDynamic::supportsFormatCombination( nvinfer1::DataType StackPluginDynamic::getOutputDataType( int index, const nvinfer1::DataType* input_types, int nb_inputs) const TRT_NOEXCEPT { - PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( - "The index should be equal to 0")); + PADDLE_ENFORCE_EQ( + index, 0, + platform::errors::InvalidArgument("The index should be equal to 0")); return input_types[0]; } diff --git a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h index 965c53e2698..12beafdadb3 100644 --- a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h @@ -14,9 +14,11 @@ #pragma once #include + #include #include #include + #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" diff --git a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu index 2c2fad74b9a..1992dd57d68 100644 --- a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu @@ -13,8 +13,10 @@ // limitations under the License. #include + #include #include + #include "glog/logging.h" #include "paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h" @@ -181,10 +183,11 @@ bool SwishPluginDynamic::supportsFormatCombination( nvinfer1::DataType SwishPluginDynamic::getOutputDataType( int index, const nvinfer1::DataType *input_types, int nb_inputs) const TRT_NOEXCEPT { - PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( - "The Swish Plugin only has one input, so the " - "index value should be 0, but get %d.", - index)); + PADDLE_ENFORCE_EQ(index, 0, + platform::errors::InvalidArgument( + "The Swish Plugin only has one input, so the " + "index value should be 0, but get %d.", + index)); return input_types[0]; } @@ -203,8 +206,8 @@ int SwishPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, VLOG(1) << "TRT Plugin DataType selected. 
Swish-->fp32"; const float *input = static_cast(inputs[0]); float *output = static_cast(outputs[0]); - swish_kernel<<>>(num, input, output, - beta_); + swish_kernel + <<>>(num, input, output, beta_); } else if (input_type == nvinfer1::DataType::kHALF) { VLOG(1) << "TRT Plugin DataType selected. Swish-->fp16"; const half *input = static_cast(inputs[0]); diff --git a/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc index 46f585e6557..9cb680da5a9 100644 --- a/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc +++ b/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" namespace paddle { diff --git a/paddle/fluid/inference/tensorrt/plugin/transformer_input_convert_plugin.h b/paddle/fluid/inference/tensorrt/plugin/transformer_input_convert_plugin.h index 87dc876fa9c..92aa0c48a49 100644 --- a/paddle/fluid/inference/tensorrt/plugin/transformer_input_convert_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/transformer_input_convert_plugin.h @@ -14,8 +14,8 @@ limitations under the License. */ #pragma once #include - #include + #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h index 9210cd48d07..a1316384cd4 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h @@ -15,6 +15,7 @@ #pragma once #include + #include #include #include diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h index 16751c764bd..cf9c66f0eb3 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_head_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/yolo_box_head_op_plugin.h index 2094dbfc9db..7116093ae36 100644 --- a/paddle/fluid/inference/tensorrt/plugin/yolo_box_head_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_head_op_plugin.h @@ -15,6 +15,7 @@ #pragma once #include #include + #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" diff --git a/paddle/fluid/inference/tensorrt/test_tensorrt.cc b/paddle/fluid/inference/tensorrt/test_tensorrt.cc index 2f5b75c1020..70f36ec34b7 100644 --- a/paddle/fluid/inference/tensorrt/test_tensorrt.cc +++ b/paddle/fluid/inference/tensorrt/test_tensorrt.cc @@ -15,6 +15,7 @@ limitations under the License. 
 #include
 #include
 #include
+
 #include "NvInfer.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
 #include "paddle/fluid/platform/dynload/tensorrt.h"
diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h
index c84cb45b7ec..35c776b9e53 100644
--- a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h
+++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h
@@ -16,6 +16,7 @@
 #include
 #include
+
 #include
 #include
 #include  // NOLINT
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_gpu_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_gpu_tester.cc
index d11d09458e4..ae838955adc 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_exp_gpu_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_gpu_tester.cc
@@ -15,8 +15,10 @@ limitations under the License. */
 #include
 #include
 #include
+
 #include
 #include
+
 #include "paddle/fluid/inference/capi_exp/pd_inference_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_int_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_int_tester.cc
index d3a15cb2857..dfcf5fda476 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_exp_int_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_int_tester.cc
@@ -15,8 +15,10 @@ limitations under the License. */
 #include
 #include
 #include
+
 #include
 #include
+
 #include "paddle/fluid/inference/capi_exp/pd_inference_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc
index 4369cd78dfa..db5406b8ef6 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc
@@ -15,8 +15,10 @@
 #include
 #include
 #include
+
 #include
 #include
+
 #include "paddle/fluid/inference/capi_exp/pd_inference_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc
index a341ffd7a08..8b094e8a6cb 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc
@@ -15,8 +15,10 @@ limitations under the License. */
 #include
 #include
 #include
+
 #include
 #include
+
 #include "paddle/fluid/inference/capi_exp/pd_inference_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_tensor_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_tensor_tester.cc
index f4017fc5a7f..33685e6a960 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_tensor_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_tensor_tester.cc
@@ -15,11 +15,13 @@ limitations under the License. */
 #include
 #include
 #include
+
 #include
 #include
 #include
 #include
 #include
+
 #include "paddle/fluid/inference/capi_exp/pd_inference_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_threads_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_threads_tester.cc
index 8951c446b1f..f59b337d6af 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_threads_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_threads_tester.cc
@@ -15,11 +15,13 @@ limitations under the License. */
 #include
 #include
 #include
+
 #include
 #include
 #include
 #include
 #include
+
 #include "paddle/fluid/inference/capi_exp/pd_inference_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_xpu_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_xpu_tester.cc
index a84c19de255..347f0e6e253 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_exp_xpu_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_xpu_tester.cc
@@ -15,8 +15,10 @@ limitations under the License. */
 #include
 #include
 #include
+
 #include
 #include
+
 #include "paddle/fluid/inference/capi_exp/pd_inference_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_gpu_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_gpu_tester.cc
index c60e0a25f28..524d39854de 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_gpu_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_gpu_tester.cc
@@ -15,8 +15,10 @@ limitations under the License. */
 #include
 #include
 #include
+
 #include
 #include
+
 #include "paddle/fluid/inference/capi/paddle_c_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_int_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_int_tester.cc
index c0c8ff083de..cf8582ee778 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_int_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_int_tester.cc
@@ -15,8 +15,10 @@ limitations under the License. */
 #include
 #include
 #include
+
 #include
 #include
+
 #include "paddle/fluid/inference/capi/paddle_c_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_ner_tester.cc
index bf0576f9f93..b74f51af980 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_ner_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_ner_tester.cc
@@ -15,8 +15,10 @@
 #include
 #include
 #include
+
 #include
 #include
+
 #include "paddle/fluid/inference/capi/paddle_c_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_pd_tensor_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_pd_tensor_tester.cc
index a9c24c4503f..d0cd55e918e 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_pd_tensor_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_pd_tensor_tester.cc
@@ -15,11 +15,13 @@ limitations under the License. */
 #include
 #include
 #include
+
 #include
 #include
 #include
 #include
 #include
+
 #include "paddle/fluid/inference/capi/c_api_internal.h"
 #include "paddle/fluid/inference/capi/paddle_c_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
@@ -69,8 +71,9 @@ void PD_run() {
   PD_DeletePaddleTensor(input);
   int size;
   const int* out_shape = PD_GetPaddleTensorShape(out_data, &size);
-  PADDLE_ENFORCE_EQ(size, 2, paddle::platform::errors::InvalidArgument(
-                                 "The Output shape's size is NOT match."));
+  PADDLE_ENFORCE_EQ(size, 2,
+                    paddle::platform::errors::InvalidArgument(
+                        "The Output shape's size is NOT match."));
   std::vector ref_outshape_size({9, 6});
   for (int i = 0; i < 2; ++i) {
     PADDLE_ENFORCE_EQ(out_shape[i], ref_outshape_size[i],
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc
index 0b2be0076fd..4ff3e27f420 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc
@@ -15,8 +15,10 @@ limitations under the License. */
 #include
 #include
 #include
+
 #include
 #include
+
 #include "paddle/fluid/inference/capi/paddle_c_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_xpu_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_xpu_tester.cc
index 33a67d81405..e6a6a8c1037 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_xpu_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_xpu_tester.cc
@@ -15,8 +15,10 @@ limitations under the License. */
 #include
 #include
 #include
+
 #include
 #include
+
 #include "paddle/fluid/inference/capi/paddle_c_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
index 820bbf07017..e3bdb98ec52 100644
--- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
@@ -13,6 +13,7 @@
 // limitations under the License.

 #include
+
 #include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
diff --git a/paddle/fluid/inference/tests/api/analyzer_detect_functional_mkldnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_detect_functional_mkldnn_tester.cc
index 384bef8a4b4..c21785f7ce7 100644
--- a/paddle/fluid/inference/tests/api/analyzer_detect_functional_mkldnn_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_detect_functional_mkldnn_tester.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include
+
 #include
 #include
+
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/place.h"
diff --git a/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc b/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc
index 5333f0052d7..166bdc621c1 100644
--- a/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include
+
 #include
 #include
+
 #include "paddle/fluid/inference/tests/api/tester_helper.h"

 DEFINE_string(infer_shape, "", "data shape file");
diff --git a/paddle/fluid/inference/tests/api/analyzer_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_image_classification_tester.cc
index af0a51e4ddb..cf3380d0406 100644
--- a/paddle/fluid/inference/tests/api/analyzer_image_classification_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_image_classification_tester.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 #include
 #include
+
 #include "paddle/fluid/inference/tests/api/tester_helper.h"

 DEFINE_bool(disable_mkldnn_fc, false, "Disable usage of MKL-DNN's FC op");
diff --git a/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc
index d11b5f0c218..c6d266ceb21 100644
--- a/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 #include
 #include
+
 #include "paddle/fluid/inference/api/paddle_analysis_config.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
diff --git a/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc b/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc
index 57ab1b00908..18990dba314 100644
--- a/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 #include
 #include
+
 #include "paddle/fluid/inference/api/paddle_analysis_config.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
index bd3a1d737af..2b69a15e26a 100644
--- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
@@ -148,8 +148,9 @@ TEST(Analyzer_LAC, profile) {
                         "The size of output should be equal to 1."));
   size_t size = GetSize(output[0]);
   size_t batch1_size = sizeof(lac_ref_data) / sizeof(int64_t);
-  PADDLE_ENFORCE_GE(size, batch1_size, paddle::platform::errors::Fatal(
-                                           "The size of batch is invaild."));
+  PADDLE_ENFORCE_GE(
+      size, batch1_size,
+      paddle::platform::errors::Fatal("The size of batch is invaild."));
   int64_t *pdata = static_cast(output[0].data.data());
   for (size_t i = 0; i < batch1_size; ++i) {
     EXPECT_EQ(pdata[i], lac_ref_data[i]);
diff --git a/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc
index 141e60513eb..7e754ad93bc 100644
--- a/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 #include
 #include
+
 #include "paddle/fluid/inference/api/paddle_analysis_config.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
@@ -211,18 +212,15 @@ std::vector Lexical_Test(
     }
   }
   // nums_infer, nums_label, nums_correct
-  auto precision =
-      acc_sum[0]
-          ? static_cast(acc_sum[2]) / static_cast(acc_sum[0])
-          : 0;
-  auto recall =
-      acc_sum[1]
-          ? static_cast(acc_sum[2]) / static_cast(acc_sum[1])
-          : 0;
-  auto f1_score =
-      acc_sum[2]
static_cast(2 * precision * recall) / (precision + recall) - : 0; + auto precision = acc_sum[0] ? static_cast(acc_sum[2]) / + static_cast(acc_sum[0]) + : 0; + auto recall = acc_sum[1] ? static_cast(acc_sum[2]) / + static_cast(acc_sum[1]) + : 0; + auto f1_score = acc_sum[2] ? static_cast(2 * precision * recall) / + (precision + recall) + : 0; LOG(INFO) << "Precision: " << std::fixed << std::setw(6) << std::setprecision(5) << precision; diff --git a/paddle/fluid/inference/tests/api/analyzer_mmp_tester.cc b/paddle/fluid/inference/tests/api/analyzer_mmp_tester.cc index 4a5ec95934a..43fed05db13 100644 --- a/paddle/fluid/inference/tests/api/analyzer_mmp_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_mmp_tester.cc @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include + #include "paddle/fluid/framework/transfer_scope_cache.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" -#include - // Here add missing commands DEFINE_string(infer_model2, "", "model path"); DEFINE_string(infer_model3, "", "model path"); @@ -96,8 +96,9 @@ void compare(bool use_mkldnn = false) { xx_output.begin(), xx_output.end(), xx2_output.begin(), [](const float& l, const float& r) { return fabs(l - r) < 1e-4; }); - PADDLE_ENFORCE_EQ(result, true, paddle::platform::errors::Fatal( - "Results of model run independently " + PADDLE_ENFORCE_EQ( + result, true, + paddle::platform::errors::Fatal("Results of model run independently " "differs from results of the same model " "run as a sequence of models")); } diff --git a/paddle/fluid/inference/tests/api/analyzer_paddle_tensor_tester.cc b/paddle/fluid/inference/tests/api/analyzer_paddle_tensor_tester.cc index 2eb75c4dc53..2c02b87ba2b 100644 --- a/paddle/fluid/inference/tests/api/analyzer_paddle_tensor_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_paddle_tensor_tester.cc @@ -16,9 +16,8 @@ #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/inference/utils/singleton.h" - #include "paddle/fluid/inference/tests/api/tester_helper.h" +#include "paddle/fluid/inference/utils/singleton.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/tests/api/analyzer_quant_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_quant_image_classification_tester.cc index 4bb59f3c8df..1618ba575a2 100644 --- a/paddle/fluid/inference/tests/api/analyzer_quant_image_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_quant_image_classification_tester.cc @@ -14,6 +14,7 @@ limitations under the License. 
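The precision/recall/f1 hunk ending above shows how the upgraded formatter wraps long conditional expressions: the condition and the start of the true branch share a line, the spilled operand is indented past the `?`, and `: 0` is aligned beneath the first operand. The `<...>` template arguments of the casts do not survive in this text, so the element type in the sketch below is an assumption, not taken from the patch:

//   auto precision = acc_sum[0] ? static_cast<float>(acc_sum[2]) /
//                                     static_cast<float>(acc_sum[0])
//                               : 0;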
*/ #include #include + #include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc index 978aaf1c6a3..883d946dff5 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc @@ -47,8 +47,9 @@ struct DataRecord { num_lines++; std::vector data; split(line, '\t', &data); - PADDLE_ENFORCE_GT(data.size(), 4, paddle::platform::errors::Fatal( - "The size of data is invaild.")); + PADDLE_ENFORCE_GT( + data.size(), 4, + paddle::platform::errors::Fatal("The size of data is invaild.")); // load title1 data std::vector title1_data; split_to_int64(data[0], ' ', &title1_data); diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_determine_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_determine_tester.cc index 8f0778b83e5..1ef5e81e18a 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_determine_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_determine_tester.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_tester.cc index 099ff1f31a7..5a78d36276c 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_tester.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc index 1fbcbf1a3f4..30cea4f69bd 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc index d33b11c389a..15f4b3a3a5b 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc @@ -15,6 +15,7 @@ limitations under the License. 
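Almost every bare `+` in the tester hunks above inserts a blank line rather than code: the new configuration regroups `#include` directives into blocks separated by empty lines. Judging from the hunks in this patch, the blocks come out as (a) the file's own header when one is recognized, (b) `<...>` system and standard headers, (c) the remaining `"..."` third-party and project headers. A hypothetical file laid out under that inferred rule (all names below are illustrative, none come from the patch):

// my_widget.cc, an illustrative example only
#include "myproj/my_widget.h"  // (a) the header this file implements

#include <string>  // (b) angle-bracket system/standard headers
#include <vector>

#include "glog/logging.h"  // (c) quoted third-party and project headers
#include "myproj/util.h"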
*/ #include #include #include + #include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_profile_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_profile_tester.cc index 0ccd95f2a17..063d29abee9 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_profile_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_profile_tester.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h index 5d7f7c290f6..ef00c020973 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/inference/tests/api/tester_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h b/paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h index e43456ed832..a384c75e0bb 100644 --- a/paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h +++ b/paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h @@ -15,6 +15,7 @@ #include #include #include + #include "paddle/fluid/inference/tests/api/tester_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index faa15fc4f0a..0a43d166e93 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include #include + #include "paddle/fluid/inference/tests/api/tester_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/analyzer_vit_ocr_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vit_ocr_tester.cc index 029f2f0421d..08f26bae37b 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vit_ocr_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vit_ocr_tester.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include #include + #include "paddle/fluid/inference/tests/api/tester_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/analyzer_zerocopy_tensor_tester.cc b/paddle/fluid/inference/tests/api/analyzer_zerocopy_tensor_tester.cc index e1ee1b196e4..d8ba615c8ed 100644 --- a/paddle/fluid/inference/tests/api/analyzer_zerocopy_tensor_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_zerocopy_tensor_tester.cc @@ -16,9 +16,8 @@ #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/inference/utils/singleton.h" - #include "paddle/fluid/inference/tests/api/tester_helper.h" +#include "paddle/fluid/inference/utils/singleton.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h index b952b62f13e..6ef3eb95dd2 100644 --- a/paddle/fluid/inference/tests/api/config_printer.h +++ b/paddle/fluid/inference/tests/api/config_printer.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/inference/api/paddle_inference_api.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/ipu_resnet50_fp16_test.cc b/paddle/fluid/inference/tests/api/ipu_resnet50_fp16_test.cc index 1d69069da07..38cf475d3da 100644 --- a/paddle/fluid/inference/tests/api/ipu_resnet50_fp16_test.cc +++ b/paddle/fluid/inference/tests/api/ipu_resnet50_fp16_test.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include #include "gflags/gflags.h" diff --git a/paddle/fluid/inference/tests/api/ipu_resnet50_test.cc b/paddle/fluid/inference/tests/api/ipu_resnet50_test.cc index 5fde8e6a5e1..cbfe8229d31 100644 --- a/paddle/fluid/inference/tests/api/ipu_resnet50_test.cc +++ b/paddle/fluid/inference/tests/api/ipu_resnet50_test.cc @@ -11,6 +11,7 @@ limitations under the License. */ #include #include + #include #include "gflags/gflags.h" diff --git a/paddle/fluid/inference/tests/api/ipu_word2vec_sample.cc b/paddle/fluid/inference/tests/api/ipu_word2vec_sample.cc index d38c5c34163..a0e36e9779d 100644 --- a/paddle/fluid/inference/tests/api/ipu_word2vec_sample.cc +++ b/paddle/fluid/inference/tests/api/ipu_word2vec_sample.cc @@ -31,8 +31,8 @@ limitations under the License. */ DEFINE_string(infer_model, "", "Directory of the inference model."); using paddle_infer::Config; -using paddle_infer::Predictor; using paddle_infer::CreatePredictor; +using paddle_infer::Predictor; void inference(std::string model_path, bool use_ipu, std::vector *out_data) { diff --git a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc index 9211ea246a5..1adbf0ec7a5 100644 --- a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc +++ b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc @@ -14,11 +14,12 @@ limitations under the License. */ #include #include + #include #include // NOLINT #include // NOLINT -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/lite_resnet50_test.cc b/paddle/fluid/inference/tests/api/lite_resnet50_test.cc index 59bbaa2b78f..169d0b9987d 100644 --- a/paddle/fluid/inference/tests/api/lite_resnet50_test.cc +++ b/paddle/fluid/inference/tests/api/lite_resnet50_test.cc @@ -14,6 +14,7 @@ limitations under the License. 
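The `ipu_word2vec_sample.cc` hunk above is a different rule from the include shuffling: runs of using-declarations are now sorted, which is consistent with clang-format's `SortUsingDeclarations` option being enabled in the new configuration. The comparison is a plain case-sensitive one over the full qualified name, so `CreatePredictor` lands above `Predictor`:

using paddle_infer::Config;
using paddle_infer::CreatePredictor;  // 'C' sorts before 'P'
using paddle_infer::Predictor;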
*/ #include #include + #include #include "gflags/gflags.h" diff --git a/paddle/fluid/inference/tests/api/mkldnn_quantizer_config_tester.cc b/paddle/fluid/inference/tests/api/mkldnn_quantizer_config_tester.cc index 4a2527a217f..d972945db7d 100644 --- a/paddle/fluid/inference/tests/api/mkldnn_quantizer_config_tester.cc +++ b/paddle/fluid/inference/tests/api/mkldnn_quantizer_config_tester.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" @@ -90,9 +91,10 @@ TEST(Mkldnn_quantizer_config, configuration) { PADDLE_ENFORCE_EQ( cfg.mkldnn_quantizer_config()->scale_algo("conv2d", "Input"), - conv2d_scale_algo, platform::errors::InvalidArgument( - "Scale algorithm got from config differs with the " - "one set previously.")); + conv2d_scale_algo, + platform::errors::InvalidArgument( + "Scale algorithm got from config differs with the " + "one set previously.")); PADDLE_ENFORCE_EQ( cfg.mkldnn_quantizer_config()->scale_algo("unknown", "unknown"), diff --git a/paddle/fluid/inference/tests/api/paddle_infer_api_copy_tensor_tester.cc b/paddle/fluid/inference/tests/api/paddle_infer_api_copy_tensor_tester.cc index 2be69781c4e..38bcb7645ab 100644 --- a/paddle/fluid/inference/tests/api/paddle_infer_api_copy_tensor_tester.cc +++ b/paddle/fluid/inference/tests/api/paddle_infer_api_copy_tensor_tester.cc @@ -14,8 +14,10 @@ limitations under the License. */ #include #include + #include #include + #include "gflags/gflags.h" #include "glog/logging.h" #include "paddle/fluid/inference/api/paddle_infer_contrib.h" diff --git a/paddle/fluid/inference/tests/api/paddle_infer_api_errors_tester.cc b/paddle/fluid/inference/tests/api/paddle_infer_api_errors_tester.cc index c5a0746c4d7..ab82c82b1e3 100644 --- a/paddle/fluid/inference/tests/api/paddle_infer_api_errors_tester.cc +++ b/paddle/fluid/inference/tests/api/paddle_infer_api_errors_tester.cc @@ -15,7 +15,6 @@ #include "gflags/gflags.h" #include "glog/logging.h" #include "gtest/gtest.h" - #include "paddle/fluid/inference/api/paddle_infer_contrib.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/inference/tests/api/paddle_infer_api_test.cc b/paddle/fluid/inference/tests/api/paddle_infer_api_test.cc index 88ebd85c79a..8cbc410eb5f 100644 --- a/paddle/fluid/inference/tests/api/paddle_infer_api_test.cc +++ b/paddle/fluid/inference/tests/api/paddle_infer_api_test.cc @@ -15,10 +15,11 @@ limitations under the License. 
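In the `mkldnn_quantizer_config_tester.cc` hunk above, the error message is built from adjacent string literals, which the compiler concatenates into one string; the re-wrap therefore moves fragments between lines without changing the text a user ever sees. A reduced, self-contained sketch of that mechanism:

#include <cstdio>

int main() {
  // Two fragments, one literal after concatenation; a formatter is free
  // to re-break at the fragment boundary without altering the message.
  const char* msg =
      "Scale algorithm got from config differs with the "
      "one set previously.";
  std::puts(msg);
  return 0;
}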
*/ #include #include #include + #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/inference/tests/api/trt_test_helper.h" namespace paddle_infer { diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index f2df018f497..d7784a909af 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -1081,7 +1081,7 @@ static bool CompareTensor(const framework::LoDTensor &a, } void ConvertFP32toFP16(paddle::PaddleTensor &tensor // NOLINT - ) { +) { int num = 1; for (auto dim : tensor.shape) { num *= dim; @@ -1101,7 +1101,7 @@ void ConvertFP32toFP16(paddle::PaddleTensor &tensor // NOLINT } void ConvertFP16toFP32(paddle::PaddleTensor &tensor // NOLINT - ) { +) { int num = 1; for (auto dim : tensor.shape) { num *= dim; diff --git a/paddle/fluid/inference/tests/api/trt_cascade_rcnn_test.cc b/paddle/fluid/inference/tests/api/trt_cascade_rcnn_test.cc index a1f31c3108b..ab059496ad8 100644 --- a/paddle/fluid/inference/tests/api/trt_cascade_rcnn_test.cc +++ b/paddle/fluid/inference/tests/api/trt_cascade_rcnn_test.cc @@ -14,8 +14,8 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/inference/tests/api/trt_test_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_fp16_serialize_deserialize_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_fp16_serialize_deserialize_test.cc index 7e9f71c8b3c..b0c4c13dbbc 100644 --- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_fp16_serialize_deserialize_test.cc +++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_fp16_serialize_deserialize_test.cc @@ -22,8 +22,8 @@ limitations under the License. */ #define GLOG_NO_ABBREVIATED_SEVERITIES #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.cc index 209dd90c480..f269432d4da 100644 --- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.cc +++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.cc @@ -22,8 +22,8 @@ limitations under the License. */ #define GLOG_NO_ABBREVIATED_SEVERITIES #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h index 5ae14576dfe..3ca62afba1d 100644 --- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h +++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h @@ -24,8 +24,8 @@ limitations under the License. 
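The two `tester_helper.h` hunks above move only a closing parenthesis: a trailing `// NOLINT` pins the parameter to its own line, so the `)` must start a new line, and the upgraded formatter places it at column zero rather than aligning it under the parameter. Both shapes, with the signature copied from the hunk:

// before:
//   void ConvertFP32toFP16(paddle::PaddleTensor &tensor  // NOLINT
//                          ) {
// after:
//   void ConvertFP32toFP16(paddle::PaddleTensor &tensor  // NOLINT
//   ) {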
*/ #include #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/inference/tests/api/trt_test_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc index 262b7269cb3..977c6856f8c 100644 --- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc +++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc @@ -14,8 +14,8 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tests/api/trt_test_helper.h" @@ -226,13 +226,78 @@ void run(paddle_infer::Predictor* predictor, std::vector* out_data) { int32_t i1[run_seq_len] = { // sentence 1 - 1, 3558, 4, 75, 491, 89, 340, 313, 93, 4, 255, 10, 75, 321, 4095, 1902, 4, - 134, 49, 75, 311, 14, 44, 178, 543, 15, 12043, 2, 75, 201, 340, 9, 14, 44, - 486, 218, 1140, 279, 12043, 2, + 1, + 3558, + 4, + 75, + 491, + 89, + 340, + 313, + 93, + 4, + 255, + 10, + 75, + 321, + 4095, + 1902, + 4, + 134, + 49, + 75, + 311, + 14, + 44, + 178, + 543, + 15, + 12043, + 2, + 75, + 201, + 340, + 9, + 14, + 44, + 486, + 218, + 1140, + 279, + 12043, + 2, // sentence 2 - 101, 2054, 2234, 2046, 2486, 2044, 1996, 2047, 4552, 2001, 9536, 1029, - 102, 2004, 1997, 2008, 2154, 1010, 1996, 2047, 4552, 9536, 2075, 1996, - 2117, 3072, 2234, 2046, 2486, 1012, 102, + 101, + 2054, + 2234, + 2046, + 2486, + 2044, + 1996, + 2047, + 4552, + 2001, + 9536, + 1029, + 102, + 2004, + 1997, + 2008, + 2154, + 1010, + 1996, + 2047, + 4552, + 9536, + 2075, + 1996, + 2117, + 3072, + 2234, + 2046, + 2486, + 1012, + 102, }; int32_t i2[run_seq_len] = { // sentence 1 diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc index ccdf237ffa5..4b22bba2bcc 100644 --- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc +++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc @@ -14,8 +14,8 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/inference/tests/api/trt_test_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_transformer_prune_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_transformer_prune_test.cc index 2d7aa72a036..a238e62fc7c 100644 --- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_transformer_prune_test.cc +++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_transformer_prune_test.cc @@ -14,8 +14,8 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/inference/tests/api/trt_test_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/trt_fc_prelu_test.cc b/paddle/fluid/inference/tests/api/trt_fc_prelu_test.cc index c0be1944931..93d4a88383c 100644 --- a/paddle/fluid/inference/tests/api/trt_fc_prelu_test.cc +++ b/paddle/fluid/inference/tests/api/trt_fc_prelu_test.cc @@ -14,8 +14,8 @@ limitations under the License. 
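The forty-odd `+` lines expanding `i1[run_seq_len]` above change layout, not data: the list ends in a trailing comma, and clang-format's braced-list heuristic formats such a list one element per line instead of bin-packing it (the old binary packed it regardless; the new one honors the comma). Removing the trailing comma is the usual way to keep the compact form:

// Bin-packed: no comma after the final element.
static const int kPacked[] = {1, 3558, 4, 75, 491, 89};

// One per line: the comma after the final element opts in to the
// expanded layout, handy when entries need per-line comments.
static const int kExpanded[] = {
    1,
    3558,
    4,
};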
*/ #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/inference/tests/api/trt_test_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/trt_instance_norm_converter_test.cc b/paddle/fluid/inference/tests/api/trt_instance_norm_converter_test.cc index ceb8b99774e..243be1d3319 100644 --- a/paddle/fluid/inference/tests/api/trt_instance_norm_converter_test.cc +++ b/paddle/fluid/inference/tests/api/trt_instance_norm_converter_test.cc @@ -14,8 +14,8 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/inference/tests/api/trt_test_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc b/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc index a87bf7b085b..bcf8a23b9b9 100644 --- a/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc +++ b/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc @@ -14,8 +14,8 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/inference/tests/api/trt_test_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/trt_quant_int8_test.cc b/paddle/fluid/inference/tests/api/trt_quant_int8_test.cc index ca25967b59a..3a884abe888 100644 --- a/paddle/fluid/inference/tests/api/trt_quant_int8_test.cc +++ b/paddle/fluid/inference/tests/api/trt_quant_int8_test.cc @@ -14,9 +14,10 @@ limitations under the License. */ #include #include + #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/inference/tests/api/trt_test_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/trt_quant_int8_yolov3_r50_test.cc b/paddle/fluid/inference/tests/api/trt_quant_int8_yolov3_r50_test.cc index 1fa24dddead..d9e1e3f8c9e 100644 --- a/paddle/fluid/inference/tests/api/trt_quant_int8_yolov3_r50_test.cc +++ b/paddle/fluid/inference/tests/api/trt_quant_int8_yolov3_r50_test.cc @@ -11,9 +11,10 @@ limitations under the License. */ #include #include + #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/inference/tests/api/trt_test_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/trt_resnet50_test.cc b/paddle/fluid/inference/tests/api/trt_resnet50_test.cc index 2975967e0c0..cdc6586f127 100644 --- a/paddle/fluid/inference/tests/api/trt_resnet50_test.cc +++ b/paddle/fluid/inference/tests/api/trt_resnet50_test.cc @@ -14,8 +14,8 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/inference/tests/api/trt_test_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/trt_resnext_test.cc b/paddle/fluid/inference/tests/api/trt_resnext_test.cc index b525a1b7068..374074957c8 100644 --- a/paddle/fluid/inference/tests/api/trt_resnext_test.cc +++ b/paddle/fluid/inference/tests/api/trt_resnext_test.cc @@ -14,8 +14,8 @@ limitations under the License. 
*/ #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/inference/tests/api/trt_test_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/trt_split_converter_test.cc b/paddle/fluid/inference/tests/api/trt_split_converter_test.cc index c00b36b520b..0726db28343 100644 --- a/paddle/fluid/inference/tests/api/trt_split_converter_test.cc +++ b/paddle/fluid/inference/tests/api/trt_split_converter_test.cc @@ -14,8 +14,8 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/inference/tests/api/trt_test_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/trt_test_helper.h b/paddle/fluid/inference/tests/api/trt_test_helper.h index aaa285b2fc2..cadf996e071 100644 --- a/paddle/fluid/inference/tests/api/trt_test_helper.h +++ b/paddle/fluid/inference/tests/api/trt_test_helper.h @@ -13,13 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once #include + #include #include #include "gflags/gflags.h" #include "glog/logging.h" #include "gtest/gtest.h" - #include "paddle/fluid/inference/tests/api/tester_helper.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/infer_ut/test_suite.h b/paddle/fluid/inference/tests/infer_ut/test_suite.h index a5c8c524021..8737afa8099 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_suite.h +++ b/paddle/fluid/inference/tests/infer_ut/test_suite.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once #include + #include #include #include @@ -26,7 +27,6 @@ #include "gflags/gflags.h" #include "glog/logging.h" #include "gtest/gtest.h" - #include "paddle/include/paddle_inference_api.h" namespace paddle { @@ -64,7 +64,7 @@ void SingleThreadPrediction(paddle_infer::Predictor *predictor, int repeat_times = 2) { // prepare input tensor auto input_names = predictor->GetInputNames(); - for (const auto & [ key, value ] : *input_data_map) { + for (const auto &[key, value] : *input_data_map) { switch (value.type) { case paddle::PaddleDType::INT64: { std::vector input_value = @@ -150,7 +150,7 @@ void SingleThreadPrediction(paddle_infer::Predictor *predictor, void CompareRecord(std::map *truth_output_data, std::map *infer_output_data, float epislon = 1e-5) { - for (const auto & [ key, value ] : *infer_output_data) { + for (const auto &[key, value] : *infer_output_data) { auto truth_record = (*truth_output_data)[key]; VLOG(1) << "output name: " << key; size_t numel = value.data.size() / sizeof(float); @@ -190,7 +190,7 @@ double SingleThreadProfile(paddle_infer::Predictor *predictor, int repeat_times = 2) { // prepare input tensor auto input_names = predictor->GetInputNames(); - for (const auto & [ key, value ] : *input_data_map) { + for (const auto &[key, value] : *input_data_map) { switch (value.type) { case paddle::PaddleDType::INT64: { std::vector input_value = diff --git a/paddle/fluid/inference/utils/benchmark_tester.cc b/paddle/fluid/inference/utils/benchmark_tester.cc index 0c48c2db9b6..8f7614cb10a 100644 --- a/paddle/fluid/inference/utils/benchmark_tester.cc +++ b/paddle/fluid/inference/utils/benchmark_tester.cc @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. 
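The `test_suite.h` hunks above correct the spacing of C++17 structured bindings: the old formatter did not recognize the syntax and emitted `const auto & [ key, value ]`, the new one produces the idiomatic `const auto &[key, value]`. A self-contained loop mirroring the hunk (`std::string` stands in for the tensor type to keep the sketch compilable):

#include <map>
#include <string>

void VisitAll(const std::map<std::string, std::string> &input_data_map) {
  for (const auto &[key, value] : input_data_map) {  // new spacing
    // old formatter output: for (const auto & [ key, value ] : ...)
    (void)key;
    (void)value;
  }
}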
-#include "paddle/fluid/inference/utils/benchmark.h" #include #include +#include "paddle/fluid/inference/utils/benchmark.h" + using namespace paddle::inference; // NOLINT TEST(Benchmark, basic) { Benchmark benchmark; diff --git a/paddle/fluid/inference/utils/io_utils.cc b/paddle/fluid/inference/utils/io_utils.cc index 87331e1978f..425c67d2fd2 100644 --- a/paddle/fluid/inference/utils/io_utils.cc +++ b/paddle/fluid/inference/utils/io_utils.cc @@ -158,8 +158,9 @@ void SerializePDTensorsToFile(const std::string &path, void DeserializePDTensorsToFile(const std::string &path, std::vector *tensors) { bool is_present = analysis::FileExists(path); - PADDLE_ENFORCE_EQ(is_present, true, platform::errors::InvalidArgument( - "Cannot open %s to read", path)); + PADDLE_ENFORCE_EQ( + is_present, true, + platform::errors::InvalidArgument("Cannot open %s to read", path)); std::ifstream fin(path, std::ios::binary); DeserializePDTensorsToStream(fin, tensors); fin.close(); diff --git a/paddle/fluid/inference/utils/io_utils_tester.cc b/paddle/fluid/inference/utils/io_utils_tester.cc index ffd97232652..e8ebb72acc3 100644 --- a/paddle/fluid/inference/utils/io_utils_tester.cc +++ b/paddle/fluid/inference/utils/io_utils_tester.cc @@ -12,11 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/inference/utils/io_utils.h" #include #include + #include + #include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/utils/io_utils.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/utils/singleton.h b/paddle/fluid/inference/utils/singleton.h index 6828924c300..5fccd3458a1 100644 --- a/paddle/fluid/inference/utils/singleton.h +++ b/paddle/fluid/inference/utils/singleton.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/inference/utils/table_printer_tester.cc b/paddle/fluid/inference/utils/table_printer_tester.cc index 8faac79c517..fc482807b28 100644 --- a/paddle/fluid/inference/utils/table_printer_tester.cc +++ b/paddle/fluid/inference/utils/table_printer_tester.cc @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/inference/utils/table_printer.h" #include #include +#include "paddle/fluid/inference/utils/table_printer.h" + namespace paddle { namespace inference {} // namespace inference } // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 7cd5fffea2a..d72af70657a 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -28,6 +28,7 @@ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include + #include "paddle/fluid/memory/allocation/cuda_allocator.h" #include "paddle/fluid/memory/allocation/cuda_managed_allocator.h" #include "paddle/fluid/memory/allocation/pinned_allocator.h" diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 94b07e3e6c1..a37c11c0c04 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -14,6 +14,7 @@ #pragma once #include + #include "paddle/fluid/memory/allocation/allocator.h" #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" diff --git a/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc b/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc index fca07ba8e25..d3f16ec6286 100644 --- a/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc +++ b/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/memory/allocation/allocator_facade.h" #include +#include "paddle/fluid/memory/allocation/allocator_facade.h" + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_double(fraction_of_cuda_pinned_memory_to_use); diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc index 782062283e9..d460480bc73 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc @@ -16,6 +16,7 @@ #include #include // NOLINT + #include "paddle/fluid/memory/allocation/aligned_allocator.h" #include "paddle/fluid/platform/flags.h" #include "paddle/fluid/platform/profiler/event_tracing.h" diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc index 4469673b305..70c43145cc8 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc @@ -13,10 +13,12 @@ // limitations under the License. 
#include + #include // NOLINT #include // NOLINT #include #include // NOLINT + #include "gflags/gflags.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc index 8d2f6e07a29..441e80dfa4f 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include - -#include "paddle/fluid/memory/allocation/aligned_allocator.h" #include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h" +#include + #include "gtest/gtest.h" +#include "paddle/fluid/memory/allocation/aligned_allocator.h" DECLARE_bool(free_idle_chunk); DECLARE_bool(free_when_no_cache_hit); diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc index 4cfe3997d89..c93645bf7a0 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/memory/allocation/best_fit_allocator.h" + #include #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.h b/paddle/fluid/memory/allocation/best_fit_allocator.h index 69cb7c2708f..64ee632c387 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/best_fit_allocator.h @@ -14,6 +14,7 @@ #pragma once #include + #include #include #include diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc index 62a2dd78128..de6cac63e9d 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_allocator.cc @@ -24,6 +24,7 @@ #endif #include + #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/memory/allocation/cuda_allocator.h b/paddle/fluid/memory/allocation/cuda_allocator.h index 522b1d623e8..f3df3082741 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_allocator.h @@ -14,6 +14,7 @@ #pragma once #include // NOLINT + #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc b/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc index b2f24d5aed1..dff93736a6e 100644 --- a/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc @@ -15,15 +15,16 @@ #ifndef _WIN32 #include "paddle/fluid/memory/allocation/cuda_ipc_allocator.h" -#include "paddle/fluid/platform/cuda_device_guard.h" #include #include #include + #include #include #include "glog/logging.h" +#include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/memory/allocation/cuda_managed_allocator.cc b/paddle/fluid/memory/allocation/cuda_managed_allocator.cc index 0c83d4d3663..ac62b10c0e0 100644 --- a/paddle/fluid/memory/allocation/cuda_managed_allocator.cc +++ 
b/paddle/fluid/memory/allocation/cuda_managed_allocator.cc @@ -24,6 +24,7 @@ #endif #include + #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc index a235b3871b3..9494141615f 100644 --- a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc @@ -18,6 +18,7 @@ #endif #include + #include "paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h index e7b296e6a5a..ff26a96a0e1 100644 --- a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h @@ -16,10 +16,12 @@ #ifdef PADDLE_WITH_CUDA #include + #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include // NOLINT + #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/memory/allocation/custom_allocator.cc b/paddle/fluid/memory/allocation/custom_allocator.cc index e53d7b1cc76..2cd969e2bd1 100644 --- a/paddle/fluid/memory/allocation/custom_allocator.cc +++ b/paddle/fluid/memory/allocation/custom_allocator.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/memory/allocation/custom_allocator.h" + #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/memory/allocation/custom_allocator.h b/paddle/fluid/memory/allocation/custom_allocator.h index 0f34bc156c8..b10f840f60d 100644 --- a/paddle/fluid/memory/allocation/custom_allocator.h +++ b/paddle/fluid/memory/allocation/custom_allocator.h @@ -14,6 +14,7 @@ #pragma once #include // NOLINT + #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/memory/allocation/mmap_allocator.cc b/paddle/fluid/memory/allocation/mmap_allocator.cc index 25c2235cce8..6fd87fb6a77 100644 --- a/paddle/fluid/memory/allocation/mmap_allocator.cc +++ b/paddle/fluid/memory/allocation/mmap_allocator.cc @@ -19,6 +19,7 @@ #include #include #include + #include #include @@ -217,9 +218,9 @@ std::shared_ptr AllocateMemoryMapWriterAllocation( const std::string &ipc_name = GetIPCName(); int flags = O_RDWR | O_CREAT; int fd = shm_open(ipc_name.c_str(), flags, 0600); - PADDLE_ENFORCE_NE( - fd, -1, platform::errors::Unavailable("File descriptor %s open failed", - ipc_name.c_str())); + PADDLE_ENFORCE_NE(fd, -1, + platform::errors::Unavailable( + "File descriptor %s open failed", ipc_name.c_str())); PADDLE_ENFORCE_EQ(ftruncate(fd, size), 0, platform::errors::Unavailable( "Fruncate a file to a specified length failed!")); @@ -239,9 +240,9 @@ std::shared_ptr RebuildMemoryMapReaderAllocation( flags &= ~O_CREAT; int fd = shm_open(ipc_name.c_str(), flags, 0600); - PADDLE_ENFORCE_NE( - fd, -1, platform::errors::Unavailable("File descriptor %s open failed", - ipc_name.c_str())); + PADDLE_ENFORCE_NE(fd, -1, + platform::errors::Unavailable( + "File descriptor %s open failed", ipc_name.c_str())); void *ptr = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); PADDLE_ENFORCE_NE(ptr, MAP_FAILED, platform::errors::Unavailable( diff --git 
a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 5efbfce7fed..7cc95de8310 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -24,7 +24,6 @@ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" - #include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/split.h" #include "paddle/phi/common/place.h" diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h index 05db0d7341a..3d6500d0f56 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h @@ -14,6 +14,7 @@ #pragma once #include + #include #include // NOLINT #include diff --git a/paddle/fluid/memory/allocation/npu_allocator.cc b/paddle/fluid/memory/allocation/npu_allocator.cc index d69663f636e..1c277c5db84 100644 --- a/paddle/fluid/memory/allocation/npu_allocator.cc +++ b/paddle/fluid/memory/allocation/npu_allocator.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/memory/allocation/npu_allocator.h" + #include + #include "paddle/fluid/platform/device/npu/npu_info.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/memory/allocation/npu_allocator.h b/paddle/fluid/memory/allocation/npu_allocator.h index ff55ba70c52..04832c6fd9b 100644 --- a/paddle/fluid/memory/allocation/npu_allocator.h +++ b/paddle/fluid/memory/allocation/npu_allocator.h @@ -14,6 +14,7 @@ #pragma once #include // NOLINT + #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc index 5e5aea6dab2..ad11d818752 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/pinned_allocator.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/memory/allocation/pinned_allocator.h" + #include "paddle/fluid/memory/stats.h" namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/allocation/retry_allocator.cc b/paddle/fluid/memory/allocation/retry_allocator.cc index d6074975720..2914da4f636 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.cc +++ b/paddle/fluid/memory/allocation/retry_allocator.cc @@ -44,8 +44,9 @@ void RetryAllocator::FreeImpl(phi::Allocation* allocation) { size_t size = allocation->size(); underlying_allocator_->Free(allocation); if (UNLIKELY(waited_allocate_size_)) { - VLOG(10) << "Free " << size << " bytes and notify all waited threads, " - "where waited_allocate_size_ = " + VLOG(10) << "Free " << size + << " bytes and notify all waited threads, " + "where waited_allocate_size_ = " << waited_allocate_size_; cv_.notify_all(); } diff --git a/paddle/fluid/memory/allocation/retry_allocator_test.cc b/paddle/fluid/memory/allocation/retry_allocator_test.cc index cb593f5ab74..e7370036cee 100644 --- a/paddle/fluid/memory/allocation/retry_allocator_test.cc +++ b/paddle/fluid/memory/allocation/retry_allocator_test.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/memory/allocation/retry_allocator.h" #include // NOLINT + #include "gtest/gtest.h" #include "paddle/fluid/memory/allocation/best_fit_allocator.h" #include "paddle/fluid/memory/allocation/cpu_allocator.h" diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index 80877cb670b..81a87ef07b5 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h" + #include "paddle/fluid/platform/profiler/event_tracing.h" #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h index 32d3896e66b..ac4b7c790c9 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/allocation/spin_lock.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/memory/allocation/thread_local_allocator_test.cc b/paddle/fluid/memory/allocation/thread_local_allocator_test.cc index c5378d9f59c..74c83149b4c 100644 --- a/paddle/fluid/memory/allocation/thread_local_allocator_test.cc +++ b/paddle/fluid/memory/allocation/thread_local_allocator_test.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "paddle/fluid/memory/allocation/thread_local_allocator.h" + #include // NOLINT #include // NOLINT + #include "gtest/gtest.h" #include "paddle/fluid/memory/malloc.h" diff --git a/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc index c8b4e980566..07ad149a307 100644 --- a/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h" + #include #include "paddle/fluid/memory/allocation/aligned_allocator.h" -#include "paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h" namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/buffer.h b/paddle/fluid/memory/buffer.h index 99b25ca289c..f42b5262e34 100644 --- a/paddle/fluid/memory/buffer.h +++ b/paddle/fluid/memory/buffer.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index e1077d66c54..244445d59b8 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -168,8 +168,9 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { } void GPUAllocator::Free(void* p, size_t size, size_t index) { - PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( - "The index should be 0, index is %d", index)); + PADDLE_ENFORCE_EQ(index, 0, + platform::errors::InvalidArgument( + "The index should be 0, index is %d", index)); PADDLE_ENFORCE_GE(gpu_alloc_size_, size, platform::errors::InvalidArgument( "The size of memory (%d) to free exceeds the size of " @@ -223,8 +224,9 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) { gpuError_t err; - PADDLE_ENFORCE_EQ(index, 1, platform::errors::InvalidArgument( - "The index should be 1, but got %d", index)); + PADDLE_ENFORCE_EQ(index, 1, + platform::errors::InvalidArgument( + "The index should be 1, but got %d", index)); PADDLE_ENFORCE_GE(cuda_pinnd_alloc_size_, size, platform::errors::InvalidArgument( @@ -310,8 +312,9 @@ void* NPUAllocator::Alloc(size_t* index, size_t size) { void NPUAllocator::Free(void* p, size_t size, size_t index) { VLOG(4) << "Free " << p << " size " << size; - PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( - "The index should be 0, index is %d", index)); + PADDLE_ENFORCE_EQ(index, 0, + platform::errors::InvalidArgument( + "The index should be 0, index is %d", index)); PADDLE_ENFORCE_GE(npu_alloc_size_, size, platform::errors::InvalidArgument( "The size of memory (%d) to free exceeds the size of " @@ -355,8 +358,9 @@ void* NPUPinnedAllocator::Alloc(size_t* index, size_t size) { void NPUPinnedAllocator::Free(void* p, size_t size, size_t index) { aclError err; - PADDLE_ENFORCE_EQ(index, 1, platform::errors::InvalidArgument( - "The index should be 1, but got %d", index)); + PADDLE_ENFORCE_EQ(index, 1, + platform::errors::InvalidArgument( + "The index should be 1, but got %d", index)); PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_, size, platform::errors::InvalidArgument( @@ -425,8 +429,9 @@ void* MLUAllocator::Alloc(size_t* index, size_t size) { } void MLUAllocator::Free(void* p, size_t size, size_t index) { - PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( - "The index should be 0, index is %d", index)); + PADDLE_ENFORCE_EQ(index, 0, + platform::errors::InvalidArgument( + "The index should be 0, index is %d", index)); PADDLE_ENFORCE_GE(mlu_alloc_size_, size, platform::errors::InvalidArgument( "The size of memory (%d) to free exceeds the size of " @@ -469,8 +474,9 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) { void CustomAllocator::Free(void* p, size_t size, size_t index) { VLOG(4) << "CustomAllocator::Free " << p << " size " << size; 
- PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( - "The index should be 0, index is %d", index)); + PADDLE_ENFORCE_EQ(index, 0, + platform::errors::InvalidArgument( + "The index should be 0, index is %d", index)); PADDLE_ENFORCE_GE(plug_alloc_size, size, platform::errors::InvalidArgument( "The size of memory (%d) to free exceeds the size of " diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h index f6ff6282a61..18c2e278f99 100644 --- a/paddle/fluid/memory/detail/system_allocator.h +++ b/paddle/fluid/memory/detail/system_allocator.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include // for size_t + #include namespace paddle { diff --git a/paddle/fluid/memory/get_base_ptr_test.cu b/paddle/fluid/memory/get_base_ptr_test.cu index 188d2f5f420..c8928bda0c9 100644 --- a/paddle/fluid/memory/get_base_ptr_test.cu +++ b/paddle/fluid/memory/get_base_ptr_test.cu @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "gtest/gtest.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 796bdcf0ec2..a7d0fa9781f 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -24,9 +24,9 @@ limitations under the License. */ namespace paddle { namespace memory { -using phi::Allocation; -using allocation::Allocator; using allocation::AllocationPtr; +using allocation::Allocator; +using phi::Allocation; extern std::shared_ptr AllocShared(const platform::Place& place, size_t size); diff --git a/paddle/fluid/memory/memory_stats_test.cc b/paddle/fluid/memory/memory_stats_test.cc index b2fc602e401..081f0d3d78c 100644 --- a/paddle/fluid/memory/memory_stats_test.cc +++ b/paddle/fluid/memory/memory_stats_test.cc @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/memory/memory.h" #include #include + #include "gtest/gtest.h" +#include "paddle/fluid/memory/memory.h" namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/pinned_memory_test.cu b/paddle/fluid/memory/pinned_memory_test.cu index 837c964e2ad..e5958615d01 100644 --- a/paddle/fluid/memory/pinned_memory_test.cu +++ b/paddle/fluid/memory/pinned_memory_test.cu @@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include + #include #include "paddle/fluid/memory/detail/memory_block.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memory.h" - #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/memory/stats.h b/paddle/fluid/memory/stats.h index bb6a3cca664..a30ee161e1c 100644 --- a/paddle/fluid/memory/stats.h +++ b/paddle/fluid/memory/stats.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" @@ -149,15 +150,16 @@ void HostMemoryStatUpdate(const std::string& stat_type, int dev_id, #define DEVICE_MEMORY_STAT_UPDATE(item, id, increment) \ DEVICE_MEMORY_STAT_FUNC(item, id, Update, increment) -#define HOST_MEMORY_STAT_FUNC(item, id, func, ...) 
\ - [&] { \ - PADDLE_ENFORCE_EQ(id, 0, paddle::platform::errors::OutOfRange( \ - "Only support device id 0 for host memory " \ - "stats, not support device id: %d", \ - id)); \ - return paddle::memory::Stat< \ - paddle::memory::HostMemoryStat##item##0>::GetInstance() \ - ->func(__VA_ARGS__); \ +#define HOST_MEMORY_STAT_FUNC(item, id, func, ...) \ + [&] { \ + PADDLE_ENFORCE_EQ(id, 0, \ + paddle::platform::errors::OutOfRange( \ + "Only support device id 0 for host memory " \ + "stats, not support device id: %d", \ + id)); \ + return paddle::memory::Stat< \ + paddle::memory::HostMemoryStat##item##0>::GetInstance() \ + ->func(__VA_ARGS__); \ }() #define HOST_MEMORY_STAT_CURRENT_VALUE(item, id) \ diff --git a/paddle/fluid/memory/stats_test.cc b/paddle/fluid/memory/stats_test.cc index bcaba8e9108..73a6b921ca8 100644 --- a/paddle/fluid/memory/stats_test.cc +++ b/paddle/fluid/memory/stats_test.cc @@ -13,11 +13,13 @@ // limitations under the License. #include "paddle/fluid/memory/stats.h" + #include #include #include #include #include + #include "gtest/gtest.h" namespace paddle { diff --git a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu index 3bf873bcfc2..5b5350c34fb 100644 --- a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu +++ b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu @@ -25,6 +25,7 @@ #ifdef PADDLE_WITH_CUDA #include #include + #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #endif @@ -47,9 +48,9 @@ __global__ void add_kernel(int *x, int *y, int n) { void CheckMemLeak(const platform::CUDAPlace &place) { uint64_t cuda_malloc_size = platform::RecordedGpuMallocSize(place.GetDeviceId()); - ASSERT_EQ(cuda_malloc_size, 0) << "Found " << cuda_malloc_size - << " bytes memory that not released yet," - << " there may be a memory leak problem"; + ASSERT_EQ(cuda_malloc_size, 0) + << "Found " << cuda_malloc_size << " bytes memory that not released yet," + << " there may be a memory leak problem"; } TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) { diff --git a/paddle/fluid/operators/abs_op.cc b/paddle/fluid/operators/abs_op.cc index b9517e1cc86..86b60da341e 100644 --- a/paddle/fluid/operators/abs_op.cc +++ b/paddle/fluid/operators/abs_op.cc @@ -16,6 +16,7 @@ #include #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/activation_cudnn_op.cu.cc b/paddle/fluid/operators/activation_cudnn_op.cu.cc index b4a97e24cf2..b9d5e5fbe5e 100644 --- a/paddle/fluid/operators/activation_cudnn_op.cu.cc +++ b/paddle/fluid/operators/activation_cudnn_op.cu.cc @@ -20,8 +20,8 @@ namespace paddle { namespace operators { using framework::Tensor; using platform::ActivationDescriptor; -using platform::TensorDescriptor; using platform::CUDADeviceContext; +using platform::TensorDescriptor; #ifdef PADDLE_WITH_HIP #define GPUDNN_ACTIVATION_RELU miopenActivationRELU diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 6905f3d7954..e500992e1b5 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -1454,18 +1454,19 @@ namespace plat = paddle::platform; REGISTER_OPERATOR(KERNEL_TYPE##_grad, ops::ActivationOpGrad, \ ops::ActivationGradOpInplaceInferer); -#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, op_name, functor, \ - grad_functor) \ - REGISTER_OP_CPU_KERNEL( \ - act_type, ops::ActivationKernel>, 
\ - ops::ActivationKernel>); \ - REGISTER_OP_CPU_KERNEL( \ - act_type##_grad, \ - ops::ActivationGradKernel>, \ - ops::ActivationGradKernel>, \ + ops::ActivationKernel>); \ + REGISTER_OP_CPU_KERNEL( \ + act_type##_grad, \ + ops::ActivationGradKernel>, \ + ops::ActivationGradKernel>); FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP); @@ -1781,21 +1782,18 @@ REGISTER_OP_VERSION(hard_shrink) "((x < -threshold) + (x > threshold)); after checkpoint: out = " "x * (((x < -threshold) + (x > threshold)) > 0)")); -REGISTER_OP_VERSION(softplus) - .AddCheckpoint( - R"ROC(add new attributes [beta] and [threshold], and the formula is changed to " +REGISTER_OP_VERSION(softplus).AddCheckpoint( + R"ROC(add new attributes [beta] and [threshold], and the formula is changed to " " softplus(x) = \\frac{1}{beta} * \\log(1 + e^{beta * x}) \\\\ \\text{For numerical" " stability, the implementation reverts to the linear function when: beta * x > threshold.})ROC", - paddle::framework::compatible::OpVersionDesc() - .NewAttr("beta", "The beta value of the new formula", 1.0f) - .NewAttr("threshold", "The threshold value of the new formula", - 20.0f)); - -REGISTER_OP_VERSION(mish) - .AddCheckpoint( - R"ROC(add new attributes [use_mkldnn], and when computing softplus the formula is changed as the new veriosn of softplus)ROC", - paddle::framework::compatible::OpVersionDesc().NewAttr( - "use_mkldnn", "(bool, default false) Only used in mkldnn kernel", - false)); + paddle::framework::compatible::OpVersionDesc() + .NewAttr("beta", "The beta value of the new formula", 1.0f) + .NewAttr("threshold", "The threshold value of the new formula", 20.0f)); + +REGISTER_OP_VERSION(mish).AddCheckpoint( + R"ROC(add new attributes [use_mkldnn], and when computing softplus the formula is changed as the new veriosn of softplus)ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "use_mkldnn", "(bool, default false) Only used in mkldnn kernel", + false)); /* ========================================================================== */ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 5f3916a65e7..81f5e24abfe 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -12,19 +12,20 @@ limitations under the License. 
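The `HOST_MEMORY_STAT_FUNC` and `REGISTER_ACTIVATION_CPU_KERNEL` hunks above look noisy but mostly shift the trailing backslashes: once a multi-line macro body is re-wrapped, clang-format re-aligns every escaped newline to one shared column (its `AlignEscapedNewlines` behavior), so each `\` moves even on lines whose code is unchanged. A compact illustration:

// All continuation backslashes sit in one column, recomputed
// whenever the widest line of the macro body changes.
#define CHECK_NONZERO(x)         \
  do {                           \
    if ((x) == 0) {              \
      /* handle the zero case */ \
    }                            \
  } while (0)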
*/ #pragma once #include + #include +#include #include #include #include #include #include - -#include #ifndef _USE_MATH_DEFINES #define _USE_MATH_DEFINES #endif #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" @@ -362,9 +363,8 @@ struct Relu6GradFunctor : public BaseActivationFunctor { typename dX> void operator()(Device d, X x, Out out, dOut dout, dX dx) const { dx.device(d) = - dout * - ((out > static_cast(0)) * (out < static_cast(threshold))) - .template cast(); + dout * ((out > static_cast(0)) * (out < static_cast(threshold))) + .template cast(); } static constexpr ActBwdOpFwdDeps FwdDeps() { diff --git a/paddle/fluid/operators/activation_op_xpu.cc b/paddle/fluid/operators/activation_op_xpu.cc index e950f952c24..4127e4b1b10 100644 --- a/paddle/fluid/operators/activation_op_xpu.cc +++ b/paddle/fluid/operators/activation_op_xpu.cc @@ -253,8 +253,9 @@ struct XPUHardSwishFunctor : public BaseActivationFunctor { PADDLE_ENFORCE_EQ(threshold, 6.0f, platform::errors::External( "Not support threshold [%f] in XPU", threshold)); - PADDLE_ENFORCE_EQ(scale, 6.0f, platform::errors::External( - "Not support scale [%f] in XPU", scale)); + PADDLE_ENFORCE_EQ( + scale, 6.0f, + platform::errors::External("Not support scale [%f] in XPU", scale)); PADDLE_ENFORCE_EQ( offset, 3.0f, platform::errors::External("Not support offset [%f] in XPU", offset)); @@ -273,8 +274,9 @@ struct XPUHardSwishGradFunctor : public BaseActivationFunctor { PADDLE_ENFORCE_EQ(threshold, 6.0f, platform::errors::External( "Not support threshold [%f] in XPU", threshold)); - PADDLE_ENFORCE_EQ(scale, 6.0f, platform::errors::External( - "Not support scale [%f] in XPU", scale)); + PADDLE_ENFORCE_EQ( + scale, 6.0f, + platform::errors::External("Not support scale [%f] in XPU", scale)); PADDLE_ENFORCE_EQ( offset, 3.0f, platform::errors::External("Not support offset [%f] in XPU", offset)); @@ -377,10 +379,12 @@ struct XPUPowGradFunctor : public BaseActivationFunctor { auto x_dims = phi::vectorize(x->dims()); auto dy_dims = phi::vectorize(dOut->dims()); auto dx_dims = phi::vectorize(dX->dims()); - PADDLE_ENFORCE_EQ(x_dims, dy_dims, platform::errors::PreconditionNotMet( - "x_dims should match dy_dims.")); - PADDLE_ENFORCE_EQ(x_dims, dx_dims, platform::errors::PreconditionNotMet( - "x_dims should match dx_dims.")); + PADDLE_ENFORCE_EQ( + x_dims, dy_dims, + platform::errors::PreconditionNotMet("x_dims should match dy_dims.")); + PADDLE_ENFORCE_EQ( + x_dims, dx_dims, + platform::errors::PreconditionNotMet("x_dims should match dx_dims.")); float pow_factor = ctx.Attr("factor"); auto xpu_context = diff --git a/paddle/fluid/operators/add_position_encoding_op.cc b/paddle/fluid/operators/add_position_encoding_op.cc index e5fcd270eb8..4d2c23e2bb4 100644 --- a/paddle/fluid/operators/add_position_encoding_op.cc +++ b/paddle/fluid/operators/add_position_encoding_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/add_position_encoding_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/addmm_op.cc b/paddle/fluid/operators/addmm_op.cc index 716a2e40179..d0f0a6ae0c6 100644 --- a/paddle/fluid/operators/addmm_op.cc +++ b/paddle/fluid/operators/addmm_op.cc @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/affine_channel_op.cc b/paddle/fluid/operators/affine_channel_op.cc index 1b584fc5578..cd6798be2b2 100644 --- a/paddle/fluid/operators/affine_channel_op.cc +++ b/paddle/fluid/operators/affine_channel_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/affine_channel_op.cu b/paddle/fluid/operators/affine_channel_op.cu index cf4041f721a..87a71130b85 100644 --- a/paddle/fluid/operators/affine_channel_op.cu +++ b/paddle/fluid/operators/affine_channel_op.cu @@ -81,13 +81,13 @@ class AffineChannelCUDAKernel : public framework::OpKernel { int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); grid = std::min(std::max(max_threads / block, 1), grid); if (layout == framework::DataLayout::kNCHW) { - KeAffineChannelCUDA<<>>( - x_d, scale_d, bias_d, C, HxW, num, y_d); + KeAffineChannelCUDA + <<>>(x_d, scale_d, bias_d, C, HxW, + num, y_d); } else { - KeAffineChannelCUDA<<>>( - x_d, scale_d, bias_d, C, HxW, num, y_d); + KeAffineChannelCUDA + <<>>(x_d, scale_d, bias_d, C, HxW, + num, y_d); } } }; @@ -169,29 +169,29 @@ class AffineChannelGradCUDAKernel : public framework::OpKernel { if (layout == framework::DataLayout::kNCHW) { if (dscale && dbias) { const T* x_d = x->data(); - AffineChannelScaleBiasGradientCUDAKernel< - T, block, framework::DataLayout::kNCHW><<>>( - dy_d, x_d, N, C, HxW, ds_d, db_d); + AffineChannelScaleBiasGradientCUDAKernel + <<>>(dy_d, x_d, N, C, HxW, ds_d, + db_d); } if (dx) { - KeAffineChannelCUDA<<>>( - dy_d, s_d, nullptr, C, HxW, num, dx_d); + KeAffineChannelCUDA + <<>>(dy_d, s_d, nullptr, C, HxW, + num, dx_d); } } else { if (dscale && dbias) { const T* x_d = x->data(); - AffineChannelScaleBiasGradientCUDAKernel< - T, block, framework::DataLayout::kNHWC><<>>( - dy_d, x_d, N, C, HxW, ds_d, db_d); + AffineChannelScaleBiasGradientCUDAKernel + <<>>(dy_d, x_d, N, C, HxW, ds_d, + db_d); } if (dx) { - KeAffineChannelCUDA<<>>( - dy_d, s_d, nullptr, C, HxW, num, dx_d); + KeAffineChannelCUDA + <<>>(dy_d, s_d, nullptr, C, HxW, + num, dx_d); } } } diff --git a/paddle/fluid/operators/affine_channel_op_xpu.cc b/paddle/fluid/operators/affine_channel_op_xpu.cc index db3eedea7ca..4de233b184a 100644 --- a/paddle/fluid/operators/affine_channel_op_xpu.cc +++ b/paddle/fluid/operators/affine_channel_op_xpu.cc @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include #include #include + #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc b/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc index 31801b14564..6fca4afabd9 100644 --- a/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc +++ b/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc @@ -65,8 +65,9 @@ class CUDNNAffineGridOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( platform::dynload::cudnnSpatialTfGridGeneratorForward( handle, cudnn_st_desc, theta_data, output_data), - 0, platform::errors::Fatal("Some errors has occurred " - "during forward computation in cudnn.")); + 0, + platform::errors::Fatal("Some errors has occurred " + "during forward computation in cudnn.")); } }; diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc index e311d21bb54..d7a49a965a0 100644 --- a/paddle/fluid/operators/affine_grid_op.cc +++ b/paddle/fluid/operators/affine_grid_op.cc @@ -13,9 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/affine_grid_op.h" + #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" diff --git a/paddle/fluid/operators/affine_grid_op.cu b/paddle/fluid/operators/affine_grid_op.cu index eeb4b3bc8a7..29a540bdc2c 100644 --- a/paddle/fluid/operators/affine_grid_op.cu +++ b/paddle/fluid/operators/affine_grid_op.cu @@ -42,8 +42,8 @@ struct Linspace { auto stream = ctx.cuda_device_context().stream(); int block = 512; int grid = (count + block - 1) / block; - LinspaceKernel<<>>(start, slice, count, - number_data); + LinspaceKernel + <<>>(start, slice, count, number_data); } }; diff --git a/paddle/fluid/operators/affine_grid_op.h b/paddle/fluid/operators/affine_grid_op.h index 21540de2b64..cbf70b9135b 100644 --- a/paddle/fluid/operators/affine_grid_op.h +++ b/paddle/fluid/operators/affine_grid_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc b/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc index 68f6e3b2f3b..78bacc30161 100644 --- a/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc +++ b/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc @@ -14,6 +14,7 @@ limitations under the License. 
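The affine_grid_op.cu hunk above reflows a kernel launch but keeps the usual configuration arithmetic: grid = (count + block - 1) / block is a ceiling division, so grid * block covers all count elements while each thread guards against the round-up overshoot. A minimal CUDA sketch of that pattern (demo_fill and launch_demo_fill are hypothetical names, not Paddle kernels):

#include <cuda_runtime.h>

// One thread per element; the bounds check absorbs the round-up slack.
__global__ void demo_fill(float* out, float value, int count) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < count) {
    out[idx] = value;
  }
}

void launch_demo_fill(float* out, float value, int count,
                      cudaStream_t stream) {
  const int block = 512;
  const int grid = (count + block - 1) / block;  // ceiling division
  demo_fill<<<grid, block, 0, stream>>>(out, value, count);
}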
*/ #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu index 2f6977b9e2d..7771902c02b 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu @@ -143,10 +143,10 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel { int blocks_per_grid = (total_num + elements_per_block - 1) / elements_per_block; VLOG(3) << "launch kernel"; - CheckFiniteAndUnscale< - T, MPDType><<>>( - d_xs, inverse_scale_v, xs_size, d_starts, found_inf_data, d_outs); + CheckFiniteAndUnscale + <<>>(d_xs, inverse_scale_v, xs_size, d_starts, + found_inf_data, d_outs); VLOG(3) << "finish kernel"; } }; diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc index 2862d923076..46572579e08 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc @@ -20,6 +20,7 @@ limitations under the License. */ #include #include #include + #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc index 30266d3eec0..1d3e5e5162c 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc @@ -65,13 +65,15 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel { int r = xpu::isfinite(dev_ctx.x_context(), reinterpret_cast(x->data()), is_finite.data(), x->numel()); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( - "XPU API(isfinite) return wrong " - "value[%d %s]", - r, XPUAPIErrorMsg[r])); - r = xpu::logical_not(dev_ctx.x_context(), reinterpret_cast( - is_finite.data()), - is_finite.data(), x->numel()); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(isfinite) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + r = xpu::logical_not( + dev_ctx.x_context(), + reinterpret_cast(is_finite.data()), + is_finite.data(), x->numel()); PADDLE_ENFORCE_EQ( r, XPU_SUCCESS, platform::errors::External("XPU API(logical_not) return wrong " @@ -79,10 +81,11 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel { r, XPUAPIErrorMsg[r])); r = xpu::any(dev_ctx.x_context(), is_finite.data(), found_inf_data, x->numel()); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( - "XPU API(any) return wrong " - "value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(any) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); if (dev_ctx.x_context()->xpu_stream) { dev_ctx.Wait(); } @@ -106,36 +109,40 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel { int r = xpu::cast_v2(dev_ctx.x_context(), reinterpret_cast(x->data()), float_x.data(), x->numel()); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( - "XPU API(cast_v2) return wrong " - "value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(cast_v2) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); r = 
xpu::scale(dev_ctx.x_context(), float_x.data(), float_out.data(), x->numel(), false, inverse_scale, 0.0); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( - "XPU API(scale) return wrong " - "value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(scale) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); r = xpu::cast_v2(dev_ctx.x_context(), float_out.data(), reinterpret_cast(out->data()), out->numel()); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( - "XPU API(cast_v2) return wrong " - "value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(cast_v2) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); } else { int r = xpu::scale(dev_ctx.x_context(), reinterpret_cast(x->data()), reinterpret_cast(out->data()), x->numel(), false, inverse_scale, 0.0); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( - "XPU API(scale) return wrong " - "value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(scale) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); } } if (dev_ctx.x_context()->xpu_stream) { diff --git a/paddle/fluid/operators/amp/clear_float_status_op_npu.cc b/paddle/fluid/operators/amp/clear_float_status_op_npu.cc index e5a2d93e32f..c102bd2ba47 100644 --- a/paddle/fluid/operators/amp/clear_float_status_op_npu.cc +++ b/paddle/fluid/operators/amp/clear_float_status_op_npu.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/operators/amp/get_float_status_op_npu.cc b/paddle/fluid/operators/amp/get_float_status_op_npu.cc index 8109a1ff43f..0c118761650 100644 --- a/paddle/fluid/operators/amp/get_float_status_op_npu.cc +++ b/paddle/fluid/operators/amp/get_float_status_op_npu.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cc b/paddle/fluid/operators/amp/update_loss_scaling_op.cc index 8354650df02..baf742b0b40 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cc @@ -13,9 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/amp/update_loss_scaling_op.h" + #include #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cu b/paddle/fluid/operators/amp/update_loss_scaling_op.cu index 43f8f84578c..81f98643441 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.cu +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
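The check_finite_and_unscale XPU hunks above repeat one shape: every xpu:: call returns an int status that is immediately compared against XPU_SUCCESS, with XPUAPIErrorMsg[r] supplying the failure text. A standalone sketch of that call-then-check pattern, assuming hypothetical demo_xpu_scale/DemoErrorMsg/DemoEnforceOk stand-ins for the XPU API and the enforce machinery:

#include <cstdio>
#include <cstdlib>
#include <map>
#include <string>

constexpr int kDemoSuccess = 0;
const std::map<int, std::string> DemoErrorMsg = {{0, "success"},
                                                 {1, "invalid param"}};

// Stand-in for an XPU kernel wrapper that reports status via its result.
int demo_xpu_scale(const float* x, float* y, int n, float factor) {
  if (x == nullptr || y == nullptr || n < 0) return 1;
  for (int i = 0; i < n; ++i) y[i] = x[i] * factor;
  return kDemoSuccess;
}

// Mirrors PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, ...): check at the call site.
void DemoEnforceOk(int r, const char* api) {
  if (r != kDemoSuccess) {
    std::fprintf(stderr, "XPU API(%s) return wrong value[%d %s]\n", api, r,
                 DemoErrorMsg.at(r).c_str());
    std::abort();
  }
}

int main() {
  float x[4] = {1.f, 2.f, 3.f, 4.f}, y[4];
  int r = demo_xpu_scale(x, y, 4, 0.5f);
  DemoEnforceOk(r, "scale");
  return y[3] == 2.f ? 0 : 1;
}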
*/ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/amp/update_loss_scaling_op.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.h b/paddle/fluid/operators/amp/update_loss_scaling_op.h index 41eb94247f5..f4c6b6f1f7d 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.h +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.h @@ -19,6 +19,7 @@ #endif // PADDLE_WITH_CUDA && __NVCC__ #include #include + #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc index f9a93a47ff2..da7e23c4620 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc @@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/amp/update_loss_scaling_op.h" #include #include + #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/amp/update_loss_scaling_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" DECLARE_int32(min_loss_scaling); diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc index fe03d93f448..8f57e00fe11 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc @@ -13,12 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/amp/update_loss_scaling_op.h" #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/amp/update_loss_scaling_op.h" #include "paddle/fluid/platform/float16.h" namespace paddle { @@ -59,10 +60,11 @@ class UpdateLossScalingXPUKernel : public framework::OpKernel { r = xpu::constant(dev_ctx.x_context(), reinterpret_cast(out_data), num, XPUTyp(0.0)); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( - "XPU API(constant) return wrong " - "value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(constant) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); } } const bool stop_update = ctx.Attr("stop_update"); diff --git a/paddle/fluid/operators/angle_op.h b/paddle/fluid/operators/angle_op.h index 116a8053db3..ace345465dc 100644 --- a/paddle/fluid/operators/angle_op.h +++ b/paddle/fluid/operators/angle_op.h @@ -17,11 +17,11 @@ #define _USE_MATH_DEFINES #endif #include -#include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/arg_max_op.cc b/paddle/fluid/operators/arg_max_op.cc index c5e4188ca2d..63fd27a1edf 100644 --- a/paddle/fluid/operators/arg_max_op.cc +++ b/paddle/fluid/operators/arg_max_op.cc @@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/arg_min_max_op_base.h" - -#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/unary.h" @@ -28,20 +27,18 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, ArgMaxInferShapeFunctor); -REGISTER_OP_VERSION(arg_max) - .AddCheckpoint( - R"ROC( +REGISTER_OP_VERSION(arg_max).AddCheckpoint( + R"ROC( Upgrade argmax add a new attribute [flatten] and modify the attribute of dtype)ROC", - paddle::framework::compatible::OpVersionDesc() - .NewAttr("flatten", - "In order to compute the argmax over the flattened array " - "when the " - "argument `axis` in python API is None.", - false) - .ModifyAttr( - "dtype", - "Change the default value of dtype from -1 to 3" - ", means return the int64 indices directly. The rearse why " - "changing the default value is that the int64 value in " - "VarType is 3 in the frameworke.proto.", - 3)); + paddle::framework::compatible::OpVersionDesc() + .NewAttr("flatten", + "In order to compute the argmax over the flattened array " + "when the " + "argument `axis` in python API is None.", + false) + .ModifyAttr("dtype", + "Change the default value of dtype from -1 to 3" + ", means return the int64 indices directly. 
The rearse why " + "changing the default value is that the int64 value in " + "VarType is 3 in the frameworke.proto.", + 3)); diff --git a/paddle/fluid/operators/arg_min_max_op_base.h b/paddle/fluid/operators/arg_min_max_op_base.h index 585341beea1..194a3070bf6 100644 --- a/paddle/fluid/operators/arg_min_max_op_base.h +++ b/paddle/fluid/operators/arg_min_max_op_base.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/arg_min_op.cc b/paddle/fluid/operators/arg_min_op.cc index fb3abd01af8..c995d56cf6b 100644 --- a/paddle/fluid/operators/arg_min_op.cc +++ b/paddle/fluid/operators/arg_min_op.cc @@ -27,20 +27,18 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, ArgMinInferShapeFunctor); -REGISTER_OP_VERSION(arg_min) - .AddCheckpoint( - R"ROC( +REGISTER_OP_VERSION(arg_min).AddCheckpoint( + R"ROC( Upgrade argmin add a new attribute [flatten] and modify the attribute of dtype)ROC", - paddle::framework::compatible::OpVersionDesc() - .NewAttr("flatten", - "In order to compute the argmin over the flattened array " - "when the " - "argument `axis` in python API is None.", - false) - .ModifyAttr( - "dtype", - "Change the default value of dtype from -1 to 3" - ", means return the int64 indices directly. The rearse why " - "changing the default value is that the int64 value in " - "VarType is 3 in the frameworke.proto.", - 3)); + paddle::framework::compatible::OpVersionDesc() + .NewAttr("flatten", + "In order to compute the argmin over the flattened array " + "when the " + "argument `axis` in python API is None.", + false) + .ModifyAttr("dtype", + "Change the default value of dtype from -1 to 3" + ", means return the int64 indices directly. The rearse why " + "changing the default value is that the int64 value in " + "VarType is 3 in the frameworke.proto.", + 3)); diff --git a/paddle/fluid/operators/array_operator.h b/paddle/fluid/operators/array_operator.h index af44a77c813..0cc3b695aef 100644 --- a/paddle/fluid/operators/array_operator.h +++ b/paddle/fluid/operators/array_operator.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc index 1db3592b1cf..f0824695a06 100644 --- a/paddle/fluid/operators/array_to_lod_tensor_op.cc +++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/core/lod_utils.h" diff --git a/paddle/fluid/operators/ascend_trigger_op.h b/paddle/fluid/operators/ascend_trigger_op.h index eaa79da2ba8..d1eaa00c2a3 100644 --- a/paddle/fluid/operators/ascend_trigger_op.h +++ b/paddle/fluid/operators/ascend_trigger_op.h @@ -15,6 +15,7 @@ #pragma once #include #include + #include "paddle/fluid/framework/op_registry.h" #ifdef PADDLE_WITH_ASCEND #include "paddle/fluid/framework/fleet/ascend_wrapper.h" diff --git a/paddle/fluid/operators/assign_op_xpu.cc b/paddle/fluid/operators/assign_op_xpu.cc index b95be3096f0..7d03982f6ad 100644 --- a/paddle/fluid/operators/assign_op_xpu.cc +++ b/paddle/fluid/operators/assign_op_xpu.cc @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/assign_op.h" - #include +#include "paddle/fluid/operators/assign_op.h" + namespace paddle { namespace framework { class OpDesc; diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index bf7d609370a..22db7d9e982 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/attention_lstm_op.h" + #include + #include "paddle/fluid/platform/cpu_info.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/cpu_vec.h" @@ -62,8 +64,9 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { "LSTMWeight dims should be (%d + %d) * %d.", D, M, 4 * D)); auto b_dims = ctx->GetInputDim("LSTMBias"); - PADDLE_ENFORCE_EQ(b_dims.size(), 2, platform::errors::InvalidArgument( - "Input(LSTMBias)'s rank must be 2.")); + PADDLE_ENFORCE_EQ( + b_dims.size(), 2, + platform::errors::InvalidArgument("Input(LSTMBias)'s rank must be 2.")); PADDLE_ENFORCE_EQ(b_dims[0], 1, platform::errors::InvalidArgument( "LSTMBias dims should be 1 x %d.", 4 * D)); @@ -72,11 +75,13 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { "LSTMBias dims should be 1 x %d.", 4 * D)); auto c_dims = ctx->GetInputDim("C0"); - PADDLE_ENFORCE_EQ(c_dims.size(), 2, platform::errors::InvalidArgument( - "Input(C0)'s rank must be 2.")); + PADDLE_ENFORCE_EQ( + c_dims.size(), 2, + platform::errors::InvalidArgument("Input(C0)'s rank must be 2.")); if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(c_dims[1], D, platform::errors::InvalidArgument( - "C0 dims should be N x %d.", D)); + PADDLE_ENFORCE_EQ( + c_dims[1], D, + platform::errors::InvalidArgument("C0 dims should be N x %d.", D)); } if (ctx->HasInput("H0")) { @@ -126,10 +131,12 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ(dims.size(), 2, platform::errors::InvalidArgument( "Input(AttentionScalar)'s rank must be 2.")); - PADDLE_ENFORCE_EQ(dims[0], 1, platform::errors::InvalidArgument( - "AttentionScalar shapes must be 1 * 1.")); - PADDLE_ENFORCE_EQ(dims[1], 1, platform::errors::InvalidArgument( - "AttentionScalar shapes must be 1 * 1.")); + PADDLE_ENFORCE_EQ(dims[0], 1, + platform::errors::InvalidArgument( + "AttentionScalar shapes must be 1 * 1.")); + PADDLE_ENFORCE_EQ(dims[1], 1, + platform::errors::InvalidArgument( + "AttentionScalar shapes must be 1 * 1.")); } if 
(ctx->HasInput("AttentionScalarBias")) { @@ -332,14 +339,15 @@ class AttentionLSTMKernel : public framework::OpKernel { int len = x_lod[0][i + 1] - x_lod[0][i]; max_seq_len = max_seq_len < len ? len : max_seq_len; } - PADDLE_ENFORCE_EQ(x_lod.size(), 1UL, platform::errors::InvalidArgument( - "Input(X)'s lod size must be 1.")); + PADDLE_ENFORCE_EQ( + x_lod.size(), 1UL, + platform::errors::InvalidArgument("Input(X)'s lod size must be 1.")); PADDLE_ENFORCE_EQ( c0->dims()[0], N, platform::errors::InvalidArgument("C0 dims should be %d x %d.", N, D)); fc_out->Resize({max_seq_len, 1}); - std::function act_gate, act_cell, act_cand; + std::function act_gate, act_cell, act_cand; auto& act_gate_str = ctx.Attr("gate_activation"); auto& act_cell_str = ctx.Attr("cell_activation"); auto& act_cand_str = ctx.Attr("candidate_activation"); diff --git a/paddle/fluid/operators/average_accumulates_op.h b/paddle/fluid/operators/average_accumulates_op.h index 289dda56b19..de6eca3903f 100644 --- a/paddle/fluid/operators/average_accumulates_op.h +++ b/paddle/fluid/operators/average_accumulates_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/batch_fc_op.cc b/paddle/fluid/operators/batch_fc_op.cc index 952625bcb6e..2d2deae69a7 100644 --- a/paddle/fluid/operators/batch_fc_op.cc +++ b/paddle/fluid/operators/batch_fc_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/batch_fc_op.h" + #include namespace paddle { @@ -42,8 +43,9 @@ class BatchFCOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(input_dims.size(), 3, platform::errors::InvalidArgument( "Input of BatchFCOp should have 3D.")); - PADDLE_ENFORCE_EQ(w_dims.size(), 3, platform::errors::InvalidArgument( - "W of BatchFCOp should have 3D.")); + PADDLE_ENFORCE_EQ( + w_dims.size(), 3, + platform::errors::InvalidArgument("W of BatchFCOp should have 3D.")); PADDLE_ENFORCE_EQ( input_dims[0], w_dims[0], platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/batch_fc_op.cu b/paddle/fluid/operators/batch_fc_op.cu index ddedf0172be..5843acb4fdd 100644 --- a/paddle/fluid/operators/batch_fc_op.cu +++ b/paddle/fluid/operators/batch_fc_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/batch_fc_op.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 2663a081011..67384338d76 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -13,9 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/batch_norm_op.h" + #include #include #include + #include "paddle/fluid/framework/data_layout.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -167,10 +169,11 @@ framework::OpKernelType BatchNormOp::GetExpectedKernelType( bn_param_type, framework::TransToProtoVarType(ctx.Input("Mean")->dtype()), platform::errors::InvalidArgument("Mean input should be of float type")); - PADDLE_ENFORCE_EQ(bn_param_type, framework::TransToProtoVarType( - ctx.Input("Variance")->dtype()), - platform::errors::InvalidArgument( - "Variance input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType(ctx.Input("Variance")->dtype()), + platform::errors::InvalidArgument( + "Variance input should be of float type")); // TODO(pzelazko-intel): enable MKLDNN layout when it's ready framework::LibraryType library = framework::LibraryType::kPlain; diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h index d274e8d2c00..b82b49e5cd5 100644 --- a/paddle/fluid/operators/batch_norm_op.h +++ b/paddle/fluid/operators/batch_norm_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/layout_utils.h" diff --git a/paddle/fluid/operators/batch_norm_op_mlu.cc b/paddle/fluid/operators/batch_norm_op_mlu.cc index 6507890a8b5..6dff315aa6a 100644 --- a/paddle/fluid/operators/batch_norm_op_mlu.cc +++ b/paddle/fluid/operators/batch_norm_op_mlu.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" namespace paddle { diff --git a/paddle/fluid/operators/batch_norm_op_npu.cc b/paddle/fluid/operators/batch_norm_op_npu.cc index ae03ecbcb16..725b7f3848f 100644 --- a/paddle/fluid/operators/batch_norm_op_npu.cc +++ b/paddle/fluid/operators/batch_norm_op_npu.cc @@ -113,8 +113,9 @@ class NPUBatchNormOpKernel : public framework::OpKernel { runner_reduce.Run(stream); const auto &runner_update = NpuOpRunner( - "BNTrainingUpdate", {x_tensor, sum, square_sum, *scale, *bias, - *running_mean, *running_var}, + "BNTrainingUpdate", + {x_tensor, sum, square_sum, *scale, *bias, *running_mean, + *running_var}, {y_tesnor, *mean_out, *variance_out, *saved_mean, *saved_variance}, {{"factor", momentum}, {"epsilon", epsilon}}); runner_update.Run(stream); @@ -216,10 +217,11 @@ class NPUBatchNormGradOpKernel : public framework::OpKernel { {dx_tensor}, {{"epsilon", epsilon}}); runner_infer.Run(stream); } else { - const auto &runner_reduce = NpuOpRunner( - "BNTrainingReduceGrad", {dy_tensor, x_tensor, *d_scale, *d_bias, - *scale, *saved_mean, *saved_inv_variance}, - {dx_tensor}, {{"epsilon", epsilon}}); + const auto &runner_reduce = + NpuOpRunner("BNTrainingReduceGrad", + {dy_tensor, x_tensor, *d_scale, *d_bias, *scale, + *saved_mean, *saved_inv_variance}, + {dx_tensor}, {{"epsilon", epsilon}}); runner_reduce.Run(stream); } } diff --git a/paddle/fluid/operators/batch_norm_op_xpu.cc b/paddle/fluid/operators/batch_norm_op_xpu.cc index 0893324c602..3ade2f36ad8 100644 --- a/paddle/fluid/operators/batch_norm_op_xpu.cc +++ b/paddle/fluid/operators/batch_norm_op_xpu.cc @@ -13,10 +13,11 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/batch_norm_op.h" #include #include +#include "paddle/fluid/operators/batch_norm_op.h" + namespace paddle { namespace operators { @@ -128,8 +129,9 @@ static int calculate_inv_BN_Y(xpu::Context *ctx, T *x, const T *scale, const T *bias, const T *mean, const T *variance, const int N, const int C, const int M, const T *y) { - PADDLE_ENFORCE_EQ(x, y, platform::errors::InvalidArgument( - "X and Y should be inplaced in inplace mode")); + PADDLE_ENFORCE_EQ(x, y, + platform::errors::InvalidArgument( + "X and Y should be inplaced in inplace mode")); std::vector tensor_shape_vec({N, C, M}); std::vector array_shape_vec({1, C, 1}); // y - bias @@ -207,8 +209,9 @@ class BatchNormGradXPUKernel : public framework::OpKernel { is_inplace = false; if (d_x) { PADDLE_ENFORCE_NE( - d_x, d_y, platform::errors::InvalidArgument( - "X@GRAD and Y@GRAD inplaced in non-inplace mode")); + d_x, d_y, + platform::errors::InvalidArgument( + "X@GRAD and Y@GRAD inplaced in non-inplace mode")); } } @@ -275,11 +278,12 @@ class BatchNormGradXPUKernel : public framework::OpKernel { int r1 = calculate_inv_var(dev_ctx.x_context(), global_var->data(), epsilon, C, epsilon_data, global_inv_std_data); - PADDLE_ENFORCE_EQ(r1, XPU_SUCCESS, platform::errors::External( - "XPU API(batch_norm_grad " - "calculate_inv_var function) " - "return wrong value[%d %s]", - r1, XPUAPIErrorMsg[r1])); + PADDLE_ENFORCE_EQ( + r1, XPU_SUCCESS, + platform::errors::External("XPU API(batch_norm_grad " + "calculate_inv_var function) " + "return wrong value[%d %s]", + r1, XPUAPIErrorMsg[r1])); } auto px = *x; auto *inv_std_data = @@ -290,11 +294,12 @@ class BatchNormGradXPUKernel : public framework::OpKernel { dev_ctx.x_context(), px.mutable_data(ctx.GetPlace()), scale->data(), bias->data(), mean_data, inv_std_data, N, C, H * W, x->data()); - PADDLE_ENFORCE_EQ(r2, XPU_SUCCESS, platform::errors::External( - "XPU API(batch_norm_grad " - "calculate_inv_BN_Y function) " - "return wrong value[%d %s]", - r2, XPUAPIErrorMsg[r2])); + PADDLE_ENFORCE_EQ( + r2, XPU_SUCCESS, + platform::errors::External("XPU API(batch_norm_grad " + "calculate_inv_BN_Y function) " + "return wrong value[%d %s]", + r2, XPUAPIErrorMsg[r2])); } int r3; @@ -319,10 +324,11 @@ class BatchNormGradXPUKernel : public framework::OpKernel { scale_data, batch_mean->data(), batch_inv_std->data(), d_scale_data, d_bias_data, is_nchw); } - PADDLE_ENFORCE_EQ(r3, XPU_SUCCESS, platform::errors::External( - "XPU API(batch_norm_grad) return " - "wrong value[%d %s]", - r3, XPUAPIErrorMsg[r3])); + PADDLE_ENFORCE_EQ( + r3, XPU_SUCCESS, + platform::errors::External("XPU API(batch_norm_grad) return " + "wrong value[%d %s]", + r3, XPUAPIErrorMsg[r3])); } }; diff --git a/paddle/fluid/operators/batch_size_like.h b/paddle/fluid/operators/batch_size_like.h index facb4cd8254..1cc6e364677 100644 --- a/paddle/fluid/operators/batch_size_like.h +++ b/paddle/fluid/operators/batch_size_like.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include #include + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc index 3fae65c5017..0e3e32666a8 100644 --- a/paddle/fluid/operators/beam_search_decode_op.cc +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/beam_search_decode_op.h" + #include #include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/operators/beam_search_decode_op.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { diff --git a/paddle/fluid/operators/beam_search_decode_op_test.cc b/paddle/fluid/operators/beam_search_decode_op_test.cc index cf32e407424..6f70136b2d2 100644 --- a/paddle/fluid/operators/beam_search_decode_op_test.cc +++ b/paddle/fluid/operators/beam_search_decode_op_test.cc @@ -103,11 +103,9 @@ TEST(BeamSearchDecodeOp, Backtrace) { std::vector{1, 1, 3, 5}, &ids, &scores); paddle::test::GenerateExample( std::vector{0, 2, 4}, - std::vector{0, 0, 0, 2, - 2}, // the branchs of the first source sentence - // are pruned since finished - std::vector{5, 1}, - &ids, &scores); + std::vector{0, 0, 0, 2, 2}, // the branchs of the first source + // sentence are pruned since finished + std::vector{5, 1}, &ids, &scores); ASSERT_EQ(ids.size(), 5UL); ASSERT_EQ(scores.size(), 5UL); diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc index 887d28f5875..90b6359f447 100644 --- a/paddle/fluid/operators/beam_search_op.cc +++ b/paddle/fluid/operators/beam_search_op.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/beam_search_op.cu.cc b/paddle/fluid/operators/beam_search_op.cu.cc index 4ef9476eee5..15aca070221 100644 --- a/paddle/fluid/operators/beam_search_op.cu.cc +++ b/paddle/fluid/operators/beam_search_op.cu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/beam_search_op.h" + #include "paddle/fluid/framework/op_registry.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/beam_search_op_npu.cc b/paddle/fluid/operators/beam_search_op_npu.cc index cae3d0e55fc..f5fa0ac026d 100644 --- a/paddle/fluid/operators/beam_search_op_npu.cc +++ b/paddle/fluid/operators/beam_search_op_npu.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/beam_search_op.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/beam_search_op.h" namespace ops = paddle::operators; REGISTER_OP_NPU_KERNEL( diff --git a/paddle/fluid/operators/benchmark/op_tester.cc b/paddle/fluid/operators/benchmark/op_tester.cc index 4b1593b1f8b..fc01eef8058 100644 --- a/paddle/fluid/operators/benchmark/op_tester.cc +++ b/paddle/fluid/operators/benchmark/op_tester.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/benchmark/op_tester.h" + #include + #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/op_info.h" diff --git a/paddle/fluid/operators/benchmark/op_tester.h b/paddle/fluid/operators/benchmark/op_tester.h index 6acd42c8675..217fbe2653e 100644 --- a/paddle/fluid/operators/benchmark/op_tester.h +++ b/paddle/fluid/operators/benchmark/op_tester.h @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include #include #include + #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/benchmark/op_tester_config.h" diff --git a/paddle/fluid/operators/benchmark/op_tester_config.cc b/paddle/fluid/operators/benchmark/op_tester_config.cc index e9477798858..d7a055ede1b 100644 --- a/paddle/fluid/operators/benchmark/op_tester_config.cc +++ b/paddle/fluid/operators/benchmark/op_tester_config.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/benchmark/op_tester_config.h" + #include + #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/operators/bilateral_slice_op.cc b/paddle/fluid/operators/bilateral_slice_op.cc index 675566504c2..124441093d3 100644 --- a/paddle/fluid/operators/bilateral_slice_op.cc +++ b/paddle/fluid/operators/bilateral_slice_op.cc @@ -10,9 +10,11 @@ limitations under the License. */ #include "paddle/fluid/operators/bilateral_slice_op.h" + #include #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/bilateral_slice_op.cu b/paddle/fluid/operators/bilateral_slice_op.cu index e7bf6d212dc..f20debdf0b8 100644 --- a/paddle/fluid/operators/bilateral_slice_op.cu +++ b/paddle/fluid/operators/bilateral_slice_op.cu @@ -11,6 +11,7 @@ #include #include + #include "paddle/fluid/operators/bilateral_slice_op.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" @@ -167,11 +168,11 @@ class BilateralSliceOpCUDAKernel : public framework::OpKernel { platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), total_count); - BilateralSliceCudaForwardKernel< - T><<>>( - output_data, grid_data, guide_data, input_data, grid_sizes, has_offset, - total_count, output_dims[1]); + BilateralSliceCudaForwardKernel + <<>>( + output_data, grid_data, guide_data, input_data, grid_sizes, + has_offset, total_count, output_dims[1]); } }; @@ -475,29 +476,29 @@ class BilateralSliceGradOpCUDAKernel : public framework::OpKernel { platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), grid_count); - BilateralSliceCudaGridGradKernel< - T><<>>( - grid_grad_data, output_grad_data, guide_data, input_data, grid_sizes, - has_offset, grid_count, output_chans); + BilateralSliceCudaGridGradKernel + <<>>( + grid_grad_data, output_grad_data, guide_data, input_data, + grid_sizes, has_offset, grid_count, output_chans); config = platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), guide_count); - BilateralSliceCudaGuideGradKernel< - T><<>>( - guide_grad_data, output_grad_data, grid_data, guide_data, input_data, - grid_sizes, has_offset, guide_count, output_chans); + BilateralSliceCudaGuideGradKernel + <<>>( + guide_grad_data, output_grad_data, grid_data, guide_data, + input_data, grid_sizes, has_offset, guide_count, output_chans); config = platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), input_count); - BilateralSliceCudaInputGradKernel< - T><<>>( - input_grad_data, output_grad_data, grid_data, guide_data, grid_sizes, - has_offset, input_count, output_chans); + BilateralSliceCudaInputGradKernel + <<>>( + input_grad_data, output_grad_data, grid_data, guide_data, + grid_sizes, has_offset, input_count, output_chans); } }; diff --git a/paddle/fluid/operators/bilateral_slice_op.h 
b/paddle/fluid/operators/bilateral_slice_op.h index a388f4763ec..66783f151ea 100644 --- a/paddle/fluid/operators/bilateral_slice_op.h +++ b/paddle/fluid/operators/bilateral_slice_op.h @@ -12,6 +12,7 @@ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/hostdevice.h" diff --git a/paddle/fluid/operators/bmm_op.cc b/paddle/fluid/operators/bmm_op.cc index 6b5f4755d77..16066c1a13e 100644 --- a/paddle/fluid/operators/bmm_op.cc +++ b/paddle/fluid/operators/bmm_op.cc @@ -13,6 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/operators/bmm_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/bmm_op.h b/paddle/fluid/operators/bmm_op.h index 3fecb55caae..271a74a4444 100644 --- a/paddle/fluid/operators/bmm_op.h +++ b/paddle/fluid/operators/bmm_op.h @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/fluid/operators/bmm_op_xpu.cc b/paddle/fluid/operators/bmm_op_xpu.cc index cc185580279..348f25d46b4 100644 --- a/paddle/fluid/operators/bmm_op_xpu.cc +++ b/paddle/fluid/operators/bmm_op_xpu.cc @@ -16,8 +16,8 @@ #include #include -#include "paddle/fluid/operators/matmul_v2_op.h" +#include "paddle/fluid/operators/matmul_v2_op.h" #include "paddle/fluid/operators/xpu_api_wrapper.h" #include "paddle/fluid/platform/device/device_wrapper.h" diff --git a/paddle/fluid/operators/bpr_loss_op.cc b/paddle/fluid/operators/bpr_loss_op.cc index bbe4bb08adf..afa7aee4450 100644 --- a/paddle/fluid/operators/bpr_loss_op.cc +++ b/paddle/fluid/operators/bpr_loss_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/bpr_loss_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/bpr_loss_op.h b/paddle/fluid/operators/bpr_loss_op.h index 993bc0fccf0..fd6df2c1594 100644 --- a/paddle/fluid/operators/bpr_loss_op.h +++ b/paddle/fluid/operators/bpr_loss_op.h @@ -61,8 +61,9 @@ class BprLossOpKernel : public framework::OpKernel { const int64_t* label_data = labels->data(); for (int i = 0; i < step_size; ++i) { int lbl_pos = label_data[i]; - PADDLE_ENFORCE_GE(lbl_pos, 0, platform::errors::InvalidArgument( - "label data %d is illegal.", lbl_pos)); + PADDLE_ENFORCE_GE(lbl_pos, 0, + platform::errors::InvalidArgument( + "label data %d is illegal.", lbl_pos)); PADDLE_ENFORCE_LT(lbl_pos, class_num, platform::errors::InvalidArgument( "label data %d is illegal.", lbl_pos)); diff --git a/paddle/fluid/operators/broadcast_tensors_op.cc b/paddle/fluid/operators/broadcast_tensors_op.cc index 1063a8b7992..53146417f21 100644 --- a/paddle/fluid/operators/broadcast_tensors_op.cc +++ b/paddle/fluid/operators/broadcast_tensors_op.cc @@ -20,8 +20,8 @@ limitations under the License. */ namespace paddle { namespace operators { -using framework::Tensor; using framework::DDim; +using framework::Tensor; class BroadcastTensorsOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc index 76e0f23df21..f0146994c1f 100644 --- a/paddle/fluid/operators/cast_op.cc +++ b/paddle/fluid/operators/cast_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/cast_op.h" + #include + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/float16.h" diff --git a/paddle/fluid/operators/cast_op.h b/paddle/fluid/operators/cast_op.h index 034cb47fab1..2f222d23e7c 100644 --- a/paddle/fluid/operators/cast_op.h +++ b/paddle/fluid/operators/cast_op.h @@ -17,7 +17,6 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/transform.h" - #include "paddle/phi/api/lib/utils/tensor_utils.h" #include "paddle/phi/kernels/cast_kernel.h" diff --git a/paddle/fluid/operators/cast_op_xpu.cc b/paddle/fluid/operators/cast_op_xpu.cc index 64324d9772b..8551d799cc3 100644 --- a/paddle/fluid/operators/cast_op_xpu.cc +++ b/paddle/fluid/operators/cast_op_xpu.cc @@ -19,9 +19,8 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/cast_op.h" #include "paddle/fluid/platform/float16.h" -#include "xpu/refactor/math.h" - #include "paddle/phi/kernels/cast_kernel.h" +#include "xpu/refactor/math.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/center_loss_op.cc b/paddle/fluid/operators/center_loss_op.cc index cd1aa9d9c84..add0bf966d9 100644 --- a/paddle/fluid/operators/center_loss_op.cc +++ b/paddle/fluid/operators/center_loss_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/center_loss_op.h" + #include #include diff --git a/paddle/fluid/operators/center_loss_op.cu b/paddle/fluid/operators/center_loss_op.cu index 549bb5ae75a..b46feeae64b 100644 --- a/paddle/fluid/operators/center_loss_op.cu +++ b/paddle/fluid/operators/center_loss_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/operators/center_loss_op.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" diff --git a/paddle/fluid/operators/center_loss_op.h b/paddle/fluid/operators/center_loss_op.h index ed266e9ac7d..18769fed37b 100644 --- a/paddle/fluid/operators/center_loss_op.h +++ b/paddle/fluid/operators/center_loss_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/transform.h" diff --git a/paddle/fluid/operators/chunk_eval_op.cc b/paddle/fluid/operators/chunk_eval_op.cc index dfb0ad96b0b..83bdaa2de7d 100644 --- a/paddle/fluid/operators/chunk_eval_op.cc +++ b/paddle/fluid/operators/chunk_eval_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/chunk_eval_op.h" + #include #include @@ -55,11 +56,12 @@ class ChunkEvalOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( (inference_dim.size() == 3 && inference_dim[2] == 1) || inference_dim.size() == 2, - true, platform::errors::InvalidArgument( - "when Input(SeqLength) is provided, Input(Inference) " - "should be of dim 3 (batch_size, bucket, 1) or dim 2 " - "(batch_size, bucket), but received [%s].", - inference_dim)); + true, + platform::errors::InvalidArgument( + "when Input(SeqLength) is provided, Input(Inference) " + "should be of dim 3 (batch_size, bucket, 1) or dim 2 " + "(batch_size, bucket), but received [%s].", + inference_dim)); auto seq_length_dim = ctx->GetInputDim("SeqLength"); PADDLE_ENFORCE_LE(seq_length_dim.size(), 2, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc index 0903c53e5ec..be9829dd43b 100644 --- a/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc +++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/cinn/cinn_instruction_run_op.h" + #include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" #include "paddle/fluid/operators/cinn/cinn_launch_context.h" #include "paddle/fluid/platform/enforce.h" @@ -48,12 +49,12 @@ class CinnInstructionRunOp : public framework::OperatorWithKernel { protected: /* [Why use single type kernel]: - * - * Whether the kernel data type is int, float or other type, - * which has no effect on its execution logic, so directly - * specified a data type here. - * - */ + * + * Whether the kernel data type is int, float or other type, + * which has no effect on its execution logic, so directly + * specified a data type here. + * + */ framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType(framework::proto::VarType::FP32, diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op.cu.cc b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cu.cc index ea72f6c5374..afa350ef116 100644 --- a/paddle/fluid/operators/cinn/cinn_instruction_run_op.cu.cc +++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/cinn/cinn_instruction_run_op.h" + #include "paddle/fluid/framework/op_registry.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op.h b/paddle/fluid/operators/cinn/cinn_instruction_run_op.h index 81c2d23d3f1..13483d78f49 100644 --- a/paddle/fluid/operators/cinn/cinn_instruction_run_op.h +++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op.h @@ -18,6 +18,7 @@ #include #include #include + #include "cinn/hlir/framework/graph_compiler.h" #include "cinn/hlir/framework/instruction.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc b/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc index 68bc3a0eb5c..cbfab3090c0 100644 --- a/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include + #include + #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.cc b/paddle/fluid/operators/cinn/cinn_launch_context.cc index a660d59fb4c..6b70efee86f 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context.cc @@ -13,10 +13,12 @@ // limitations under the License. #include "paddle/fluid/operators/cinn/cinn_launch_context.h" + #include #include #include #include + #include "cinn/hlir/framework/graph_compiler.h" #include "cinn/hlir/framework/instruction.h" #include "cinn/hlir/framework/scope.h" @@ -43,13 +45,13 @@ namespace paddle { namespace operators::details { -using framework::Scope; using framework::LoDTensor; using framework::ParallelExecutor; +using framework::Scope; using CinnInstruction = ::cinn::hlir::framework::Instruction; using CinnRuntimeProgram = ::cinn::hlir::framework::Program; -using framework::paddle2cinn::Name2VarInfoMap; using framework::paddle2cinn::kMemOptVarInfoFromMainGraph; +using framework::paddle2cinn::Name2VarInfoMap; CinnLaunchContext::CinnLaunchContext(const framework::ir::Graph& graph, const CinnCompiledObject& compiled_obj) diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.h b/paddle/fluid/operators/cinn/cinn_launch_context.h index ed5e4383d83..0bbbcc8b031 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.h +++ b/paddle/fluid/operators/cinn/cinn_launch_context.h @@ -20,6 +20,7 @@ #include #include #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/operators/cinn/cinn_launch_context_test.cc b/paddle/fluid/operators/cinn/cinn_launch_context_test.cc index ecbfbf2f92e..cd4465d355f 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context_test.cc @@ -13,9 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/cinn/cinn_launch_context.h" + #include #include #include + #include "cinn/auto_schedule/auto_tuner.h" #include "cinn/common/target.h" #include "cinn/common/type.h" @@ -38,11 +40,11 @@ USE_OP(cinn_instruction_run); namespace paddle { namespace operators::details { +using framework::LoDTensor; using framework::OpDesc; +using framework::ParallelExecutor; using framework::ProgramDesc; -using framework::LoDTensor; using framework::ir::Graph; -using framework::ParallelExecutor; using framework::paddle2cinn::Name2VarInfoMap; using CinnShape = ::cinn::hlir::framework::Shape; using CinnInstruction = ::cinn::hlir::framework::Instruction; diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.cc b/paddle/fluid/operators/cinn/cinn_launch_op.cc index 0a9b66bc92c..3b0198613db 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op.cc @@ -13,8 +13,10 @@ // limitations under the License. 
#include "paddle/fluid/operators/cinn/cinn_launch_op.h" + #include #include + #include "cinn/hlir/framework/graph_compiler.h" #include "cinn/runtime/cinn_runtime.h" #include "cinn/runtime/flags.h" diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc b/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc index 9dfd53834e9..fb5a48ca3d0 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/cinn/cinn_launch_op.h" + #include "paddle/fluid/framework/operator.h" /* see [Why use single type kernel] */ diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.h b/paddle/fluid/operators/cinn/cinn_launch_op.h index f40b788dfb5..62c79faafec 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.h +++ b/paddle/fluid/operators/cinn/cinn_launch_op.h @@ -77,16 +77,16 @@ class CinnLaunchOpKernel : public framework::OpKernel { std::map inputs_name2tensor; std::vector input_x_variable_names; std::vector input_no_need_buffer_variable_names; - auto add_name2tensor_fn = [&inputs_name2tensor]( - const std::vector& variable_names, - const std::vector& tensors) { - std::transform( - variable_names.begin(), variable_names.end(), tensors.begin(), - std::inserter(inputs_name2tensor, inputs_name2tensor.end()), - [](const std::string& name, const LoDTensor* tensor) { - return std::make_pair(name, tensor); - }); - }; + auto add_name2tensor_fn = + [&inputs_name2tensor](const std::vector& variable_names, + const std::vector& tensors) { + std::transform( + variable_names.begin(), variable_names.end(), tensors.begin(), + std::inserter(inputs_name2tensor, inputs_name2tensor.end()), + [](const std::string& name, const LoDTensor* tensor) { + return std::make_pair(name, tensor); + }); + }; auto input_x_tensors = ctx.MultiInput(kX); if (!input_x_tensors.empty()) { diff --git a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc index b0bd043f432..9ed9fad36a3 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc @@ -13,10 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/cinn/cinn_launch_op.h" + #include + #include #include #include + #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/cinn/cinn_op_helper.cc b/paddle/fluid/operators/cinn/cinn_op_helper.cc index 3fb9c822c77..26fee2d9e57 100644 --- a/paddle/fluid/operators/cinn/cinn_op_helper.cc +++ b/paddle/fluid/operators/cinn/cinn_op_helper.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/operators/cinn/cinn_op_helper.h" + #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/operators/cinn/cinn_op_helper.h b/paddle/fluid/operators/cinn/cinn_op_helper.h index e542134b946..55ee3789c0a 100644 --- a/paddle/fluid/operators/cinn/cinn_op_helper.h +++ b/paddle/fluid/operators/cinn/cinn_op_helper.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/operator.h" // We define some common names or utility functions diff --git a/paddle/fluid/operators/cinn/test_helper.h b/paddle/fluid/operators/cinn/test_helper.h index 9720a5309fa..4e06882279b 100644 --- a/paddle/fluid/operators/cinn/test_helper.h +++ b/paddle/fluid/operators/cinn/test_helper.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include + #include "gtest/gtest.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/lod_tensor.h" diff --git a/paddle/fluid/operators/class_center_sample_op.cu b/paddle/fluid/operators/class_center_sample_op.cu index a23cf2815d8..7192b415c27 100644 --- a/paddle/fluid/operators/class_center_sample_op.cu +++ b/paddle/fluid/operators/class_center_sample_op.cu @@ -15,17 +15,20 @@ #ifdef PADDLE_WITH_HIP #include #include + #include typedef hiprandState curandState; namespace cub = hipcub; #else #include #include + #include #endif #include #include + #include "paddle/fluid/operators/class_center_sample_op.h" #include "paddle/phi/api/include/tensor.h" diff --git a/paddle/fluid/operators/class_center_sample_op.h b/paddle/fluid/operators/class_center_sample_op.h index 24ce9ace3bf..8f12e90e185 100644 --- a/paddle/fluid/operators/class_center_sample_op.h +++ b/paddle/fluid/operators/class_center_sample_op.h @@ -16,6 +16,7 @@ #include #include #include + #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h index 8822fffd326..379cd4c6653 100644 --- a/paddle/fluid/operators/clip_by_norm_op.h +++ b/paddle/fluid/operators/clip_by_norm_op.h @@ -114,10 +114,11 @@ class ClipByNormOp : public framework::OperatorWithKernel { "Output(Out) of ClipByNormOp should not be null. " "Please check if it is created correctly.")); auto max_norm = ctx->Attrs().Get("max_norm"); - PADDLE_ENFORCE_GT(max_norm, 0, platform::errors::InvalidArgument( - "max_norm should be greater than 0. " - "Received max_norm is %f.", - max_norm)); + PADDLE_ENFORCE_GT( + max_norm, 0, + platform::errors::InvalidArgument("max_norm should be greater than 0. " + "Received max_norm is %f.", + max_norm)); auto x_dims = ctx->GetInputDim("X"); ctx->SetOutputDim("Out", x_dims); ctx->ShareLoD("X", /*->*/ "Out"); diff --git a/paddle/fluid/operators/clip_by_norm_op_xpu.cc b/paddle/fluid/operators/clip_by_norm_op_xpu.cc index 7c91f06a8d7..62c2608f11c 100644 --- a/paddle/fluid/operators/clip_by_norm_op_xpu.cc +++ b/paddle/fluid/operators/clip_by_norm_op_xpu.cc @@ -13,9 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
 #ifdef PADDLE_WITH_XPU
-#include "paddle/fluid/operators/clip_by_norm_op.h"
 #include
+#include "paddle/fluid/operators/clip_by_norm_op.h"
+
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/clip_op.cc b/paddle/fluid/operators/clip_op.cc
index 6e898d31663..46eb9448d9d 100644
--- a/paddle/fluid/operators/clip_op.cc
+++ b/paddle/fluid/operators/clip_op.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
@@ -179,14 +180,13 @@ REGISTER_OPERATOR(clip_grad, ops::ClipOpGrad, ops::ClipGradInplaceInferer,
                   ops::ClipDoubleGradOpMaker,
                   ops::ClipDoubleGradOpMaker);
-REGISTER_OP_VERSION(clip)
-    .AddCheckpoint(
-        R"ROC(
+REGISTER_OP_VERSION(clip).AddCheckpoint(
+    R"ROC(
 Upgrade clip add a new input [Min])ROC",
-        paddle::framework::compatible::OpVersionDesc()
-            .NewInput("Min",
-                      "Pass the mix, min value as input, not attribute. Min is "
-                      "dispensable.")
-            .NewInput("Max",
-                      "Pass the mix, min value as input, not attribute. Max is "
-                      "dispensable."));
+    paddle::framework::compatible::OpVersionDesc()
+        .NewInput("Min",
+                  "Pass the mix, min value as input, not attribute. Min is "
+                  "dispensable.")
+        .NewInput("Max",
+                  "Pass the mix, min value as input, not attribute. Max is "
+                  "dispensable."));
diff --git a/paddle/fluid/operators/clip_op_xpu.cc b/paddle/fluid/operators/clip_op_xpu.cc
index c5513128372..a99e5d2506f 100644
--- a/paddle/fluid/operators/clip_op_xpu.cc
+++ b/paddle/fluid/operators/clip_op_xpu.cc
@@ -61,10 +61,11 @@ class ClipXPUKernel : public framework::OpKernel {
     auto out_data = reinterpret_cast(out->data());
     int r = xpu::clip_v2(dev_ctx.x_context(), x_data, out_data, x->numel(),
                          min, max);
-    PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
-                                          "XPU API(clip_v2) return wrong "
-                                          "value[%d %s]",
-                                          r, XPUAPIErrorMsg[r]));
+    PADDLE_ENFORCE_EQ(
+        r, XPU_SUCCESS,
+        platform::errors::External("XPU API(clip_v2) return wrong "
+                                   "value[%d %s]",
+                                   r, XPUAPIErrorMsg[r]));
   }
 };
diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc
index aa5a38e4dbf..af15ca2acb7 100644
--- a/paddle/fluid/operators/coalesce_tensor_op.cc
+++ b/paddle/fluid/operators/coalesce_tensor_op.cc
@@ -14,6 +14,7 @@
 #include
 #include
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/framework/operator.h"
@@ -265,11 +266,10 @@ class CoalesceTensorOpKernel : public framework::OpKernel {
             ->ShareDataWith(fused_tensor->Slice(
                 static_cast(offset), static_cast(offset + len)))
             .Resize(dim);
-    len = use_align
-              ? platform::Alignment(len * size_of_dtype, context.GetPlace(),
-                                    align_size) /
-                    size_of_dtype
-              : len;
+    len = use_align ? platform::Alignment(len * size_of_dtype,
+                                          context.GetPlace(), align_size) /
+                          size_of_dtype
+                    : len;
     ss << "output(" << out_var_names[i] << ") dim:(" << dim << ")"
        << " address: " << out_tensors[i]->data() << " len: " << len << ", ";
     offset += len;
@@ -304,12 +304,11 @@ class CoalesceTensorOpKernel : public framework::OpKernel {
         size, 0,
         platform::errors::InvalidArgument(
             "The number of tensor `%s`'s elements is 0.", var_names[i]));
-    auto len =
-        use_align
-            ? platform::Alignment(static_cast(size) * size_of_dtype,
-                                  place, align_size) /
-                  size_of_dtype
-            : static_cast(size);
+    auto len = use_align ?
platform::Alignment( + static_cast(size) * size_of_dtype, + place, align_size) / + size_of_dtype + : static_cast(size); const void *ptr = lod_tensors[i]->IsInitialized() ? lod_tensors[i]->data() : nullptr; VLOG(4) << size << " " << len; diff --git a/paddle/fluid/operators/collective/allreduce_op.cc b/paddle/fluid/operators/collective/allreduce_op.cc index 63b135a74cf..53843104dc5 100644 --- a/paddle/fluid/operators/collective/allreduce_op.cc +++ b/paddle/fluid/operators/collective/allreduce_op.cc @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/collective/allreduce_op.h" + #include // NOLINT #include -#include "paddle/fluid/operators/collective/allreduce_op.h" - namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/barrier_op.h b/paddle/fluid/operators/collective/barrier_op.h index 6df4d24c0ed..88333f36413 100644 --- a/paddle/fluid/operators/collective/barrier_op.h +++ b/paddle/fluid/operators/collective/barrier_op.h @@ -25,6 +25,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_GLOO) #include + #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif diff --git a/paddle/fluid/operators/collective/broadcast_op.cc b/paddle/fluid/operators/collective/broadcast_op.cc index 61e27887b68..071b0350de6 100644 --- a/paddle/fluid/operators/collective/broadcast_op.cc +++ b/paddle/fluid/operators/collective/broadcast_op.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/collective/c_allgather_op.cc b/paddle/fluid/operators/collective/c_allgather_op.cc index c4e779698cc..f20ec75a970 100644 --- a/paddle/fluid/operators/collective/c_allgather_op.cc +++ b/paddle/fluid/operators/collective/c_allgather_op.cc @@ -26,8 +26,9 @@ class CAllGatherOp : public framework::OperatorWithKernel { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "AllGather"); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Input", "Out", "AllGather"); int nranks = ctx->Attrs().Get("nranks"); - PADDLE_ENFORCE_GE(nranks, 2, platform::errors::InvalidArgument( - "The value of nranks should be >=2.")); + PADDLE_ENFORCE_GE(nranks, 2, + platform::errors::InvalidArgument( + "The value of nranks should be >=2.")); framework::DDim dim = ctx->GetInputDim("X"); dim[0] = dim[0] * nranks; if (dim[0] < 0) dim[0] = -1; diff --git a/paddle/fluid/operators/collective/c_allgather_op.h b/paddle/fluid/operators/collective/c_allgather_op.h index aa2040a2693..7f8c7b2f50e 100644 --- a/paddle/fluid/operators/collective/c_allgather_op.h +++ b/paddle/fluid/operators/collective/c_allgather_op.h @@ -25,6 +25,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_GLOO) #include + #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu.cc b/paddle/fluid/operators/collective/c_allgather_op_npu.cc index 5339293da0f..f9ffdea7908 100644 --- a/paddle/fluid/operators/collective/c_allgather_op_npu.cc +++ b/paddle/fluid/operators/collective/c_allgather_op_npu.cc @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/collective/c_allgather_op.h" - #include +#include "paddle/fluid/operators/collective/c_allgather_op.h" + #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/npu/hccl_helper.h" diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc index 7206dd01bca..087f6b879c3 100644 --- a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc @@ -17,23 +17,22 @@ limitations under the License. */ #endif #include + #include #include // NOLINT #include #include "gtest/gtest.h" - #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - #include "paddle/fluid/operators/collective/c_allgather_op.h" #include "paddle/fluid/operators/collective/c_allreduce_op.h" #include "paddle/fluid/operators/collective/c_broadcast_op.h" #include "paddle/fluid/operators/collective/c_reducescatter_op.h" #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc index 0946ad8aca6..5c2d6981bad 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc @@ -17,23 +17,22 @@ limitations under the License. */ #endif #include + #include #include // NOLINT #include #include "gtest/gtest.h" - #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - #include "paddle/fluid/operators/collective/c_allgather_op.h" #include "paddle/fluid/operators/collective/c_allreduce_op.h" #include "paddle/fluid/operators/collective/c_broadcast_op.h" #include "paddle/fluid/operators/collective/c_reducescatter_op.h" #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index 404f7c017ac..61cf4cf5b7f 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -41,6 +41,7 @@ limitations under the License. 
*/ #if defined(PADDLE_WITH_GLOO) #include + #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif @@ -335,10 +336,11 @@ class CAllReduceOpXPUKernel : public framework::OpKernel { "Invalid reduce type: %d", red_type)); } - PADDLE_ENFORCE_EQ(bkcl_all_reduce(comm->comm(), sendbuff, recvbuff, numel, - dtype, bkcl_red_type, stream), - BKCL_SUCCESS, platform::errors::PreconditionNotMet( - "BKCL all reduce failed")); + PADDLE_ENFORCE_EQ( + bkcl_all_reduce(comm->comm(), sendbuff, recvbuff, numel, dtype, + bkcl_red_type, stream), + BKCL_SUCCESS, + platform::errors::PreconditionNotMet("BKCL all reduce failed")); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should be compiled with XPU.")); diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc index 61e5f279034..4c76d094baf 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc @@ -17,20 +17,19 @@ limitations under the License. */ #endif #include + #include #include // NOLINT #include #include "gtest/gtest.h" - #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - #include "paddle/fluid/operators/collective/c_allreduce_op.h" #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" diff --git a/paddle/fluid/operators/collective/c_broadcast_op.h b/paddle/fluid/operators/collective/c_broadcast_op.h index eb4acb9a369..394ea45efbb 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op.h +++ b/paddle/fluid/operators/collective/c_broadcast_op.h @@ -24,6 +24,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_GLOO) #include + #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc index cf4d6a28744..e383e78c5dd 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc @@ -17,20 +17,19 @@ limitations under the License. */ #endif #include + #include #include // NOLINT #include #include "gtest/gtest.h" - #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - #include "paddle/fluid/operators/collective/c_broadcast_op.h" #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" diff --git a/paddle/fluid/operators/collective/c_comm_init_all_op.cc b/paddle/fluid/operators/collective/c_comm_init_all_op.cc index ce2da1f22f1..c9605f4d1b2 100644 --- a/paddle/fluid/operators/collective/c_comm_init_all_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_all_op.cc @@ -15,7 +15,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" - #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/platform/collective_helper.h" @@ -53,9 +52,9 @@ class CCommInitAllOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& place) const override { -// PADDLE_ENFORCE_EQ(platform::is_gpu_place(place), true, -// platform::errors::PreconditionNotMet( -// "CCommInitAllOp can run on gpu place only")); + // PADDLE_ENFORCE_EQ(platform::is_gpu_place(place), true, + // platform::errors::PreconditionNotMet( + // "CCommInitAllOp can run on gpu place only")); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) std::vector devices = Attr>("devices"); diff --git a/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc b/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc index 86c966378cc..3ea24f6e654 100644 --- a/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include #endif #include + #include #include diff --git a/paddle/fluid/operators/collective/c_comm_init_op.cc b/paddle/fluid/operators/collective/c_comm_init_op.cc index 490747520d6..a41d4293c90 100644 --- a/paddle/fluid/operators/collective/c_comm_init_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_op.cc @@ -71,8 +71,9 @@ class CCommInitOp : public framework::OperatorBase { PADDLE_ENFORCE_EQ( platform::is_gpu_place(place) || platform::is_xpu_place(place) || platform::is_mlu_place(place), - true, platform::errors::PreconditionNotMet( - "CCommInitOp can run on gpu or xpu or mlu place only.")); + true, + platform::errors::PreconditionNotMet( + "CCommInitOp can run on gpu or xpu or mlu place only.")); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_CNCL) diff --git a/paddle/fluid/operators/collective/c_concat_op.cc b/paddle/fluid/operators/collective/c_concat_op.cc index 551fde21162..155db23a039 100644 --- a/paddle/fluid/operators/collective/c_concat_op.cc +++ b/paddle/fluid/operators/collective/c_concat_op.cc @@ -27,17 +27,19 @@ class CConcatOp : public framework::OperatorWithKernel { int nranks = ctx->Attrs().Get("nranks"); int rank = ctx->Attrs().Get("rank"); int ring_id = ctx->Attrs().Get("ring_id"); - PADDLE_ENFORCE_GE(nranks, 2, platform::errors::InvalidArgument( - "The number of ranks (%d) for c_concat " - "must be greater than 1.", - nranks)); + PADDLE_ENFORCE_GE(nranks, 2, + platform::errors::InvalidArgument( + "The number of ranks (%d) for c_concat " + "must be greater than 1.", + nranks)); PADDLE_ENFORCE_GE( ring_id, 0, platform::errors::InvalidArgument( "The ring_id (%d) for c_concat must be non-negative.", ring_id)); PADDLE_ENFORCE_GE( - rank, 0, platform::errors::InvalidArgument( - "The rank (%d) for c_concat must be non-negative.", rank)); + rank, 0, + platform::errors::InvalidArgument( + "The rank (%d) for c_concat must be non-negative.", rank)); PADDLE_ENFORCE_LT(rank, nranks, platform::errors::InvalidArgument( "The value of rank (%d) for c_concat must " diff --git a/paddle/fluid/operators/collective/c_concat_op.cu.cc b/paddle/fluid/operators/collective/c_concat_op.cu.cc index d3d9db0e5f8..98df6c8688e 100644 --- a/paddle/fluid/operators/collective/c_concat_op.cu.cc +++ b/paddle/fluid/operators/collective/c_concat_op.cu.cc @@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF 
ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/collective/c_concat_op.h" + #include -#include "paddle/fluid/operators/collective/c_concat_op.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/phi/api/include/tensor.h" diff --git a/paddle/fluid/operators/collective/c_gen_bkcl_id_op.cc b/paddle/fluid/operators/collective/c_gen_bkcl_id_op.cc index ec174ad0e56..3bd7e3ceffa 100644 --- a/paddle/fluid/operators/collective/c_gen_bkcl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_bkcl_id_op.cc @@ -21,9 +21,8 @@ limitations under the License. */ #include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" - #include "paddle/fluid/platform/gen_comm_id_helper.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/c_gen_cncl_id_op.cc b/paddle/fluid/operators/collective/c_gen_cncl_id_op.cc index 7e65fba5718..d2e85171a4a 100644 --- a/paddle/fluid/operators/collective/c_gen_cncl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_cncl_id_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include + #include #include "paddle/fluid/framework/op_proto_maker.h" @@ -21,9 +22,8 @@ limitations under the License. */ #include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" - #include "paddle/fluid/platform/gen_comm_id_helper.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc index 6eec3853880..3f81eab7bc2 100644 --- a/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc @@ -19,12 +19,11 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/platform/device/npu/dynload/hccl.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" - -#include "paddle/fluid/platform/device/npu/dynload/hccl.h" #include "paddle/fluid/platform/gen_comm_id_helper.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc index d392beb3a48..d4f1fe1c182 100644 --- a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc @@ -20,9 +20,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" - #include "paddle/fluid/platform/gen_comm_id_helper.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/c_reduce_op.h b/paddle/fluid/operators/collective/c_reduce_op.h index 4e9edb53730..5399a4aacbe 100644 --- a/paddle/fluid/operators/collective/c_reduce_op.h +++ b/paddle/fluid/operators/collective/c_reduce_op.h @@ -40,6 +40,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_GLOO) #include + #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif @@ -261,10 +262,11 @@ class CReduceOpXPUKernel : public framework::OpKernel { "Invalid reduce type: %d", red_type)); } - PADDLE_ENFORCE_EQ(bkcl_reduce(comm->comm(), sendbuff, recvbuff, numel, - dtype, bkcl_red_type, root, stream), - BKCL_SUCCESS, platform::errors::PreconditionNotMet( - "BKCL all reduce failed")); + PADDLE_ENFORCE_EQ( + bkcl_reduce(comm->comm(), sendbuff, recvbuff, numel, dtype, + bkcl_red_type, root, stream), + BKCL_SUCCESS, + platform::errors::PreconditionNotMet("BKCL all reduce failed")); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should be compiled with XPU.")); @@ -319,9 +321,10 @@ class CReduceOpCUDAKernel : public framework::OpKernel { break; default: - PADDLE_ENFORCE_EQ(true, false, platform::errors::InvalidArgument( - "red_type must be one of kRedSum, " - "kRedMax, kRedMin, kRedProd.")); + PADDLE_ENFORCE_EQ(true, false, + platform::errors::InvalidArgument( + "red_type must be one of kRedSum, " + "kRedMax, kRedMin, kRedProd.")); } PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce( diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc index c4e410d04da..3bd55ea3704 100644 --- a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc @@ -17,20 +17,19 @@ limitations under the License. */ #endif #include + #include #include // NOLINT #include #include "gtest/gtest.h" - #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - #include "paddle/fluid/operators/collective/c_reduce_op.h" #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc index 8b498787c69..16437d4769e 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc @@ -17,23 +17,22 @@ limitations under the License. 
*/ #endif #include + #include #include // NOLINT #include #include "gtest/gtest.h" - #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - #include "paddle/fluid/operators/collective/c_allgather_op.h" #include "paddle/fluid/operators/collective/c_allreduce_op.h" #include "paddle/fluid/operators/collective/c_broadcast_op.h" #include "paddle/fluid/operators/collective/c_reducescatter_op.h" #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" diff --git a/paddle/fluid/operators/collective/c_scatter_op.h b/paddle/fluid/operators/collective/c_scatter_op.h index 71a5f488ebc..ee07d7663b2 100644 --- a/paddle/fluid/operators/collective/c_scatter_op.h +++ b/paddle/fluid/operators/collective/c_scatter_op.h @@ -24,6 +24,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_GLOO) #include + #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu index 4c9fb148424..71216538a4e 100644 --- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu @@ -373,15 +373,15 @@ class CSoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { const int end_index = start_index + D; if (label_type == framework::proto::VarType::INT32) { - MaskLabelByIndexGrad<<>>( - logit_grad_2d.data(), loss_grad->data(), - labels->data(), start_index, end_index, N, D); + MaskLabelByIndexGrad + <<>>( + logit_grad_2d.data(), loss_grad->data(), + labels->data(), start_index, end_index, N, D); } else if (label_type == framework::proto::VarType::INT64) { - MaskLabelByIndexGrad<<>>( - logit_grad_2d.data(), loss_grad->data(), - labels->data(), start_index, end_index, N, D); + MaskLabelByIndexGrad + <<>>( + logit_grad_2d.data(), loss_grad->data(), + labels->data(), start_index, end_index, N, D); } } }; diff --git a/paddle/fluid/operators/collective/c_split_op.cc b/paddle/fluid/operators/collective/c_split_op.cc index 37ec989f3f9..32f3ff9eab1 100644 --- a/paddle/fluid/operators/collective/c_split_op.cc +++ b/paddle/fluid/operators/collective/c_split_op.cc @@ -27,17 +27,19 @@ class CSplitOp : public framework::OperatorWithKernel { int nranks = ctx->Attrs().Get("nranks"); int rank = ctx->Attrs().Get("rank"); int ring_id = ctx->Attrs().Get("ring_id"); - PADDLE_ENFORCE_GE(nranks, 2, platform::errors::InvalidArgument( - "The number of ranks (%d) for c_split " - "must be greater than 1.", - nranks)); + PADDLE_ENFORCE_GE(nranks, 2, + platform::errors::InvalidArgument( + "The number of ranks (%d) for c_split " + "must be greater than 1.", + nranks)); PADDLE_ENFORCE_GE( ring_id, 0, platform::errors::InvalidArgument( "The ring_id (%d) for c_split must be non-negative.", ring_id)); PADDLE_ENFORCE_GE( - rank, 0, platform::errors::InvalidArgument( - "The rank (%d) for c_split must be non-negative.", rank)); + rank, 0, + platform::errors::InvalidArgument( + "The rank (%d) for c_split must be non-negative.", rank)); PADDLE_ENFORCE_LT(rank, nranks, platform::errors::InvalidArgument( "The value of rank (%d) for c_split 
must " diff --git a/paddle/fluid/operators/collective/c_split_op.cu b/paddle/fluid/operators/collective/c_split_op.cu index a0c4182468f..1dce4ce04b5 100644 --- a/paddle/fluid/operators/collective/c_split_op.cu +++ b/paddle/fluid/operators/collective/c_split_op.cu @@ -59,10 +59,11 @@ class CSplitOpCUDAKernel : public framework::OpKernel { int rank = ctx.Attr("rank"); auto place = ctx.GetPlace(); - PADDLE_ENFORCE_GE(rank, 0, platform::errors::PreconditionNotMet( - "The value of rank (%d) for c_split must be " - "greater than or equal to 0.", - rank)); + PADDLE_ENFORCE_GE(rank, 0, + platform::errors::PreconditionNotMet( + "The value of rank (%d) for c_split must be " + "greater than or equal to 0.", + rank)); PADDLE_ENFORCE_GE(nranks, 2, platform::errors::PreconditionNotMet( "The value of nranks (%d) for c_split must be " diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc index 133085ad3f3..91b89486c6a 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc @@ -26,11 +26,10 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - #include "paddle/fluid/operators/collective/c_broadcast_op.h" #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" diff --git a/paddle/fluid/operators/collective/checknumeric_npu_test.cc b/paddle/fluid/operators/collective/checknumeric_npu_test.cc index 36c6f4fadd0..b99ac381635 100644 --- a/paddle/fluid/operators/collective/checknumeric_npu_test.cc +++ b/paddle/fluid/operators/collective/checknumeric_npu_test.cc @@ -17,21 +17,20 @@ limitations under the License. */ #endif #include + #include #include #include // NOLINT #include #include "gtest/gtest.h" - #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - #include "paddle/fluid/operators/collective/c_allreduce_op.h" #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" diff --git a/paddle/fluid/operators/collective/gen_bkcl_id_op.cc b/paddle/fluid/operators/collective/gen_bkcl_id_op.cc index 1ce89383568..f60030cec76 100644 --- a/paddle/fluid/operators/collective/gen_bkcl_id_op.cc +++ b/paddle/fluid/operators/collective/gen_bkcl_id_op.cc @@ -24,11 +24,10 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/gen_comm_id_helper.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/split.h" -#include "paddle/fluid/platform/gen_comm_id_helper.h" - namespace paddle { namespace operators { @@ -69,9 +68,10 @@ class GenBKCLIdOp : public framework::OperatorBase { int trainer_id = Attr("trainer_id"); std::string endpoint = trainers[trainer_id]; - PADDLE_ENFORCE_GE(trainer_id, 0, platform::errors::InvalidArgument( - "trainer_id %d is less than 0. Its " - "valid range is [0, trainer_size)")); + PADDLE_ENFORCE_GE( + trainer_id, 0, + platform::errors::InvalidArgument("trainer_id %d is less than 0. Its " + "valid range is [0, trainer_size)")); PADDLE_ENFORCE_LT( trainer_id, static_cast(trainers.size()), platform::errors::OutOfRange("trainer_id %d is out of range. Its valid " diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op.cc b/paddle/fluid/operators/collective/gen_hccl_id_op.cc index 3d78082f12f..e0809459be1 100644 --- a/paddle/fluid/operators/collective/gen_hccl_id_op.cc +++ b/paddle/fluid/operators/collective/gen_hccl_id_op.cc @@ -21,14 +21,13 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" #include "paddle/fluid/platform/device/npu/hccl_helper.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/split.h" -#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" - namespace paddle { namespace operators { @@ -48,9 +47,10 @@ class GenHCCLIdOp : public framework::OperatorBase { int trainer_id = Attr("trainer_id"); std::string endpoint = trainers[trainer_id]; - PADDLE_ENFORCE_GE(trainer_id, 0, platform::errors::InvalidArgument( - "trainer_id %d is less than 0. Its " - "valid range is [0, trainer_size)")); + PADDLE_ENFORCE_GE( + trainer_id, 0, + platform::errors::InvalidArgument("trainer_id %d is less than 0. Its " + "valid range is [0, trainer_size)")); PADDLE_ENFORCE_LT( trainer_id, static_cast(trainers.size()), platform::errors::OutOfRange("trainer_id %d is out of range. Its valid " diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc index ad50ac36750..ba573509bd1 100644 --- a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc +++ b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + #include #include #include diff --git a/paddle/fluid/operators/collective/gen_nccl_id_op.cc b/paddle/fluid/operators/collective/gen_nccl_id_op.cc index 7a5b6b5f429..1e23f38c13a 100644 --- a/paddle/fluid/operators/collective/gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/gen_nccl_id_op.cc @@ -70,9 +70,10 @@ class GenNCCLIdOp : public framework::OperatorBase { int trainer_id = Attr("trainer_id"); std::string endpoint = trainers[trainer_id]; - PADDLE_ENFORCE_GE(trainer_id, 0, platform::errors::InvalidArgument( - "trainer_id %d is less than 0. 
Its " - "valid range is [0, trainer_size)")); + PADDLE_ENFORCE_GE( + trainer_id, 0, + platform::errors::InvalidArgument("trainer_id %d is less than 0. Its " + "valid range is [0, trainer_size)")); PADDLE_ENFORCE_LT( trainer_id, static_cast(trainers.size()), platform::errors::OutOfRange("trainer_id %d is out of range. Its valid " diff --git a/paddle/fluid/operators/collective/partial_allgather_op.cc b/paddle/fluid/operators/collective/partial_allgather_op.cc index bef2ff94d63..6783d2f0b45 100644 --- a/paddle/fluid/operators/collective/partial_allgather_op.cc +++ b/paddle/fluid/operators/collective/partial_allgather_op.cc @@ -26,8 +26,9 @@ class PartialAllGatherOp : public framework::OperatorWithKernel { int nranks = ctx->Attrs().Get("nranks"); int rank = ctx->Attrs().Get("rank"); - PADDLE_ENFORCE_GE(nranks, 2, platform::errors::InvalidArgument( - "The value of nranks should be >=2.")); + PADDLE_ENFORCE_GE(nranks, 2, + platform::errors::InvalidArgument( + "The value of nranks should be >=2.")); PADDLE_ENFORCE_EQ( (rank >= 0 && rank < nranks), true, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/collective/partial_allgather_op_npu.cc b/paddle/fluid/operators/collective/partial_allgather_op_npu.cc index 0314bb7d5de..c727161d101 100644 --- a/paddle/fluid/operators/collective/partial_allgather_op_npu.cc +++ b/paddle/fluid/operators/collective/partial_allgather_op_npu.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/collective/partial_allgather_op.h" #include +#include "paddle/fluid/operators/collective/partial_allgather_op.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/npu/hccl_helper.h" diff --git a/paddle/fluid/operators/collective/partial_recv_op.cc b/paddle/fluid/operators/collective/partial_recv_op.cc index 99b2169180c..df59f49cb3a 100644 --- a/paddle/fluid/operators/collective/partial_recv_op.cc +++ b/paddle/fluid/operators/collective/partial_recv_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/collective/partial_recv_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/collective/partial_recv_op_npu.cc b/paddle/fluid/operators/collective/partial_recv_op_npu.cc index f14ce5f81f9..4704ab7683c 100644 --- a/paddle/fluid/operators/collective/partial_recv_op_npu.cc +++ b/paddle/fluid/operators/collective/partial_recv_op_npu.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/collective/partial_recv_op.h" - #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/npu/hccl_helper.h" @@ -55,8 +54,9 @@ class PartialRecvOpASCENDKernel : public framework::OpKernel { int nranks = comm->nranks(); int peer = ctx.Attr("peer"); - PADDLE_ENFORCE_EQ(nranks, 2, platform::errors::InvalidArgument( - "The nranks must be 2, but (%d)", nranks)); + PADDLE_ENFORCE_EQ(nranks, 2, + platform::errors::InvalidArgument( + "The nranks must be 2, but (%d)", nranks)); int root = peer; diff --git a/paddle/fluid/operators/collective/partial_send_op_npu.cc b/paddle/fluid/operators/collective/partial_send_op_npu.cc index 31c74fcc196..8f53bd8fc5f 100644 --- a/paddle/fluid/operators/collective/partial_send_op_npu.cc +++ b/paddle/fluid/operators/collective/partial_send_op_npu.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/collective/send_v2_op.h" - #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/npu/hccl_helper.h" @@ -52,8 +51,9 @@ class PartialSendOpASCENDKernel : public framework::OpKernel { int nranks = comm->nranks(); int rank = comm->rank(); - PADDLE_ENFORCE_EQ(nranks, 2, platform::errors::InvalidArgument( - "The nranks must be 2, but (%d)", nranks)); + PADDLE_ENFORCE_EQ(nranks, 2, + platform::errors::InvalidArgument( + "The nranks must be 2, but (%d)", nranks)); int root = rank; diff --git a/paddle/fluid/operators/collective/recv_v2_op.cc b/paddle/fluid/operators/collective/recv_v2_op.cc index 494665544f0..15da47e713b 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.cc +++ b/paddle/fluid/operators/collective/recv_v2_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/collective/recv_v2_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu.cc b/paddle/fluid/operators/collective/recv_v2_op_npu.cc index c31f1210f04..9aa1ab78869 100644 --- a/paddle/fluid/operators/collective/recv_v2_op_npu.cc +++ b/paddle/fluid/operators/collective/recv_v2_op_npu.cc @@ -61,8 +61,9 @@ class CRecvOpASCENDKernel : public framework::OpKernel { int nranks = comm->nranks(); int peer = ctx.Attr("peer"); - PADDLE_ENFORCE_EQ(nranks, 2, platform::errors::InvalidArgument( - "The nranks must be 2, but (%d)", nranks)); + PADDLE_ENFORCE_EQ(nranks, 2, + platform::errors::InvalidArgument( + "The nranks must be 2, but (%d)", nranks)); int root = peer; diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc index 6e02d362156..0022b6bf39d 100644 --- a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc +++ b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc @@ -17,20 +17,19 @@ limitations under the License. 
*/ #endif #include + #include #include // NOLINT #include #include "gtest/gtest.h" - #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" #include "paddle/fluid/operators/collective/recv_v2_op.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" diff --git a/paddle/fluid/operators/collective/send_v2_op_npu.cc b/paddle/fluid/operators/collective/send_v2_op_npu.cc index 882630467a0..ee34026cb28 100644 --- a/paddle/fluid/operators/collective/send_v2_op_npu.cc +++ b/paddle/fluid/operators/collective/send_v2_op_npu.cc @@ -60,8 +60,9 @@ class CSendOpASCENDKernel : public framework::OpKernel { int nranks = comm->nranks(); int rank = comm->rank(); - PADDLE_ENFORCE_EQ(nranks, 2, platform::errors::InvalidArgument( - "The nranks must be 2, but (%d)", nranks)); + PADDLE_ENFORCE_EQ(nranks, 2, + platform::errors::InvalidArgument( + "The nranks must be 2, but (%d)", nranks)); int root = rank; diff --git a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc index 57e3dd53cc7..9784e6ddc15 100644 --- a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc +++ b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc @@ -17,19 +17,19 @@ limitations under the License. */ #endif #include + #include #include // NOLINT #include -#include "gtest/gtest.h" +#include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" #include "paddle/fluid/operators/collective/send_v2_op.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" diff --git a/paddle/fluid/operators/common_infer_shape_functions.cc b/paddle/fluid/operators/common_infer_shape_functions.cc index 1d187451c68..8bd60c77c46 100644 --- a/paddle/fluid/operators/common_infer_shape_functions.cc +++ b/paddle/fluid/operators/common_infer_shape_functions.cc @@ -61,12 +61,13 @@ inline void GetBroadcastDimsArrays(const framework::DDim &x_dims, PADDLE_ENFORCE_EQ( x_dims_array[i] == y_dims_array[i] || x_dims_array[i] <= 1 || y_dims_array[i] <= 1, - true, platform::errors::InvalidArgument( - "Broadcast dimension mismatch. Operands could " - "not be broadcast together with the shape of X = [%s] and " - "the shape of Y = [%s]. Received [%d] in X is not equal to " - "[%d] in Y at i:%d.", - x_dims, y_dims, x_dims_array[i], y_dims_array[i], i)); + true, + platform::errors::InvalidArgument( + "Broadcast dimension mismatch. Operands could " + "not be broadcast together with the shape of X = [%s] and " + "the shape of Y = [%s]. 
Received [%d] in X is not equal to " + "[%d] in Y at i:%d.", + x_dims, y_dims, x_dims_array[i], y_dims_array[i], i)); if ((x_dims_array[i] > 1 || y_dims_array[i] > 1) || (x_dims_array[i] == 1 && y_dims_array[i] == 1)) { out_dims_array[i] = std::max(x_dims_array[i], y_dims_array[i]); diff --git a/paddle/fluid/operators/complex_op.cc b/paddle/fluid/operators/complex_op.cc index 7241c92258e..d358f5765f9 100644 --- a/paddle/fluid/operators/complex_op.cc +++ b/paddle/fluid/operators/complex_op.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/complex_op.h" #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" diff --git a/paddle/fluid/operators/complex_view_op.cc b/paddle/fluid/operators/complex_view_op.cc index 763f936ec9c..92b48fe8b06 100644 --- a/paddle/fluid/operators/complex_view_op.cc +++ b/paddle/fluid/operators/complex_view_op.cc @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/operators/complex_view_op.cu b/paddle/fluid/operators/complex_view_op.cu index 261881cb8d2..b62c0470dd6 100644 --- a/paddle/fluid/operators/complex_view_op.cu +++ b/paddle/fluid/operators/complex_view_op.cu @@ -12,9 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/complex_view_op.h" - #include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/operators/complex_view_op.h" #include "paddle/fluid/platform/enforce.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index a467f2dbee7..599fbcce39f 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -15,11 +15,12 @@ limitations under the License. */ #include "paddle/fluid/operators/concat_op.h" #include + #include #include #include -#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/phi/infermeta/multiary.h" #include "paddle/phi/kernels/funcs/concat_funcs.h" diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h index 50aca54c12d..746e0e7a056 100644 --- a/paddle/fluid/operators/concat_op.h +++ b/paddle/fluid/operators/concat_op.h @@ -17,11 +17,11 @@ limitations under the License. 
*/ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/operators/utils.h" - #include "paddle/phi/kernels/concat_kernel.h" #include "paddle/phi/kernels/funcs/concat_funcs.h" diff --git a/paddle/fluid/operators/concat_op_mlu.cc b/paddle/fluid/operators/concat_op_mlu.cc index e8f6b2dc869..3d927af96e1 100644 --- a/paddle/fluid/operators/concat_op_mlu.cc +++ b/paddle/fluid/operators/concat_op_mlu.cc @@ -99,10 +99,11 @@ class ConcatGradMLUKernel : public framework::OpKernel { axis = ComputeAxis(static_cast(axis), static_cast(ins[0]->dims().size())); - PADDLE_ENFORCE_GE(axis, 0, platform::errors::InvalidArgument( - "concat_grad: axis should be larger than or " - "equal to 0, but received axis is %d.", - axis)); + PADDLE_ENFORCE_GE(axis, 0, + platform::errors::InvalidArgument( + "concat_grad: axis should be larger than or " + "equal to 0, but received axis is %d.", + axis)); PADDLE_ENFORCE_LT( axis, out_grad->dims().size(), platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/concat_op_xpu.cc b/paddle/fluid/operators/concat_op_xpu.cc index ba35098bbac..fcbfc6f7a2b 100644 --- a/paddle/fluid/operators/concat_op_xpu.cc +++ b/paddle/fluid/operators/concat_op_xpu.cc @@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/concat_op.h" #include #include #include -#include "paddle/fluid/platform/device/xpu/xpu_header.h" +#include "paddle/fluid/operators/concat_op.h" +#include "paddle/fluid/platform/device/xpu/xpu_header.h" #include "paddle/phi/core/lod_utils.h" namespace paddle { @@ -33,17 +33,19 @@ class ConcatXPUKernel : public framework::OpKernel { auto ins = ctx.MultiInput("X"); framework::LoDTensor* out = ctx.Output("Out"); int axis = ctx.Attr("axis"); - PADDLE_ENFORCE_NE(ins[0], nullptr, platform::errors::InvalidArgument( - "The input should not be null.")); + PADDLE_ENFORCE_NE( + ins[0], nullptr, + platform::errors::InvalidArgument("The input should not be null.")); PADDLE_ENFORCE_NE(ctx.HasInput("AxisTensor"), true, platform::errors::InvalidArgument( "XPU donot surpport AxisTensor for now")); axis = ComputeAxis(static_cast(axis), static_cast(ins[0]->dims().size())); - PADDLE_ENFORCE_GE(axis, 0, platform::errors::InvalidArgument( - "concat: axis should be larger than or " - "equal to 0, but received axis is %d.", - axis)); + PADDLE_ENFORCE_GE(axis, 0, + platform::errors::InvalidArgument( + "concat: axis should be larger than or " + "equal to 0, but received axis is %d.", + axis)); PADDLE_ENFORCE_LT(axis, ins[0]->dims().size(), platform::errors::InvalidArgument( "concat: axis should be less than ins[0]->dims()!" 
@@ -94,8 +96,9 @@ class ConcatXPUKernel : public framework::OpKernel { } } - PADDLE_ENFORCE_GT(xdims_list.size(), 0, platform::errors::InvalidArgument( - "No tensor need concat")); + PADDLE_ENFORCE_GT( + xdims_list.size(), 0, + platform::errors::InvalidArgument("No tensor need concat")); auto& dev_ctx = ctx.template device_context(); int r = xpu::concat(dev_ctx.x_context(), ptrs, @@ -129,8 +132,9 @@ class ConcatGradXPUKernel : public framework::OpKernel { } } } - PADDLE_ENFORCE_NE(ins[0], nullptr, platform::errors::InvalidArgument( - "The input should not be null.")); + PADDLE_ENFORCE_NE( + ins[0], nullptr, + platform::errors::InvalidArgument("The input should not be null.")); auto axis = ctx.Attr("axis"); if (ctx.HasInput("AxisTensor")) { auto* axis_tensor = ctx.Input("AxisTensor"); @@ -149,10 +153,11 @@ class ConcatGradXPUKernel : public framework::OpKernel { ptrs[j] = nullptr; } } - PADDLE_ENFORCE_GE(axis, 0, platform::errors::InvalidArgument( - "concat_grad: axis should be larger than or " - "equal to 0, but received axis is %d.", - axis)); + PADDLE_ENFORCE_GE(axis, 0, + platform::errors::InvalidArgument( + "concat_grad: axis should be larger than or " + "equal to 0, but received axis is %d.", + axis)); PADDLE_ENFORCE_LT( axis, out_grad->dims().size(), platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/conj_op.cc b/paddle/fluid/operators/conj_op.cc index cbec1182f20..0c294b60482 100644 --- a/paddle/fluid/operators/conj_op.cc +++ b/paddle/fluid/operators/conj_op.cc @@ -74,8 +74,9 @@ REGISTER_OPERATOR(conj, ops::ConjOp, ops::ConjOpMaker, ConjInferShapeFunctor); REGISTER_OP_CPU_KERNEL( - conj, ops::ConjKernel>, + conj, + ops::ConjKernel>, ops::ConjKernel>, ops::ConjKernel, diff --git a/paddle/fluid/operators/conj_op.cu b/paddle/fluid/operators/conj_op.cu index d04024d70a8..548508636ca 100644 --- a/paddle/fluid/operators/conj_op.cu +++ b/paddle/fluid/operators/conj_op.cu @@ -17,8 +17,9 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( - conj, ops::ConjKernel>, + conj, + ops::ConjKernel>, ops::ConjKernel>, ops::ConjKernel, diff --git a/paddle/fluid/operators/controlflow/bitwise_op.cc b/paddle/fluid/operators/controlflow/bitwise_op.cc index 4dcbbc8568f..19865f9a9fb 100644 --- a/paddle/fluid/operators/controlflow/bitwise_op.cc +++ b/paddle/fluid/operators/controlflow/bitwise_op.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" diff --git a/paddle/fluid/operators/controlflow/compare_op.cc b/paddle/fluid/operators/controlflow/compare_op.cc index 72d81d8c3fd..21fc69eb019 100644 --- a/paddle/fluid/operators/controlflow/compare_op.cc +++ b/paddle/fluid/operators/controlflow/compare_op.cc @@ -80,14 +80,12 @@ class CompareOp : public framework::OperatorWithKernel { } // namespace operators } // namespace paddle -#define REGISTER_COMPARE_OP_VERSION(op_type) \ - REGISTER_OP_VERSION(op_type) \ - .AddCheckpoint( \ - R"ROC(Upgrade compare ops, add a new attribute [force_cpu])ROC", \ - paddle::framework::compatible::OpVersionDesc().ModifyAttr( \ - "force_cpu", \ - "In order to force fill output variable to gpu memory.", \ - false)); +#define REGISTER_COMPARE_OP_VERSION(op_type) \ + REGISTER_OP_VERSION(op_type).AddCheckpoint( \ + R"ROC(Upgrade compare ops, add a new attribute [force_cpu])ROC", \ + paddle::framework::compatible::OpVersionDesc().ModifyAttr( \ + "force_cpu", \ + "In order to force fill output variable to gpu memory.", false)); #define REGISTER_COMPARE_OP(op_type, _equation) \ struct _##op_type##Comment { \ diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.h b/paddle/fluid/operators/controlflow/conditional_block_op.h index c024e4a12cd..c1d13ffdf12 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.h +++ b/paddle/fluid/operators/controlflow/conditional_block_op.h @@ -68,10 +68,11 @@ class ConditionalOp : public framework::OperatorBase { PADDLE_ENFORCE_EQ(framework::TransToProtoVarType(ips[0]->dtype()) == framework::proto::VarType::BOOL && ips[0]->numel() == 1, - true, platform::errors::InvalidArgument( - "condition input's data type should be bool, " - "numel should be 1, actual numel is %d", - ips[0]->numel())); + true, + platform::errors::InvalidArgument( + "condition input's data type should be bool, " + "numel should be 1, actual numel is %d", + ips[0]->numel())); bool res = false; if (platform::is_gpu_place(ips[0]->place())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/operators/controlflow/fetch_op.cc b/paddle/fluid/operators/controlflow/fetch_op.cc index 111ca9c63c6..369a1ffedc4 100644 --- a/paddle/fluid/operators/controlflow/fetch_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_op.cc @@ -35,10 +35,11 @@ static void DataCopy(const framework::LoDTensor &src_item, // as params are not a subject to paddle's data_format VLOG(4) << "innerTransDataLayoutFromMKLDNN"; framework::innerTransDataLayoutFromMKLDNN( - src_item.layout(), fetch_var_name == framework::GradVarName("Filter") - ? framework::DataLayout::kNCHW - : paddle::platform::MKLDNNDeviceContext::tls() - .get_cur_paddle_data_layout(), + src_item.layout(), + fetch_var_name == framework::GradVarName("Filter") + ? framework::DataLayout::kNCHW + : paddle::platform::MKLDNNDeviceContext::tls() + .get_cur_paddle_data_layout(), src_item, &out, platform::CPUPlace()); paddle::framework::TensorCopySync(out, platform::CPUPlace(), dst_item); } else { @@ -92,11 +93,12 @@ class FetchOp : public framework::OperatorBase { int col = Attr("col"); PADDLE_ENFORCE_GE( - col, 0, platform::errors::InvalidArgument( - "Expected the column index (the attribute 'col' of " - "operator 'Fetch') of current fetching variable to be " - "no less than 0. 
But received column index = %d.", - col)); + col, 0, + platform::errors::InvalidArgument( + "Expected the column index (the attribute 'col' of " + "operator 'Fetch') of current fetching variable to be " + "no less than 0. But received column index = %d.", + col)); VLOG(3) << "Fetch variable " << fetch_var_name << " to variable " << out_name << "'s " << col << " column."; diff --git a/paddle/fluid/operators/controlflow/fetch_v2_op.cc b/paddle/fluid/operators/controlflow/fetch_v2_op.cc index caa67139a9b..29d6eb1b2d4 100644 --- a/paddle/fluid/operators/controlflow/fetch_v2_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_v2_op.cc @@ -42,10 +42,11 @@ static void DeepCopy(const framework::LoDTensor &src_item, // Convert to desired Paddle layout, apart from grads of filter // as params are not a subject to paddle's data_format framework::innerTransDataLayoutFromMKLDNN( - src_item.layout(), fetch_var_name == framework::GradVarName("Filter") - ? framework::DataLayout::kNCHW - : paddle::platform::MKLDNNDeviceContext::tls() - .get_cur_paddle_data_layout(), + src_item.layout(), + fetch_var_name == framework::GradVarName("Filter") + ? framework::DataLayout::kNCHW + : paddle::platform::MKLDNNDeviceContext::tls() + .get_cur_paddle_data_layout(), src_item, &out, platform::CPUPlace()); paddle::framework::TensorCopySync(out, platform::CPUPlace(), dst_item); } else { @@ -123,11 +124,12 @@ class FetchV2Kernel { int col = ctx.Attr("col"); PADDLE_ENFORCE_GE( - col, 0, platform::errors::InvalidArgument( - "Expected the column index (the attribute 'col' of " - "operator 'Fetch') of current fetching variable to be " - "no less than 0. But received column index = %d.", - col)); + col, 0, + platform::errors::InvalidArgument( + "Expected the column index (the attribute 'col' of " + "operator 'Fetch') of current fetching variable to be " + "no less than 0. But received column index = %d.", + col)); auto *fetch_list = out_var->GetMutable(); diff --git a/paddle/fluid/operators/controlflow/get_places_op.cc b/paddle/fluid/operators/controlflow/get_places_op.cc index 55bd4879ab7..7f3b0040041 100644 --- a/paddle/fluid/operators/controlflow/get_places_op.cc +++ b/paddle/fluid/operators/controlflow/get_places_op.cc @@ -62,9 +62,10 @@ class GetPlacesOp : public framework::OperatorBase { device_count = is_gpu ? CUDADevCount() : std::thread::hardware_concurrency(); } - PADDLE_ENFORCE_NE(device_count, 0UL, platform::errors::InvalidArgument( - "Cannot indicate %s device count", - is_gpu ? "GPU" : "CPU")); + PADDLE_ENFORCE_NE( + device_count, 0UL, + platform::errors::InvalidArgument("Cannot indicate %s device count", + is_gpu ? "GPU" : "CPU")); auto out_var_name = Output("Out"); auto &places = *(GET_DATA_SAFELY(scope.FindVar(out_var_name), "Output", diff --git a/paddle/fluid/operators/controlflow/logical_op.cc b/paddle/fluid/operators/controlflow/logical_op.cc index 4d11cb5ff74..a9c28f48ef7 100644 --- a/paddle/fluid/operators/controlflow/logical_op.cc +++ b/paddle/fluid/operators/controlflow/logical_op.cc @@ -12,6 +12,7 @@ limitations under the License. 
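The fetch_op and fetch_v2_op hunks above also show the ternary treatment: instead of keeping the condition inline with the preceding argument, clang-format breaks before '?' and ':' so both branches align under the condition. A toy mirror of that layout (the names here are illustrative, not Paddle's API):

#include <string>

enum class Layout { kNCHW, kDefault };

Layout PickLayout(const std::string& fetch_var_name, Layout tls_layout) {
  // Break-before-ternary layout: '?' and ':' each open an aligned line.
  return fetch_var_name == "Filter@GRAD"
             ? Layout::kNCHW
             : tls_layout;
}

int main() { return PickLayout("X", Layout::kDefault) == Layout::kNCHW; }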
*/ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" diff --git a/paddle/fluid/operators/controlflow/op_variant.h b/paddle/fluid/operators/controlflow/op_variant.h index cc1f36a875f..57d44b67939 100644 --- a/paddle/fluid/operators/controlflow/op_variant.h +++ b/paddle/fluid/operators/controlflow/op_variant.h @@ -50,8 +50,9 @@ class OpVariant { const AttrType &Attr(const std::string &name) const { auto &attrs = Attrs(); auto it = attrs.find(name); - PADDLE_ENFORCE_NE(it, attrs.end(), platform::errors::NotFound( - "Cannot find attribute %s.", name)); + PADDLE_ENFORCE_NE( + it, attrs.end(), + platform::errors::NotFound("Cannot find attribute %s.", name)); return BOOST_GET_CONST(AttrType, it->second); } diff --git a/paddle/fluid/operators/controlflow/recurrent_op_helper.cc b/paddle/fluid/operators/controlflow/recurrent_op_helper.cc index 43913cae6b3..62cd2fc3376 100644 --- a/paddle/fluid/operators/controlflow/recurrent_op_helper.cc +++ b/paddle/fluid/operators/controlflow/recurrent_op_helper.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/controlflow/recurrent_op_helper.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index d8daa25f31b..a551bad8eb1 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -45,7 +45,7 @@ static std::string GetSkipEagerDeletionVarsDebugString( } return str; } -} // NOLINT +} // namespace class WhileOp : public framework::OperatorBase { public: @@ -375,10 +375,11 @@ class WhileGradOp : public framework::OperatorBase { PADDLE_ENFORCE_EQ( var->IsType() || var->IsType(), - true, platform::errors::InvalidArgument( - "Currently the type of var only can be LoDTensorArray, " - "or LoDTensor, but the received var[%s] is %s.", - inside_grad_name, framework::ToTypeName(var->Type()))); + true, + platform::errors::InvalidArgument( + "Currently the type of var only can be LoDTensorArray, " + "or LoDTensor, but the received var[%s] is %s.", + inside_grad_name, framework::ToTypeName(var->Type()))); if ((var_iter == outside_og_names.end()) && var->IsType()) { diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc index 63b273fdbb8..2b2001be6bf 100644 --- a/paddle/fluid/operators/controlflow/while_op_helper.cc +++ b/paddle/fluid/operators/controlflow/while_op_helper.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/operators/controlflow/while_op_helper.h" #include + #include "paddle/fluid/string/string_helper.h" namespace paddle { diff --git a/paddle/fluid/operators/conv_base_helper.h b/paddle/fluid/operators/conv_base_helper.h index 9e1a323fc9f..f141c9eb087 100644 --- a/paddle/fluid/operators/conv_base_helper.h +++ b/paddle/fluid/operators/conv_base_helper.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/conv_search_cache.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/phi/backends/gpu/gpu_context.h" diff --git a/paddle/fluid/operators/conv_cudnn_op_cache.h b/paddle/fluid/operators/conv_cudnn_op_cache.h index af67d857e0e..3d704c8be30 100644 --- a/paddle/fluid/operators/conv_cudnn_op_cache.h +++ b/paddle/fluid/operators/conv_cudnn_op_cache.h @@ -17,6 +17,7 @@ limitations under the License. 
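One comment-correctness fix rides along in while_op.cc above: the anonymous namespace's closing brace was annotated "// NOLINT" and is corrected to the canonical "// namespace" trailer, presumably via clang-format's FixNamespaceComments option. The shape it converges on:

namespace {  // file-local helpers, as around GetSkipEagerDeletionVarsDebugString

int Helper() { return 0; }

}  // namespace

int main() { return Helper(); }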
*/ #include #include #include + #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index f084862b419..28ca2feeec5 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -19,15 +19,13 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_version_registry.h" - #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif -#include "paddle/fluid/platform/cudnn_workspace_helper.h" - #include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/platform/cudnn_workspace_helper.h" #include "paddle/phi/infermeta/binary.h" namespace paddle { @@ -864,16 +862,15 @@ REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad, ops::Conv3DDoubleGradMaker); REGISTER_OPERATOR(conv3d_grad_grad, ops::ConvOpDoubleGrad); -REGISTER_OP_VERSION(conv2d) - .AddCheckpoint( - R"ROC( +REGISTER_OP_VERSION(conv2d).AddCheckpoint( + R"ROC( Upgrade conv2d, add a new attribute [use_addto]. )ROC", - paddle::framework::compatible::OpVersionDesc().NewAttr( - "use_addto", - "In order to support new feature (inplace addto strategy) for " - "gradient accumulation.", - false)); + paddle::framework::compatible::OpVersionDesc().NewAttr( + "use_addto", + "In order to support new feature (inplace addto strategy) for " + "gradient accumulation.", + false)); REGISTER_OP_VERSION(depthwise_conv2d) .AddCheckpoint( @@ -886,13 +883,12 @@ REGISTER_OP_VERSION(depthwise_conv2d) "gradient accumulation.", false)); -REGISTER_OP_VERSION(conv3d) - .AddCheckpoint( - R"ROC( +REGISTER_OP_VERSION(conv3d).AddCheckpoint( + R"ROC( Upgrade conv3d, add a new attribute [use_addto]. )ROC", - paddle::framework::compatible::OpVersionDesc().NewAttr( - "use_addto", - "In order to support new feature (inplace addto strategy) for " - "gradient accumulation.", - false)); + paddle::framework::compatible::OpVersionDesc().NewAttr( + "use_addto", + "In order to support new feature (inplace addto strategy) for " + "gradient accumulation.", + false)); diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 58f2eeee256..644a827b488 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -18,6 +18,7 @@ limitations under the License. 
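The conv2d/conv3d version checkpoints above (and cumsum, data_norm, and matrix_nms further down) show the same transformation: a lone .AddCheckpoint() no longer hangs indented under REGISTER_OP_VERSION but is fused onto the registration call itself, with the raw-string note and the OpVersionDesc argument carrying the indentation. A self-contained toy of the fused-chain layout (these types are stand-ins, not Paddle's registry API):

#include <string>

struct ToyVersionDesc {
  ToyVersionDesc& NewAttr(const std::string&, const std::string&, bool) {
    return *this;
  }
};
struct ToyRegistrar {
  ToyRegistrar& AddCheckpoint(const std::string&, const ToyVersionDesc&) {
    return *this;
  }
};
ToyRegistrar ToyRegisterOpVersion(const std::string&) { return {}; }

// Fused form: the method stays on the registration line, arguments wrap.
static ToyRegistrar conv2d_reg = ToyRegisterOpVersion("conv2d").AddCheckpoint(
    R"ROC(Upgrade conv2d, add a new attribute [use_addto].)ROC",
    ToyVersionDesc().NewAttr(
        "use_addto",
        "In order to support new feature (inplace addto strategy) for "
        "gradient accumulation.",
        false));

int main() { return 0; }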
*/ #include #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/layout_utils.h" diff --git a/paddle/fluid/operators/conv_op_npu.cc b/paddle/fluid/operators/conv_op_npu.cc index 3ace825e7b8..15a5aa737ae 100644 --- a/paddle/fluid/operators/conv_op_npu.cc +++ b/paddle/fluid/operators/conv_op_npu.cc @@ -130,12 +130,12 @@ class DepthwiseConvNPUKernel : public framework::OpKernel { "TransposeD", {*filter}, {transformed_filter}, {{"perm", perm}}); runner_trans.Run(stream); - const auto& runner = - NpuOpRunner("DepthwiseConv2D", {input_tensor, transformed_filter}, - {output_tensor}, {{"strides", strides}, - {"dilations", dilations}, - {"pads", padding}, - {"data_format", data_format}}); + const auto& runner = NpuOpRunner( + "DepthwiseConv2D", {input_tensor, transformed_filter}, {output_tensor}, + {{"strides", strides}, + {"dilations", dilations}, + {"pads", padding}, + {"data_format", data_format}}); runner.Run(stream); } }; @@ -392,14 +392,15 @@ class NPUConvGradOpKernel : public framework::OpKernel { filter_grad_fp32.ShareDataWith(*filter_grad); } - const auto& runner = NpuOpRunner( - "Conv2DBackpropFilterD", {input_tensor, output_grad_tensor}, - {filter_grad_fp32}, {{"filter_size", filter_shape_vec}, - {"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); + const auto& runner = + NpuOpRunner("Conv2DBackpropFilterD", + {input_tensor, output_grad_tensor}, {filter_grad_fp32}, + {{"filter_size", filter_shape_vec}, + {"strides", strides_vec}, + {"pads", paddings}, + {"dilations", dilations_vec}, + {"groups", groups}, + {"data_format", data_format}}); runner.Run(stream); if (framework::TransToProtoVarType(input->dtype()) == @@ -418,12 +419,13 @@ class NPUConvGradOpKernel : public framework::OpKernel { } const auto& runner = NpuOpRunner("Conv2DBackpropInputD", {*filter, output_grad_tensor}, - {input_grad_tensor}, {{"input_size", input_shape_vec}, - {"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); + {input_grad_tensor}, + {{"input_size", input_shape_vec}, + {"strides", strides_vec}, + {"pads", paddings}, + {"dilations", dilations_vec}, + {"groups", groups}, + {"data_format", data_format}}); runner.Run(stream); } } @@ -452,11 +454,12 @@ class NPUConv3dKernel : public framework::OpKernel { "= [%s]", data_format)); - PADDLE_ENFORCE_EQ(groups, 1, platform::errors::Unimplemented( - "the groups must be 1 in " - "the npu kernel of conv3d, but got groups " - "= [%d]", - groups)); + PADDLE_ENFORCE_EQ(groups, 1, + platform::errors::Unimplemented( + "the groups must be 1 in " + "the npu kernel of conv3d, but got groups " + "= [%d]", + groups)); output->mutable_data(ctx.GetPlace()); @@ -537,11 +540,12 @@ class NPUConv3dGradKernel : public framework::OpKernel { "= [%s]", data_format)); - PADDLE_ENFORCE_EQ(groups, 1, platform::errors::Unimplemented( - "the groups must be 1 in " - "the npu kernel of conv3d, but got groups " - "= [%d]", - groups)); + PADDLE_ENFORCE_EQ(groups, 1, + platform::errors::Unimplemented( + "the groups must be 1 in " + "the npu kernel of conv3d, but got groups " + "= [%d]", + groups)); auto& dev_ctx = ctx.template device_context(); auto input_tensor = @@ -593,14 +597,15 @@ class NPUConv3dGradKernel : public framework::OpKernel { filter_grad_tensor.ShareDataWith(*filter_grad); filter_grad_tensor.set_layout(DataLayout::kNCDHW); - const 
auto& runner = NpuOpRunner( - "Conv3DBackpropFilterD", {input_tensor, output_grad_tensor}, - {filter_grad_tensor}, {{"filter_size", filter_shape_vec}, - {"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); + const auto& runner = + NpuOpRunner("Conv3DBackpropFilterD", + {input_tensor, output_grad_tensor}, {filter_grad_tensor}, + {{"filter_size", filter_shape_vec}, + {"strides", strides_vec}, + {"pads", paddings}, + {"dilations", dilations_vec}, + {"groups", groups}, + {"data_format", data_format}}); runner.Run(stream); } @@ -613,14 +618,15 @@ class NPUConv3dGradKernel : public framework::OpKernel { input_grad_tensor.ShareDataWith(*input_grad); input_grad_tensor.set_layout(DataLayout::kNCDHW); - const auto& runner = NpuOpRunner( - "Conv3DBackpropInputD", {filter_tensor, output_grad_tensor}, - {input_grad_tensor}, {{"input_size", input_shape_vec}, - {"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); + const auto& runner = + NpuOpRunner("Conv3DBackpropInputD", + {filter_tensor, output_grad_tensor}, {input_grad_tensor}, + {{"input_size", input_shape_vec}, + {"strides", strides_vec}, + {"pads", paddings}, + {"dilations", dilations_vec}, + {"groups", groups}, + {"data_format", data_format}}); runner.Run(stream); } } diff --git a/paddle/fluid/operators/conv_op_xpu.cc b/paddle/fluid/operators/conv_op_xpu.cc index cc5c20d3928..d66eefc6946 100644 --- a/paddle/fluid/operators/conv_op_xpu.cc +++ b/paddle/fluid/operators/conv_op_xpu.cc @@ -8,10 +8,11 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/conv_op.h" #include #include #include + +#include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/platform/cudnn_workspace_helper.h" #ifdef PADDLE_WITH_XPU namespace paddle { diff --git a/paddle/fluid/operators/conv_shift_op.cc b/paddle/fluid/operators/conv_shift_op.cc index e7af908eba2..e996021ed84 100644 --- a/paddle/fluid/operators/conv_shift_op.cc +++ b/paddle/fluid/operators/conv_shift_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/conv_shift_op.h" + #include + #include "paddle/fluid/framework/eigen.h" namespace paddle { diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index fe76fc3aebb..8b60c67f92e 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/conv_transpose_op_npu.cc b/paddle/fluid/operators/conv_transpose_op_npu.cc index 050ede78f72..c07be5a3fdb 100644 --- a/paddle/fluid/operators/conv_transpose_op_npu.cc +++ b/paddle/fluid/operators/conv_transpose_op_npu.cc @@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/conv_transpose_op.h" - #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/conv_transpose_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/phi/kernels/cpu/conv_util.h" @@ -90,9 +89,9 @@ class Conv2DTransposeNPUKernel : public framework::OpKernel { auto output_dim_vec = phi::vectorize(output_tensor.dims()); auto stream = ctx.template device_context().stream(); - const auto& runner = - NpuOpRunner("Conv2DTransposeD", {input_tensor, *filter}, - {output_tensor}, {{"input_size", output_dim_vec}, + const auto& runner = NpuOpRunner("Conv2DTransposeD", + {input_tensor, *filter}, {output_tensor}, + {{"input_size", output_dim_vec}, {"strides", strides}, {"dilations", dilations}, {"output_padding", output_padding}, @@ -167,14 +166,15 @@ class Conv2DTransposeGradNPUKernel : public framework::OpKernel { auto stream = ctx.template device_context().stream(); if (filter_grad) { filter_grad->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner( - "Conv2DBackpropFilterD", {output_grad_tensor, input_tensor}, - {*filter_grad}, {{"filter_size", phi::vectorize(filter_dims)}, - {"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); + const auto& runner = + NpuOpRunner("Conv2DBackpropFilterD", + {output_grad_tensor, input_tensor}, {*filter_grad}, + {{"filter_size", phi::vectorize(filter_dims)}, + {"strides", strides_vec}, + {"pads", paddings}, + {"dilations", dilations_vec}, + {"groups", groups}, + {"data_format", data_format}}); runner.Run(stream); } if (input_grad) { @@ -184,13 +184,13 @@ class Conv2DTransposeGradNPUKernel : public framework::OpKernel { if (channel_last) { input_grad_tensor.set_layout(DataLayout::kNHWC); } - const auto& runner = - NpuOpRunner("Conv2D", {output_grad_tensor, *filter}, - {input_grad_tensor}, {{"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); + const auto& runner = NpuOpRunner("Conv2D", {output_grad_tensor, *filter}, + {input_grad_tensor}, + {{"strides", strides_vec}, + {"pads", paddings}, + {"dilations", dilations_vec}, + {"groups", groups}, + {"data_format", data_format}}); runner.Run(stream); } } diff --git a/paddle/fluid/operators/conv_transpose_op_xpu.cc b/paddle/fluid/operators/conv_transpose_op_xpu.cc index b8bd3c4f006..ae25c57784f 100644 --- a/paddle/fluid/operators/conv_transpose_op_xpu.cc +++ b/paddle/fluid/operators/conv_transpose_op_xpu.cc @@ -9,12 +9,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/conv_transpose_op.h" - #include #include #include + #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/conv_transpose_op.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/phi/kernels/cpu/conv_util.h" diff --git a/paddle/fluid/operators/correlation_op.cc b/paddle/fluid/operators/correlation_op.cc index 62e0f311d15..21258958549 100644 --- a/paddle/fluid/operators/correlation_op.cc +++ b/paddle/fluid/operators/correlation_op.cc @@ -16,6 +16,7 @@ limitations under the License. 
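Across the NPU hunks above, the NpuOpRunner attribute maps settle into one {"key", value} pair per line, aligned under the brace that opens the initializer list, with the inputs and outputs kept on the constructor's earlier lines. The same brace-list layout, reproduced on a plain standard-library type so it compiles stand-alone:

#include <map>
#include <string>
#include <vector>

int main() {
  // One attribute pair per line, aligned under the list's opening brace,
  // mirroring the Conv2DBackpropFilterD / Conv3DBackpropInputD call sites.
  std::map<std::string, std::vector<int>> attrs = {{"strides", {1, 1, 1, 1}},
                                                   {"dilations", {1, 1, 1, 1}},
                                                   {"pads", {0, 0, 0, 0}},
                                                   {"groups", {1}}};
  return static_cast<int>(attrs.size()) - 4;
}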
*/ #include #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/correlation_op.cu b/paddle/fluid/operators/correlation_op.cu index f488cc12e64..f9dd9ab98a3 100644 --- a/paddle/fluid/operators/correlation_op.cu +++ b/paddle/fluid/operators/correlation_op.cu @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" #ifdef __HIPCC__ @@ -227,11 +228,11 @@ class CorrelationCUDAKernel : public framework::OpKernel { dim3 threadsPerBlock(THREADS_PER_BLOCK); dim3 totalBlocksCorr(N, OH, OW); - correlation_forward< - T><<>>( - output->data(), OC, OH, OW, rinput1.data(), C, H, W, - rinput2.data(), pad_size, kernel_size, max_displacement, stride1, - stride2); + correlation_forward + <<>>( + output->data(), OC, OH, OW, rinput1.data(), C, H, W, + rinput2.data(), pad_size, kernel_size, max_displacement, stride1, + stride2); } }; @@ -472,19 +473,19 @@ class CorrelationCUDAGradKernel : public framework::OpKernel { dim3 totalBlocksCorr(H, W, C); for (int n = 0; n < N; n++) { - correlation_backward_input1< - T><<>>( - n, grad_input1->data(), C, H, W, grad_output->data(), GOC, GOH, - GOW, rinput2.data(), pad_size, kernel_size, max_displacement, - stride1, stride2); + correlation_backward_input1 + <<>>( + n, grad_input1->data(), C, H, W, grad_output->data(), GOC, + GOH, GOW, rinput2.data(), pad_size, kernel_size, + max_displacement, stride1, stride2); } for (int n = 0; n < N; n++) { - correlation_backward_input2< - T><<>>( - n, grad_input2->data(), C, H, W, grad_output->data(), GOC, GOH, - GOW, rinput1.data(), pad_size, kernel_size, max_displacement, - stride1, stride2); + correlation_backward_input2 + <<>>( + n, grad_input2->data(), C, H, W, grad_output->data(), GOC, + GOH, GOW, rinput1.data(), pad_size, kernel_size, + max_displacement, stride1, stride2); } } }; diff --git a/paddle/fluid/operators/cos_sim_op.cc b/paddle/fluid/operators/cos_sim_op.cc index d41ceafba1a..4c0c5596e5d 100644 --- a/paddle/fluid/operators/cos_sim_op.cc +++ b/paddle/fluid/operators/cos_sim_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/cos_sim_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/crf_decoding_op.cc b/paddle/fluid/operators/crf_decoding_op.cc index 6d3e6e34c3b..fa080b7a4b4 100644 --- a/paddle/fluid/operators/crf_decoding_op.cc +++ b/paddle/fluid/operators/crf_decoding_op.cc @@ -158,11 +158,12 @@ class CRFDecodingOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( (label_dims.size() == 2UL && label_dims[1] == 1) || label_dims.size() == 1UL, - true, platform::errors::InvalidArgument( - "The Input(Label) should be a 2-D tensor with last " - "dimension fixed to 1 or a 1-D tensor. But received: " - "input rank %u, input shape [%s].", - label_dims.size(), label_dims)); + true, + platform::errors::InvalidArgument( + "The Input(Label) should be a 2-D tensor with last " + "dimension fixed to 1 or a 1-D tensor. 
But received: " + "input rank %u, input shape [%s].", + label_dims.size(), label_dims)); } if (ctx->IsRuntime() || (emission_dims[0] > 0 && label_dims[0] > 0)) { PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h index 6b11ff69c30..8b40abf3deb 100644 --- a/paddle/fluid/operators/crf_decoding_op.h +++ b/paddle/fluid/operators/crf_decoding_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/jit/kernels.h" @@ -22,8 +23,8 @@ limitations under the License. */ namespace paddle { namespace operators { -using framework::LoDTensor; using framework::LoD; +using framework::LoDTensor; using framework::Tensor; template diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc index 9de5bc6ea36..2e0a054fa12 100644 --- a/paddle/fluid/operators/crop_op.cc +++ b/paddle/fluid/operators/crop_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/crop_op.h" + #include #include #include diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h index 5ac28fafb09..49e1d6ab584 100644 --- a/paddle/fluid/operators/crop_op.h +++ b/paddle/fluid/operators/crop_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/eigen/eigen_function.h" @@ -171,17 +172,19 @@ class CropGradKernel : public framework::OpKernel { size_t rank = context.Input(framework::GradVarName("Out"))->dims().size(); PADDLE_ENFORCE_GE( - rank, 1, platform::errors::InvalidArgument( - "The number of dimensions of the input 'Out@GRAD' for " - "CropGrad must be greater than or equal " - "to 1, but the value received is %d.", - rank)); + rank, 1, + platform::errors::InvalidArgument( + "The number of dimensions of the input 'Out@GRAD' for " + "CropGrad must be greater than or equal " + "to 1, but the value received is %d.", + rank)); PADDLE_ENFORCE_LE( - rank, 6, platform::errors::InvalidArgument( - "The number of dimensions of the input 'Out@GRAD' for " - "CropGrad must be less than or equal " - "to 6, but the value received is %d.", - rank)); + rank, 6, + platform::errors::InvalidArgument( + "The number of dimensions of the input 'Out@GRAD' for " + "CropGrad must be less than or equal " + "to 6, but the value received is %d.", + rank)); switch (rank) { case 1: CropGradFunction(context); diff --git a/paddle/fluid/operators/crop_tensor_op.cc b/paddle/fluid/operators/crop_tensor_op.cc index 0e53bbb5d18..a9a94e2c948 100644 --- a/paddle/fluid/operators/crop_tensor_op.cc +++ b/paddle/fluid/operators/crop_tensor_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/crop_tensor_op.h" + #include #include #include diff --git a/paddle/fluid/operators/crop_tensor_op.h b/paddle/fluid/operators/crop_tensor_op.h index 409458037a2..851d007896d 100644 --- a/paddle/fluid/operators/crop_tensor_op.h +++ b/paddle/fluid/operators/crop_tensor_op.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/eigen/eigen_function.h" @@ -72,11 +73,12 @@ static framework::DDim ValidateShape(const std::vector shape, "The value (%d) of the %uth element for shape of " "Op(crop_tensor) should not be zero.", shape[i], i)); - PADDLE_ENFORCE_EQ(shape[i], -1, platform::errors::InvalidArgument( - "When the value (%d) of the %uth " - "element for shape of Op(crop_tensor)" - " is negative, only -1 is supported.", - shape[i], i)); + PADDLE_ENFORCE_EQ(shape[i], -1, + platform::errors::InvalidArgument( + "When the value (%d) of the %uth " + "element for shape of Op(crop_tensor)" + " is negative, only -1 is supported.", + shape[i], i)); output_shape[i] = in_dims[i] - offsets[i]; } else { output_shape[i] = static_cast(shape[i]); @@ -226,11 +228,12 @@ class CropTensorKernel : public framework::OpKernel { "value received is %d.", rank)); PADDLE_ENFORCE_LE( - rank, 6, platform::errors::InvalidArgument( - "The number of dimensions of the input 'x' for " - "Op(crop_tensor) must be less than or equal to 6, but the " - "value received is %d.", - rank)); + rank, 6, + platform::errors::InvalidArgument( + "The number of dimensions of the input 'x' for " + "Op(crop_tensor) must be less than or equal to 6, but the " + "value received is %d.", + rank)); switch (rank) { case 1: CropTensorFunction(context); diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc index 4f5912c81ba..a880584f4cf 100644 --- a/paddle/fluid/operators/cross_entropy_op.cc +++ b/paddle/fluid/operators/cross_entropy_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/cross_entropy_op.h" + #include #include #include diff --git a/paddle/fluid/operators/cross_op.cc b/paddle/fluid/operators/cross_op.cc index 674b75625d1..977d84e1e47 100644 --- a/paddle/fluid/operators/cross_op.cc +++ b/paddle/fluid/operators/cross_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" @@ -21,8 +22,8 @@ namespace paddle { namespace operators { -using framework::Tensor; using framework::DDim; +using framework::Tensor; const int kDefaultDim = framework::DDim::kMaxRank; class CrossOp : public framework::OperatorWithKernel { diff --git a/paddle/fluid/operators/ctc_align_op.cu b/paddle/fluid/operators/ctc_align_op.cu index ba90c677570..10ec5a6bdd1 100644 --- a/paddle/fluid/operators/ctc_align_op.cu +++ b/paddle/fluid/operators/ctc_align_op.cu @@ -15,7 +15,9 @@ limitations under the License. 
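The small using-declaration swaps (LoD before LoDTensor in crf_decoding_op.h above, DDim before Tensor in cross_op.cc, and gpc_free_polygon before gpc_polygon_clip in poly_util.cc near the end of this section) come from clang-format sorting adjacent using-declarations alphabetically (the SortUsingDeclarations style option). Stand-alone shape:

namespace framework {
struct LoD {};
struct LoDTensor {};
}  // namespace framework

// Adjacent using-declarations are kept in alphabetical order:
using framework::LoD;
using framework::LoDTensor;

int main() {
  LoD lod;
  LoDTensor tensor;
  (void)lod;
  (void)tensor;
  return 0;
}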
*/ #include #include #include + #include + #include "paddle/fluid/operators/ctc_align_op.h" namespace paddle { @@ -92,10 +94,10 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel { auto* output_length = ctx.Output("OutputLength"); T* output_length_data = output_length->mutable_data({input_dims[0], 1}, ctx.GetPlace()); - PaddingMergeAndDelCudaKernel< - T><<<32, (input_dims[0] + 32 - 1) / 32, 0, stream>>>( - input_dims[1], tokens, input_length_data, blank, merge_repeated, - padding_value, input_dims[0], output_data, output_length_data); + PaddingMergeAndDelCudaKernel + <<<32, (input_dims[0] + 32 - 1) / 32, 0, stream>>>( + input_dims[1], tokens, input_length_data, blank, merge_repeated, + padding_value, input_dims[0], output_data, output_length_data); } else { const size_t level = 0; auto input_lod = framework::ToAbsOffset(input->lod()); diff --git a/paddle/fluid/operators/ctc_align_op.h b/paddle/fluid/operators/ctc_align_op.h index c561974b0c9..9e189a9fb63 100644 --- a/paddle/fluid/operators/ctc_align_op.h +++ b/paddle/fluid/operators/ctc_align_op.h @@ -15,7 +15,9 @@ limitations under the License. */ #pragma once #include + #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/cudnn_lstm_cache.h b/paddle/fluid/operators/cudnn_lstm_cache.h index 5451cf815ca..da8284b4f2e 100644 --- a/paddle/fluid/operators/cudnn_lstm_cache.h +++ b/paddle/fluid/operators/cudnn_lstm_cache.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/dynload/cudnn.h" diff --git a/paddle/fluid/operators/cudnn_lstm_op.cc b/paddle/fluid/operators/cudnn_lstm_op.cc index ccb0062fcc7..9ff4f796995 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/operators/cudnn_rnn_cache.h b/paddle/fluid/operators/cudnn_rnn_cache.h index 6c059257b94..e2159a09c12 100644 --- a/paddle/fluid/operators/cudnn_rnn_cache.h +++ b/paddle/fluid/operators/cudnn_rnn_cache.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc index 11633fb0b87..dbb703e7e87 100644 --- a/paddle/fluid/operators/cumsum_op.cc +++ b/paddle/fluid/operators/cumsum_op.cc @@ -86,13 +86,12 @@ REGISTER_OPERATOR(cumsum, ops::CumOp, ops::CumsumOpMaker, ops::CumsumGradMaker, CumsumInferShapeFunctor); -REGISTER_OP_VERSION(cumsum) - .AddCheckpoint( - R"ROC( +REGISTER_OP_VERSION(cumsum).AddCheckpoint( + R"ROC( Upgrade cumsum add a new attribute [flatten]. 
)ROC", - paddle::framework::compatible::OpVersionDesc().NewAttr( - "flatten", - "In order to compute the cumsum over the flattened array when the " - "argument `axis` in python API is None.", - false)); + paddle::framework::compatible::OpVersionDesc().NewAttr( + "flatten", + "In order to compute the cumsum over the flattened array when the " + "argument `axis` in python API is None.", + false)); diff --git a/paddle/fluid/operators/cvm_op.cc b/paddle/fluid/operators/cvm_op.cc index e909906da7b..912167cec5a 100644 --- a/paddle/fluid/operators/cvm_op.cc +++ b/paddle/fluid/operators/cvm_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/cvm_op.h" + #include + #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { diff --git a/paddle/fluid/operators/data_norm_op.cc b/paddle/fluid/operators/data_norm_op.cc index 137de2d5af9..8287654949e 100644 --- a/paddle/fluid/operators/data_norm_op.cc +++ b/paddle/fluid/operators/data_norm_op.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/data_norm_op.h" + #include #include + #include "paddle/fluid/framework/data_layout.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -163,10 +165,11 @@ class DataNormOp : public framework::OperatorWithKernel { OperatorWithKernel::IndicateVarDataType(ctx, "BatchSum"), platform::errors::InvalidArgument( "BatchSum input should be of float type")); - PADDLE_ENFORCE_EQ(dn_param_type, OperatorWithKernel::IndicateVarDataType( - ctx, "BatchSquareSum"), - platform::errors::InvalidArgument( - "BatchSquareSum input should be of float type")); + PADDLE_ENFORCE_EQ( + dn_param_type, + OperatorWithKernel::IndicateVarDataType(ctx, "BatchSquareSum"), + platform::errors::InvalidArgument( + "BatchSquareSum input should be of float type")); bool enable_scale_and_shift = ctx.Attr("enable_scale_and_shift"); if (enable_scale_and_shift) { @@ -277,8 +280,9 @@ class DataNormKernel const auto *x = ctx.Input("X"); const auto &x_dims = x->dims(); - PADDLE_ENFORCE_EQ(x_dims.size(), 2, platform::errors::InvalidArgument( - "The Input dim size should be 2")); + PADDLE_ENFORCE_EQ( + x_dims.size(), 2, + platform::errors::InvalidArgument("The Input dim size should be 2")); const int N = x_dims[0]; const int C = (data_layout == DataLayout::kNCHW ? x_dims[1] @@ -515,8 +519,9 @@ class DataNormGradKernel // Get the size for each dimension. // NCHW [batch_size, in_channels, in_height, in_width] const auto &x_dims = x->dims(); - PADDLE_ENFORCE_EQ(x_dims.size(), 2, platform::errors::InvalidArgument( - "The Input dim size should be 2")); + PADDLE_ENFORCE_EQ( + x_dims.size(), 2, + platform::errors::InvalidArgument("The Input dim size should be 2")); const int N = x_dims[0]; const int C = (data_layout == DataLayout::kNCHW ? 
x_dims[1] @@ -757,10 +762,9 @@ REGISTER_OP_CPU_KERNEL( data_norm_grad, ops::DataNormGradKernel, ops::DataNormGradKernel); -REGISTER_OP_VERSION(data_norm) - .AddCheckpoint( - R"ROC( +REGISTER_OP_VERSION(data_norm).AddCheckpoint( + R"ROC( upgrad data_norm op by adding scale_w to support scale and shift.)ROC", - paddle::framework::compatible::OpVersionDesc().NewInput( - "scale_w", - "scale_w is used to do scale duirng data_norm like batchnorm ")); + paddle::framework::compatible::OpVersionDesc().NewInput( + "scale_w", + "scale_w is used to do scale duirng data_norm like batchnorm ")); diff --git a/paddle/fluid/operators/data_norm_op.cu b/paddle/fluid/operators/data_norm_op.cu index 28a79221201..21c7d7d4bf4 100644 --- a/paddle/fluid/operators/data_norm_op.cu +++ b/paddle/fluid/operators/data_norm_op.cu @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/data_norm_op.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" @@ -100,8 +101,9 @@ class DataNormKernel const auto *x = ctx.Input("X"); const auto &x_dims = x->dims(); // Align with CPU version, but should we add this restriction? - PADDLE_ENFORCE_EQ(x_dims.size(), 2, platform::errors::PreconditionNotMet( - "The Input dim size should be 2")); + PADDLE_ENFORCE_EQ( + x_dims.size(), 2, + platform::errors::PreconditionNotMet("The Input dim size should be 2")); const int N = x_dims[0]; const int C = x_dims[1]; const T *batch_size_in = ctx.Input("BatchSize")->data(); @@ -143,8 +145,9 @@ class DataNormGradKernel const auto &x_dims = x->dims(); // Align with CPU version, but should we add this restriction? - PADDLE_ENFORCE_EQ(x_dims.size(), 2, platform::errors::PreconditionNotMet( - "The Input dim size should be 2")); + PADDLE_ENFORCE_EQ( + x_dims.size(), 2, + platform::errors::PreconditionNotMet("The Input dim size should be 2")); const int N = x_dims[0]; const int C = x_dims[1]; diff --git a/paddle/fluid/operators/decode_jpeg_op.cu b/paddle/fluid/operators/decode_jpeg_op.cu index de6b35bc9cd..a257afc50f9 100644 --- a/paddle/fluid/operators/decode_jpeg_op.cu +++ b/paddle/fluid/operators/decode_jpeg_op.cu @@ -15,6 +15,7 @@ #if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_HIP) #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/dynload/nvjpeg.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/operators/deformable_conv_op.cc b/paddle/fluid/operators/deformable_conv_op.cc index 1b76aca1e66..b54c8a81abd 100644 --- a/paddle/fluid/operators/deformable_conv_op.cc +++ b/paddle/fluid/operators/deformable_conv_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/deformable_conv_op_xpu.cc b/paddle/fluid/operators/deformable_conv_op_xpu.cc index 240e5658956..d977cfe844a 100644 --- a/paddle/fluid/operators/deformable_conv_op_xpu.cc +++ b/paddle/fluid/operators/deformable_conv_op_xpu.cc @@ -15,6 +15,7 @@ limitations under the License. 
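In the .cu hunks (correlation_op.cu and ctc_align_op.cu above), the launch syntax changes shape: instead of breaking inside the kernel's template argument list, clang-format now keeps the template-qualified kernel name intact and breaks before the <<<grid, block, shared, stream>>> launch configuration. A minimal CUDA sketch of that layout (compile with nvcc; the kernel itself is illustrative):

#include <cuda_runtime.h>

template <typename T>
__global__ void scale_kernel(T* data, int n, T factor) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}

int main() {
  const int n = 1024;
  float* d_data = nullptr;
  cudaMalloc(&d_data, n * sizeof(float));
  // Post-format layout: kernel<T> stays whole and the break lands before
  // "<<<", as it does for correlation_forward<T> in the hunks above.
  scale_kernel<float>
      <<<(n + 255) / 256, 256>>>(d_data, n, 2.0f);
  cudaDeviceSynchronize();
  cudaFree(d_data);
  return 0;
}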
*/ #ifdef PADDLE_WITH_XPU #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" @@ -169,28 +170,32 @@ class DeformableConvGradXPUKernel : public framework::OpKernel { const float* offset_ptr = offset.data(); const float* mask_ptr = mask.data(); if (dx_data == nullptr) { - PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&dx_data), - input->numel() * sizeof(T)), - XPU_SUCCESS, platform::errors::ResourceExhausted( - "XPU has no enough memory")); + PADDLE_ENFORCE_EQ( + xpu_malloc(reinterpret_cast(&dx_data), + input->numel() * sizeof(T)), + XPU_SUCCESS, + platform::errors::ResourceExhausted("XPU has no enough memory")); } if (dw_data == nullptr) { - PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&dw_data), - filter.numel() * sizeof(T)), - XPU_SUCCESS, platform::errors::ResourceExhausted( - "XPU has no enough memory")); + PADDLE_ENFORCE_EQ( + xpu_malloc(reinterpret_cast(&dw_data), + filter.numel() * sizeof(T)), + XPU_SUCCESS, + platform::errors::ResourceExhausted("XPU has no enough memory")); } if (doffset_data == nullptr) { - PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&doffset_data), - offset.numel() * sizeof(T)), - XPU_SUCCESS, platform::errors::ResourceExhausted( - "XPU has no enough memory")); + PADDLE_ENFORCE_EQ( + xpu_malloc(reinterpret_cast(&doffset_data), + offset.numel() * sizeof(T)), + XPU_SUCCESS, + platform::errors::ResourceExhausted("XPU has no enough memory")); } if (dmask_data == nullptr) { - PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&dmask_data), - mask.numel() * sizeof(T)), - XPU_SUCCESS, platform::errors::ResourceExhausted( - "XPU has no enough memory")); + PADDLE_ENFORCE_EQ( + xpu_malloc(reinterpret_cast(&dmask_data), + mask.numel() * sizeof(T)), + XPU_SUCCESS, + platform::errors::ResourceExhausted("XPU has no enough memory")); } int input_dim = input->numel() / input->dims()[0]; @@ -207,10 +212,11 @@ class DeformableConvGradXPUKernel : public framework::OpKernel { int f = filter.dims()[0]; T* filter_grad_tmp = nullptr; - PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&filter_grad_tmp), - filter_grad->numel() * sizeof(T)), - XPU_SUCCESS, platform::errors::ResourceExhausted( - "XPU has no enough memory")); + PADDLE_ENFORCE_EQ( + xpu_malloc(reinterpret_cast(&filter_grad_tmp), + filter_grad->numel() * sizeof(T)), + XPU_SUCCESS, + platform::errors::ResourceExhausted("XPU has no enough memory")); // set zeros for d_table_data const int zero = 0; diff --git a/paddle/fluid/operators/deformable_conv_v1_op.cc b/paddle/fluid/operators/deformable_conv_v1_op.cc index 0ec95cb54ba..2da561c8685 100644 --- a/paddle/fluid/operators/deformable_conv_v1_op.cc +++ b/paddle/fluid/operators/deformable_conv_v1_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cc b/paddle/fluid/operators/deformable_psroi_pooling_op.cc index 7e7cdbd8d17..a989e3f9217 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cc +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cc @@ -13,9 +13,11 @@ // limitations under the License. 
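The deformable_conv XPU hunks above show what happens when the tested expression is itself a multi-line call: clang-format breaks immediately after PADDLE_ENFORCE_EQ's opening parenthesis so the whole xpu_malloc(...) argument indents as one block ahead of XPU_SUCCESS and the ResourceExhausted message. A toy reproduction (both the macro and the allocator below are illustrative stand-ins):

#include <cstddef>
#include <cstdio>
#include <cstdlib>

#define STUB_ENFORCE_EQ(a, b, msg)         \
  do {                                     \
    if ((a) != (b)) {                      \
      std::fprintf(stderr, "%s\n", (msg)); \
      std::abort();                        \
    }                                      \
  } while (0)

// Stands in for xpu_malloc: returns 0 on success.
int fake_xpu_malloc(void** ptr, std::size_t size) {
  *ptr = std::malloc(size);
  return *ptr ? 0 : -1;
}

int main() {
  void* dx_data = nullptr;
  // Break directly after '(' so the multi-line first argument, the expected
  // value, and the message each read as their own block:
  STUB_ENFORCE_EQ(
      fake_xpu_malloc(reinterpret_cast<void**>(&dx_data),
                      1024 * sizeof(float)),
      0,
      "XPU has no enough memory");
  std::free(dx_data);
  return 0;
}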
#include "paddle/fluid/operators/deformable_psroi_pooling_op.h" + #include #include #include + #include "paddle/phi/kernels/funcs/blas/blas.h" namespace paddle { @@ -165,11 +167,12 @@ class DeformablePSROIPoolOp : public framework::OperatorWithKernel { auto part_width = part_size[1]; auto sample_per_part = ctx->Attrs().Get("sample_per_part"); auto trans_std = ctx->Attrs().Get("trans_std"); - PADDLE_ENFORCE_GE(trans_std, 0., platform::errors::InvalidArgument( - "Input(trans_std) should not be lower " - "than 0.0, but received trans_std " - "is:%f", - trans_std)); + PADDLE_ENFORCE_GE(trans_std, 0., + platform::errors::InvalidArgument( + "Input(trans_std) should not be lower " + "than 0.0, but received trans_std " + "is:%f", + trans_std)); PADDLE_ENFORCE_GE( input_dims[1], output_channels, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cu b/paddle/fluid/operators/deformable_psroi_pooling_op.cu index 873950b2d2f..174f045c160 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cu +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cu @@ -23,10 +23,12 @@ #pragma once #include + #include #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/deformable_psroi_pooling_op.h" diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.h b/paddle/fluid/operators/deformable_psroi_pooling_op.h index 3deabce54ed..6ff6ab20df2 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.h +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.h @@ -25,6 +25,7 @@ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/dequantize_op.cc b/paddle/fluid/operators/dequantize_op.cc index 876bd1199ad..2bed296efd7 100644 --- a/paddle/fluid/operators/dequantize_op.cc +++ b/paddle/fluid/operators/dequantize_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/dequantize_op.h" + #include "paddle/fluid/framework/op_version_registry.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -47,8 +48,8 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(dequantize, ops::DeQuantOp, ops::DeQuantOpMaker); REGISTER_OP_VERSION(dequantize) - .AddCheckpoint( - R"ROC( Add a new attribute [Shift])ROC", - paddle::framework::compatible::OpVersionDesc().NewAttr( - "Shift", "Dequantize data to uint8 if provided non-zero value.", - 0.0f)); + .AddCheckpoint(R"ROC( Add a new attribute [Shift])ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "Shift", + "Dequantize data to uint8 if provided non-zero value.", + 0.0f)); diff --git a/paddle/fluid/operators/dequantize_op.h b/paddle/fluid/operators/dequantize_op.h index 75c27a06c21..ea7a08c8f36 100644 --- a/paddle/fluid/operators/dequantize_op.h +++ b/paddle/fluid/operators/dequantize_op.h @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/dequeue_op.cc b/paddle/fluid/operators/dequeue_op.cc index fb5d53dacf0..1a6286b0a32 100644 --- a/paddle/fluid/operators/dequeue_op.cc +++ b/paddle/fluid/operators/dequeue_op.cc @@ -14,6 +14,7 @@ #include #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/operators/detection/anchor_generator_op.h b/paddle/fluid/operators/detection/anchor_generator_op.h index 0bcb56d7aa8..b3d490ac0b5 100644 --- a/paddle/fluid/operators/detection/anchor_generator_op.h +++ b/paddle/fluid/operators/detection/anchor_generator_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/transform.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/detection/bbox_util.h b/paddle/fluid/operators/detection/bbox_util.h index 7bbbbe7f40e..b9b9b0b0c0d 100644 --- a/paddle/fluid/operators/detection/bbox_util.h +++ b/paddle/fluid/operators/detection/bbox_util.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -122,8 +123,9 @@ void BboxOverlaps(const framework::Tensor& r_boxes, inter_h = std::max(y_max - y_min + 1, zero); inter_area = inter_w * inter_h; overlaps_et(i, j) = - (inter_area == 0.) ? 0 : inter_area / - (r_box_area + c_box_area - inter_area); + (inter_area == 0.) + ? 0 + : inter_area / (r_box_area + c_box_area - inter_area); } } } diff --git a/paddle/fluid/operators/detection/box_clip_op.cc b/paddle/fluid/operators/detection/box_clip_op.cc index 73f0607fdde..08d688a1495 100644 --- a/paddle/fluid/operators/detection/box_clip_op.cc +++ b/paddle/fluid/operators/detection/box_clip_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/detection/box_clip_op.h" + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/box_clip_op.cu b/paddle/fluid/operators/detection/box_clip_op.cu index 65f2a559071..672b9a5db95 100644 --- a/paddle/fluid/operators/detection/box_clip_op.cu +++ b/paddle/fluid/operators/detection/box_clip_op.cu @@ -12,6 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detection/box_clip_op.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" diff --git a/paddle/fluid/operators/detection/box_clip_op.h b/paddle/fluid/operators/detection/box_clip_op.h index 13ba7894d60..4bcc81dbf98 100644 --- a/paddle/fluid/operators/detection/box_clip_op.h +++ b/paddle/fluid/operators/detection/box_clip_op.h @@ -11,6 +11,7 @@ limitations under the License. 
*/ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/detection/box_coder_op.cc b/paddle/fluid/operators/detection/box_coder_op.cc index 69d829e0021..461dcb7f39a 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cc +++ b/paddle/fluid/operators/detection/box_coder_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/detection/box_coder_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/detection/box_coder_op.cu b/paddle/fluid/operators/detection/box_coder_op.cu index 22dc606df9d..b7dee412ee3 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cu +++ b/paddle/fluid/operators/detection/box_coder_op.cu @@ -11,6 +11,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/detection/box_coder_op.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" diff --git a/paddle/fluid/operators/detection/box_coder_op.h b/paddle/fluid/operators/detection/box_coder_op.h index a626f790fac..6ddfd717653 100644 --- a/paddle/fluid/operators/detection/box_coder_op.h +++ b/paddle/fluid/operators/detection/box_coder_op.h @@ -12,6 +12,7 @@ limitations under the License. */ #pragma once #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.h b/paddle/fluid/operators/detection/box_decoder_and_assign_op.h index d3565f87f33..7eed920fb3d 100644 --- a/paddle/fluid/operators/detection/box_decoder_and_assign_op.h +++ b/paddle/fluid/operators/detection/box_decoder_and_assign_op.h @@ -13,6 +13,7 @@ limitations under the License. 
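The steady drip of '+' blank lines after a file's main include and between the angle-bracket block and the project headers (box_clip_op, box_coder_op, and most files in this section), plus reshuffles like conv_op_xpu.cc moving its own header below the standard ones, matches clang-format's include sorting with regrouping; SortIncludes with IncludeBlocks: Regroup is the likely step-2 configuration, though the patch does not state it. The resulting grouping, shown as shape only (these are real Paddle paths from the hunks, not a compilable unit):

#include "paddle/fluid/operators/detection/box_clip_op.h"  // main header first

#include <algorithm>  // then one block of standard headers

#include "paddle/fluid/framework/op_registry.h"  // then other project headers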
*/ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc index 92c9ab34aa4..b1b8c3ba2da 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License.*/ #include "paddle/fluid/operators/detection/collect_fpn_proposals_op.h" + #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index 860fdd01794..bea6fb17488 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -18,6 +18,7 @@ namespace cub = hipcub; #endif #include + #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memcpy.h" diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h index e5ae9a6ccbd..973cbc6ec16 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h @@ -20,6 +20,7 @@ limitations under the License.*/ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/detection/density_prior_box_op.h b/paddle/fluid/operators/detection/density_prior_box_op.h index adc2723acbf..0912ce90160 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op.h +++ b/paddle/fluid/operators/detection/density_prior_box_op.h @@ -12,6 +12,7 @@ limitations under the License. */ #pragma once #include #include + #include "paddle/fluid/operators/detection/prior_box_op.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc index 4e514e62f40..e382586ec66 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h" + #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu index 7ad25e003b4..5adf1469ec2 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu @@ -21,6 +21,7 @@ namespace cub = hipcub; #endif #include + #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h" diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h index 5479e08c2a5..85db2437ee5 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/detection/generate_mask_labels_op.cc b/paddle/fluid/operators/detection/generate_mask_labels_op.cc index c9cc4e72207..da86502f78c 100644 --- a/paddle/fluid/operators/detection/generate_mask_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_mask_labels_op.cc @@ -10,9 +10,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include #include #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc index cbf17048400..bc528060355 100644 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -10,9 +10,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index d6130823271..a6d2d8a2a01 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu index 5fb7973fd89..20efb1fa6ca 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -14,8 +14,10 @@ limitations under the License. 
*/ #include #include + #include #include + #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memory.h" diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc index 1f1802574c5..b8b6118058f 100644 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu index 005309e8ee5..deb7f3a41df 100644 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu @@ -14,8 +14,10 @@ limitations under the License. */ #include #include + #include #include + #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memory.h" diff --git a/paddle/fluid/operators/detection/gpc.cc b/paddle/fluid/operators/detection/gpc.cc index 6b1b0cd8b35..4dea559d8e4 100644 --- a/paddle/fluid/operators/detection/gpc.cc +++ b/paddle/fluid/operators/detection/gpc.cc @@ -24,6 +24,7 @@ **/ #include "paddle/fluid/operators/detection/gpc.h" + #include "paddle/fluid/platform/enforce.h" namespace gpc { diff --git a/paddle/fluid/operators/detection/locality_aware_nms_op.cc b/paddle/fluid/operators/detection/locality_aware_nms_op.cc index 8cc0ebcab61..3f8bc867418 100644 --- a/paddle/fluid/operators/detection/locality_aware_nms_op.cc +++ b/paddle/fluid/operators/detection/locality_aware_nms_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detection/nms_util.h" @@ -51,16 +52,17 @@ class LocalityAwareNMSOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( box_dims[2] == 4 || box_dims[2] == 8 || box_dims[2] == 16 || box_dims[2] == 24 || box_dims[2] == 32, - true, platform::errors::InvalidArgument( - "The last dimension of Input(BBoxes) must be 4 or 8, " - "represents the layout of coordinate " - "[xmin, ymin, xmax, ymax] or " - "4 points: [x1, y1, x2, y2, x3, y3, x4, y4] or " - "8 points: [xi, yi] i= 1,2,...,8 or " - "12 points: [xi, yi] i= 1,2,...,12 or " - "16 points: [xi, yi] i= 1,2,...,16. " - "But received %d.", - box_dims[2])); + true, + platform::errors::InvalidArgument( + "The last dimension of Input(BBoxes) must be 4 or 8, " + "represents the layout of coordinate " + "[xmin, ymin, xmax, ymax] or " + "4 points: [x1, y1, x2, y2, x3, y3, x4, y4] or " + "8 points: [xi, yi] i= 1,2,...,8 or " + "12 points: [xi, yi] i= 1,2,...,12 or " + "16 points: [xi, yi] i= 1,2,...,16. 
" + "But received %d.", + box_dims[2])); PADDLE_ENFORCE_EQ( box_dims[1], score_dims[2], platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/detection/mask_util.cc b/paddle/fluid/operators/detection/mask_util.cc index e06218cfe56..41505ee8428 100644 --- a/paddle/fluid/operators/detection/mask_util.cc +++ b/paddle/fluid/operators/detection/mask_util.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/detection/mask_util.h" + #include #include + #include "paddle/fluid/memory/memory.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/mask_util.h b/paddle/fluid/operators/detection/mask_util.h index 4e0ea54f6d8..25b03a11f7d 100644 --- a/paddle/fluid/operators/detection/mask_util.h +++ b/paddle/fluid/operators/detection/mask_util.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include namespace paddle { diff --git a/paddle/fluid/operators/detection/mask_util_test.cc b/paddle/fluid/operators/detection/mask_util_test.cc index de904e94746..68f7a6db648 100644 --- a/paddle/fluid/operators/detection/mask_util_test.cc +++ b/paddle/fluid/operators/detection/mask_util_test.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/detection/mask_util.h" + #include + #include "paddle/fluid/memory/memory.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/matrix_nms_op.cc b/paddle/fluid/operators/detection/matrix_nms_op.cc index 3353739b01b..5eee52dfbc7 100644 --- a/paddle/fluid/operators/detection/matrix_nms_op.cc +++ b/paddle/fluid/operators/detection/matrix_nms_op.cc @@ -405,7 +405,6 @@ REGISTER_OPERATOR( REGISTER_OP_CPU_KERNEL(matrix_nms, ops::MatrixNMSKernel, ops::MatrixNMSKernel); REGISTER_OP_VERSION(matrix_nms) - .AddCheckpoint( - R"ROC(Upgrade matrix_nms: add a new output [RoisNum].)ROC", - paddle::framework::compatible::OpVersionDesc().NewOutput( - "RoisNum", "The number of RoIs in each image.")); + .AddCheckpoint(R"ROC(Upgrade matrix_nms: add a new output [RoisNum].)ROC", + paddle::framework::compatible::OpVersionDesc().NewOutput( + "RoisNum", "The number of RoIs in each image.")); diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc index 83cf6e5fd30..f603a501f4b 100644 --- a/paddle/fluid/operators/detection/multiclass_nms_op.cc +++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detection/nms_util.h" @@ -55,18 +56,19 @@ class MultiClassNMSOp : public framework::OperatorWithKernel { ". 
But received rank = %d", box_dims.size())); if (score_size == 3) { - PADDLE_ENFORCE_EQ( - box_dims[2] == 4 || box_dims[2] == 8 || box_dims[2] == 16 || - box_dims[2] == 24 || box_dims[2] == 32, - true, platform::errors::InvalidArgument( - "The last dimension of Input" - "(BBoxes) must be 4 or 8, " - "represents the layout of coordinate " - "[xmin, ymin, xmax, ymax] or " - "4 points: [x1, y1, x2, y2, x3, y3, x4, y4] or " - "8 points: [xi, yi] i= 1,2,...,8 or " - "12 points: [xi, yi] i= 1,2,...,12 or " - "16 points: [xi, yi] i= 1,2,...,16")); + PADDLE_ENFORCE_EQ(box_dims[2] == 4 || box_dims[2] == 8 || + box_dims[2] == 16 || box_dims[2] == 24 || + box_dims[2] == 32, + true, + platform::errors::InvalidArgument( + "The last dimension of Input" + "(BBoxes) must be 4 or 8, " + "represents the layout of coordinate " + "[xmin, ymin, xmax, ymax] or " + "4 points: [x1, y1, x2, y2, x3, y3, x4, y4] or " + "8 points: [xi, yi] i= 1,2,...,8 or " + "12 points: [xi, yi] i= 1,2,...,12 or " + "16 points: [xi, yi] i= 1,2,...,16")); PADDLE_ENFORCE_EQ( box_dims[1], score_dims[2], platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/detection/nms_op.cc b/paddle/fluid/operators/detection/nms_op.cc index f6dc44eb5fc..34a92efa68a 100644 --- a/paddle/fluid/operators/detection/nms_op.cc +++ b/paddle/fluid/operators/detection/nms_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/detection/nms_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/detection/nms_op.cu b/paddle/fluid/operators/detection/nms_op.cu index b6027e67d6c..4f62c735c26 100644 --- a/paddle/fluid/operators/detection/nms_op.cu +++ b/paddle/fluid/operators/detection/nms_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/operators/detection/nms_op.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" diff --git a/paddle/fluid/operators/detection/nms_util.h b/paddle/fluid/operators/detection/nms_util.h index 0e448d42fc2..7a6565ac760 100644 --- a/paddle/fluid/operators/detection/nms_util.h +++ b/paddle/fluid/operators/detection/nms_util.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/operators/detection/poly_util.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/poly_util.cc b/paddle/fluid/operators/detection/poly_util.cc index 1af2c95c6cf..6aa81bf1b39 100644 --- a/paddle/fluid/operators/detection/poly_util.cc +++ b/paddle/fluid/operators/detection/poly_util.cc @@ -16,13 +16,14 @@ limitations under the License. */ #define POLY_UTIL_CC_ #include "paddle/fluid/operators/detection/poly_util.h" + #include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { -using gpc::gpc_polygon_clip; using gpc::gpc_free_polygon; +using gpc::gpc_polygon_clip; template void Array2PointVec(const T*& box, const size_t box_size, diff --git a/paddle/fluid/operators/detection/poly_util.h b/paddle/fluid/operators/detection/poly_util.h index f07baf72d9f..cc37f00008d 100644 --- a/paddle/fluid/operators/detection/poly_util.h +++ b/paddle/fluid/operators/detection/poly_util.h @@ -16,6 +16,7 @@ limitations under the License. 
*/ #define POLY_UTIL_H_ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detection/gpc.h" diff --git a/paddle/fluid/operators/detection/prior_box_op.h b/paddle/fluid/operators/detection/prior_box_op.h index 4000994beb5..889bc8354bc 100644 --- a/paddle/fluid/operators/detection/prior_box_op.h +++ b/paddle/fluid/operators/detection/prior_box_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/transform.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/detection/retinanet_detection_output_op.cc b/paddle/fluid/operators/detection/retinanet_detection_output_op.cc index bc46ec0b656..4e49a6ed852 100644 --- a/paddle/fluid/operators/detection/retinanet_detection_output_op.cc +++ b/paddle/fluid/operators/detection/retinanet_detection_output_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -66,23 +67,26 @@ class RetinanetDetectionOutputOp : public framework::OperatorWithKernel { auto im_info_dims = ctx->GetInputDim("ImInfo"); const size_t b_n = bboxes_dims.size(); - PADDLE_ENFORCE_GT(b_n, 0, platform::errors::InvalidArgument( - "The number of Variables in Input(BBoxes) " - "should be greater than 0, " - "but received number is:%d.", - b_n)); + PADDLE_ENFORCE_GT(b_n, 0, + platform::errors::InvalidArgument( + "The number of Variables in Input(BBoxes) " + "should be greater than 0, " + "but received number is:%d.", + b_n)); const size_t s_n = scores_dims.size(); - PADDLE_ENFORCE_GT(s_n, 0, platform::errors::InvalidArgument( - "The number of Variables in Input(Scores) " - "should be greater than 0, " - "but received number is:%d.", - s_n)); + PADDLE_ENFORCE_GT(s_n, 0, + platform::errors::InvalidArgument( + "The number of Variables in Input(Scores) " + "should be greater than 0, " + "but received number is:%d.", + s_n)); const size_t a_n = anchors_dims.size(); - PADDLE_ENFORCE_GT(a_n, 0, platform::errors::InvalidArgument( - "The number of Variables in Input(Anchors) " - "should be greater than 0, " - "but received number is:%d.", - a_n)); + PADDLE_ENFORCE_GT(a_n, 0, + platform::errors::InvalidArgument( + "The number of Variables in Input(Anchors) " + "should be greater than 0, " + "but received number is:%d.", + a_n)); auto bbox_dims = bboxes_dims[0]; auto score_dims = scores_dims[0]; auto anchor_dims = anchors_dims[0]; diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc index 353d17a6e09..eb6d6c6db92 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -40,8 +41,8 @@ bool GT(T a, T b) { } /* -*check if (x, y) is in the boundary of roi -*/ + *check if (x, y) is in the boundary of roi + */ template bool in_quad(T x, T y, T roi_x[], T roi_y[]) { for (int i = 0; i < 4; i++) { @@ -431,10 +432,9 @@ class CPUROIPerspectiveTransformGradOpKernel : public framework::OpKernel { T matrix[9]; get_transform_matrix(transformed_width, transformed_height, roi_x, roi_y, matrix); - const T* out_grad_ptr = out_grad_data + - (roi_idx * channels + c) * - transformed_height * - transformed_width; + const T* out_grad_ptr = out_grad_data + (roi_idx * channels + c) * + transformed_height * + transformed_width; for (int out_h = 0; out_h < transformed_height; ++out_h) { for (int out_w = 0; out_w < transformed_width; ++out_w) { T src_w; diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu index 515a4bbac59..1bff79606d4 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu @@ -13,13 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" #include "paddle/phi/kernels/funcs/math_function.h" -using paddle::platform::PADDLE_CUDA_NUM_THREADS; using paddle::platform::float16; +using paddle::platform::PADDLE_CUDA_NUM_THREADS; namespace paddle { namespace operators { @@ -56,8 +57,8 @@ __device__ T min(T a, T b) { } /* -* check if (x, y) is in the boundary of roi -*/ + * check if (x, y) is in the boundary of roi + */ template __device__ bool in_quad(T x, T y, T roi_x[], T roi_y[]) { for (int i = 0; i < 4; i++) { diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc index e96c0bbc272..b636decdfbf 100644 --- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc +++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc index 8526f1762cd..31f3dab81fe 100644 --- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc +++ b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/detection/sigmoid_focal_loss_op.h" + #include #include #include diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h index 51829595863..fcb7ec1fbfe 100644 --- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h +++ b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc index 35e38909017..ae7dfe0dd66 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cc +++ b/paddle/fluid/operators/detection/yolo_box_op.cc @@ -36,10 +36,11 @@ class YoloBoxOp : public framework::OperatorWithKernel { auto iou_aware = ctx->Attrs().Get("iou_aware"); auto iou_aware_factor = ctx->Attrs().Get("iou_aware_factor"); - PADDLE_ENFORCE_EQ(dim_x.size(), 4, platform::errors::InvalidArgument( - "Input(X) should be a 4-D tensor." - "But received X dimension(%s)", - dim_x.size())); + PADDLE_ENFORCE_EQ( + dim_x.size(), 4, + platform::errors::InvalidArgument("Input(X) should be a 4-D tensor." + "But received X dimension(%s)", + dim_x.size())); if (iou_aware) { PADDLE_ENFORCE_EQ( dim_x[1], anchor_num * (6 + class_num), @@ -245,11 +246,10 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, YoloBoxInferShapeFunctor); -REGISTER_OP_VERSION(yolo_box) - .AddCheckpoint( - R"ROC( +REGISTER_OP_VERSION(yolo_box).AddCheckpoint( + R"ROC( Upgrade yolo box to add new attribute [iou_aware, iou_aware_factor]. )ROC", - paddle::framework::compatible::OpVersionDesc() - .NewAttr("iou_aware", "Whether use iou aware", false) - .NewAttr("iou_aware_factor", "iou aware factor", 0.5f)); + paddle::framework::compatible::OpVersionDesc() + .NewAttr("iou_aware", "Whether use iou aware", false) + .NewAttr("iou_aware_factor", "iou aware factor", 0.5f)); diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.cc b/paddle/fluid/operators/detection/yolov3_loss_op.cc index 21044734ca8..2170fd0639f 100644 --- a/paddle/fluid/operators/detection/yolov3_loss_op.cc +++ b/paddle/fluid/operators/detection/yolov3_loss_op.cc @@ -10,6 +10,7 @@ limitations under the License. */ #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/imperative/type_defs.h" diff --git a/paddle/fluid/operators/detection_map_op.cc b/paddle/fluid/operators/detection_map_op.cc index 588967f0832..aa4695cc975 100644 --- a/paddle/fluid/operators/detection_map_op.cc +++ b/paddle/fluid/operators/detection_map_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/detection_map_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/detection_map_op.h b/paddle/fluid/operators/detection_map_op.h index 4dd41837f06..a034572a0c4 100644 --- a/paddle/fluid/operators/detection_map_op.h +++ b/paddle/fluid/operators/detection_map_op.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/determinant_op.cc b/paddle/fluid/operators/determinant_op.cc index 6959b5cf811..ec5a51bbffa 100644 --- a/paddle/fluid/operators/determinant_op.cc +++ b/paddle/fluid/operators/determinant_op.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/operators/determinant_op.h" + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/backward.h" diff --git a/paddle/fluid/operators/determinant_op.h b/paddle/fluid/operators/determinant_op.h index 702ff3bfd87..d4c05b631e3 100644 --- a/paddle/fluid/operators/determinant_op.h +++ b/paddle/fluid/operators/determinant_op.h @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/for_range.h" diff --git a/paddle/fluid/operators/dgc_clip_by_norm_op.cc b/paddle/fluid/operators/dgc_clip_by_norm_op.cc index 85a29271b13..f60380f0475 100644 --- a/paddle/fluid/operators/dgc_clip_by_norm_op.cc +++ b/paddle/fluid/operators/dgc_clip_by_norm_op.cc @@ -10,10 +10,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include - #include "paddle/fluid/operators/dgc_clip_by_norm_op.h" +#include + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/dgc_op.cc b/paddle/fluid/operators/dgc_op.cc index 5fe66fa38a8..95d3f75de9a 100644 --- a/paddle/fluid/operators/dgc_op.cc +++ b/paddle/fluid/operators/dgc_op.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/dgc_op.h" + #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/dgc_op.h b/paddle/fluid/operators/dgc_op.h index b1bf5e27781..91093f67e05 100644 --- a/paddle/fluid/operators/dgc_op.h +++ b/paddle/fluid/operators/dgc_op.h @@ -14,8 +14,8 @@ limitations under the License. 
*/ #pragma once #include -#include "dgc/dgc.h" +#include "dgc/dgc.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" @@ -118,10 +118,12 @@ class DGCOpKernel : public framework::OpKernel { 1 - get_period_sparcity( sparsity, static_cast(*current_step - rampup_begin_step), rampup_step); - PADDLE_ENFORCE_GE(ratio, 0.0, platform::errors::InvalidArgument( - "DGC sparsity ratio must >= 0")); - PADDLE_ENFORCE_LT(ratio, 1.0, platform::errors::InvalidArgument( - "DGC sparsity ratio must < 1")); + PADDLE_ENFORCE_GE( + ratio, 0.0, + platform::errors::InvalidArgument("DGC sparsity ratio must be >= 0")); + PADDLE_ENFORCE_LT( + ratio, 1.0, + platform::errors::InvalidArgument("DGC sparsity ratio must be < 1")); int k = static_cast(g->numel() * ratio); VLOG(10) << "m:" << m << ", use_nesterov:" << use_nesterov diff --git a/paddle/fluid/operators/diag_embed_op.cu b/paddle/fluid/operators/diag_embed_op.cu index 7e3ab6be664..a9d92fdf634 100644 --- a/paddle/fluid/operators/diag_embed_op.cu +++ b/paddle/fluid/operators/diag_embed_op.cu @@ -14,6 +14,7 @@ #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/diag_embed_op.h" diff --git a/paddle/fluid/operators/diag_embed_op.h b/paddle/fluid/operators/diag_embed_op.h index a5621be3baa..b07047996d5 100644 --- a/paddle/fluid/operators/diag_embed_op.h +++ b/paddle/fluid/operators/diag_embed_op.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/for_range.h" diff --git a/paddle/fluid/operators/dirichlet_op.h b/paddle/fluid/operators/dirichlet_op.h index 540acad423a..658688816eb 100644 --- a/paddle/fluid/operators/dirichlet_op.h +++ b/paddle/fluid/operators/dirichlet_op.h @@ -15,6 +15,7 @@ #pragma once #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/for_range.h" diff --git a/paddle/fluid/operators/dist_op.cc b/paddle/fluid/operators/dist_op.cc index 55b24849412..6f897bff75c 100644 --- a/paddle/fluid/operators/dist_op.cc +++ b/paddle/fluid/operators/dist_op.cc @@ -14,6 +14,7 @@ #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/dlnne/dlnne_engine_op.h b/paddle/fluid/operators/dlnne/dlnne_engine_op.h index 6b2622366fe..857f295326b 100644 --- a/paddle/fluid/operators/dlnne/dlnne_engine_op.h +++ b/paddle/fluid/operators/dlnne/dlnne_engine_op.h @@ -13,11 +13,11 @@ // limitations under the License. 
#pragma once +#include #include // NOTLINT #include // NOTLINT #include // NOTLINT -#include #include #include #include @@ -128,11 +128,13 @@ class DlnneEngineOp : public framework::OperatorBase { << ".onnx"; builder = dl::nne::CreateInferBuilder(); - PADDLE_ENFORCE_NE(builder, nullptr, platform::errors::Unavailable( - "nne create builder failed")); + PADDLE_ENFORCE_NE( + builder, nullptr, + platform::errors::Unavailable("nne create builder failed")); parser = dl::nne::CreateParser(); - PADDLE_ENFORCE_NE(parser, nullptr, platform::errors::Unavailable( - "nne create parser failed")); + PADDLE_ENFORCE_NE( + parser, nullptr, + platform::errors::Unavailable("nne create parser failed")); network = builder->CreateNetwork(); diff --git a/paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc b/paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc index 611366f6c5b..8e1d7fe5d81 100644 --- a/paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc +++ b/paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/operators/dlnne/dlnne_engine_op.h" + #include + #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_desc.h" diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index 482f88b73e6..c40f6c0bbae 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -19,11 +19,13 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #include + #include "paddle/fluid/platform/dynload/curand.h" #endif #ifdef PADDLE_WITH_HIP #include #include + #include "paddle/fluid/platform/dynload/hiprand.h" #endif diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc index 8d033ea3194..9426efa4942 100644 --- a/paddle/fluid/operators/dropout_op.cc +++ b/paddle/fluid/operators/dropout_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/infermeta/binary.h" diff --git a/paddle/fluid/operators/dropout_op_xpu.cc b/paddle/fluid/operators/dropout_op_xpu.cc index 851f26ee0e7..24de99d6d8f 100644 --- a/paddle/fluid/operators/dropout_op_xpu.cc +++ b/paddle/fluid/operators/dropout_op_xpu.cc @@ -11,6 +11,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { diff --git a/paddle/fluid/operators/edit_distance_op.cc b/paddle/fluid/operators/edit_distance_op.cc index db8a107290e..8127895569f 100644 --- a/paddle/fluid/operators/edit_distance_op.cc +++ b/paddle/fluid/operators/edit_distance_op.cc @@ -37,12 +37,13 @@ class EditDistanceOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( hyp_dims.size() == 2 && ref_dims.size() == 2 && hyp_dims[0] == ref_dims[0], - true, platform::errors::InvalidArgument( - "Input(Hyps) and Input(Refs) must be 2-D Tensors with " - "identical first dimension. But received Input(Hyps): " - "input rank %u, input shape [%s]; received Input(Refs): " - "input rank %u, input shape [%s]", - hyp_dims.size(), hyp_dims, ref_dims.size(), ref_dims)); + true, + platform::errors::InvalidArgument( + "Input(Hyps) and Input(Refs) must be 2-D Tensors with " + "identical first dimension. 
But received Input(Hyps): " + "input rank %u, input shape [%s]; received Input(Refs): " + "input rank %u, input shape [%s]", + hyp_dims.size(), hyp_dims, ref_dims.size(), ref_dims)); PADDLE_ENFORCE_EQ( hyp_length_dims[0] == ref_length_dims[0] && hyp_length_dims[0] == hyp_dims[0], diff --git a/paddle/fluid/operators/edit_distance_op.cu b/paddle/fluid/operators/edit_distance_op.cu index 49ac7183ff3..eb208c559ce 100644 --- a/paddle/fluid/operators/edit_distance_op.cu +++ b/paddle/fluid/operators/edit_distance_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/edit_distance_op.h" diff --git a/paddle/fluid/operators/edit_distance_op.h b/paddle/fluid/operators/edit_distance_op.h index ef290c2eff2..101e3a90b80 100644 --- a/paddle/fluid/operators/edit_distance_op.h +++ b/paddle/fluid/operators/edit_distance_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/eig_op.cc b/paddle/fluid/operators/eig_op.cc index 6f1737dba81..5239248d82f 100644 --- a/paddle/fluid/operators/eig_op.cc +++ b/paddle/fluid/operators/eig_op.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "paddle/fluid/operators/eig_op.h" + #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -32,10 +34,11 @@ class EigOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("X"); int rank = x_dims.size(); - PADDLE_ENFORCE_GE(rank, 2, platform::errors::InvalidArgument( - "Expects input tensor x to be not less than " - "2 dimentions, but got dimention %d", - rank)); + PADDLE_ENFORCE_GE(rank, 2, + platform::errors::InvalidArgument( + "Expects input tensor x to be not less than " + "2 dimensions, but got dimension %d", + rank)); PADDLE_ENFORCE_EQ(x_dims[rank - 2], x_dims[rank - 1], platform::errors::InvalidArgument( "The input matrix must be a square matrix, " diff --git a/paddle/fluid/operators/eig_op.h b/paddle/fluid/operators/eig_op.h index fe898a6c41c..0f9afae8267 100644 --- a/paddle/fluid/operators/eig_op.h +++ b/paddle/fluid/operators/eig_op.h @@ -15,8 +15,10 @@ #pragma once #include + #include #include + #include "paddle/fluid/operators/math/matrix_solve.h" #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/for_range.h" diff --git a/paddle/fluid/operators/eigvals_op.cc b/paddle/fluid/operators/eigvals_op.cc index 2ef591dd26a..177dc684662 100644 --- a/paddle/fluid/operators/eigvals_op.cc +++ b/paddle/fluid/operators/eigvals_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/eigvals_op.h" + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/eigvals_op.h b/paddle/fluid/operators/eigvals_op.h index 4627acc0d07..d75b33e0857 100644 --- a/paddle/fluid/operators/eigvals_op.h +++ b/paddle/fluid/operators/eigvals_op.h @@ -16,6 +16,7 @@ #include #include + #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/allocation/allocator.h" @@ -71,14 +72,16 @@ static void SpiltBatchSquareMatrix(const Tensor& input, } static void CheckLapackEigResult(const int info, const std::string& name) { - PADDLE_ENFORCE_LE(info, 0, platform::errors::PreconditionNotMet( - "The QR algorithm failed to compute all the " - "eigenvalues in function %s.", - name.c_str())); + PADDLE_ENFORCE_LE(info, 0, + platform::errors::PreconditionNotMet( + "The QR algorithm failed to compute all the " + "eigenvalues in function %s.", + name.c_str())); PADDLE_ENFORCE_GE( - info, 0, platform::errors::InvalidArgument( - "The %d-th argument has an illegal value in function %s.", - -info, name.c_str())); + info, 0, + platform::errors::InvalidArgument( + "The %d-th argument has an illegal value in function %s.", -info, + name.c_str())); } template diff --git a/paddle/fluid/operators/einsum_op.cc b/paddle/fluid/operators/einsum_op.cc index 6da0045443c..7fc19d6913f 100644 --- a/paddle/fluid/operators/einsum_op.cc +++ b/paddle/fluid/operators/einsum_op.cc @@ -14,6 +14,7 @@ #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc index 22a5de4c609..9c1a84ba8b6 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc @@ -15,8 +15,8 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include #include -#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_xpu.h" #include "paddle/fluid/platform/device/device_wrapper.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cc b/paddle/fluid/operators/elementwise/elementwise_div_op.cc index 13fd9b81a87..e0523a26ee3 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_div_op.h" + #include #include diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h index e9adb9abdb5..b3363862d5f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include + #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_heaviside_op.cc b/paddle/fluid/operators/elementwise/elementwise_heaviside_op.cc index e003a43b5c5..ebdebb2f485 100644 --- a/paddle/fluid/operators/elementwise/elementwise_heaviside_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_heaviside_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "paddle/fluid/operators/elementwise/elementwise_op.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_mlu.h b/paddle/fluid/operators/elementwise/elementwise_mlu.h index ff1e12103be..8c230c5f47b 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mlu.h +++ b/paddle/fluid/operators/elementwise/elementwise_mlu.h @@ -16,6 +16,7 @@ #ifdef PADDLE_WITH_MLU #include + #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_mod_op_xpu.cc index 156589384c0..19d28301ffb 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mod_op_xpu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mod_op_xpu.cc @@ -15,11 +15,11 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include #include + #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/common_infer_shape_functions.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" - #include "paddle/fluid/operators/elementwise/elementwise_xpu.h" #include "paddle/fluid/platform/device/device_wrapper.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc index 45b6f7cb391..253014a7981 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" + #include #include + #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/platform/complex.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index e2dd0e36d40..39045bf0d59 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -15,9 +15,9 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/platform/cpu_info.h" - #include "paddle/phi/kernels/elementwise_kernel.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 80b07721f0b..476b891bb41 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -28,7 +28,6 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_functor.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/transform.h" - #include "paddle/phi/api/lib/utils/tensor_utils.h" #include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/kernels/cpu/elementwise_grad.h" @@ -60,14 +59,14 @@ namespace paddle { namespace operators { /* -* Pack input and output tensors into respective vectors with -* consideration of varible X`s class type. -* Input variable X is supported to be whether LoDTensor or -* SelectedRows class type in this package function, once X -* was SelectedRows type, a valid pointer x_for_selectedrows -* is excepted to be passed in from op kernel for acquisition -* of the valid address of LoDTensor created ahead in the function. -*/ + * Pack input and output tensors into respective vectors with + * consideration of varible X`s class type. + * Input variable X is supported to be whether LoDTensor or + * SelectedRows class type in this package function, once X + * was SelectedRows type, a valid pointer x_for_selectedrows + * is excepted to be passed in from op kernel for acquisition + * of the valid address of LoDTensor created ahead in the function. + */ template int PackTensorsIntoVector(const framework::ExecutionContext &ctx, std::vector *ins, @@ -327,10 +326,11 @@ static void FusedElemwiseAndActBroadcast1CUDA(gpuStream_t stream, const T *x, T *intermediate_out) { int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, w); int gird_size = h; - FusedElemwiseAndActBroadcast1CUDAKernel< - T, CompoundFunctor, BcastY, KeepIntermediateOut, - SameShapeOfIntermediateOutAndOut><<>>( - x, y, h, w, compound_functor, out, intermediate_out); + FusedElemwiseAndActBroadcast1CUDAKernel + <<>>(x, y, h, w, compound_functor, out, + intermediate_out); } template <<>>( - x, y, compound_functor, pre, n, post, out, intermediate_out); + FusedElemwiseAndActBroadcast2CUDAKernel + <<>>(x, y, compound_functor, pre, n, + post, out, intermediate_out); } #endif @@ -544,8 +545,9 @@ void FusedElemwiseAndActGradComputeNoBroadcast( out->data(), dout->data(), dx_op, dy_op, dintermediate_op, dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace()), - dintermediate == nullptr ? nullptr : dintermediate->mutable_data( - ctx.GetPlace())}); + dintermediate == nullptr + ? nullptr + : dintermediate->mutable_data(ctx.GetPlace())}); } template <<>>( - x, y, intermediate_out, out, dout, h, w, dx_op, dy_op, dintermediate_op, - dx, dy, d_intermediate); + FusedElemwiseAndActGradBroadcast1CUDAKernel + <<>>(x, y, intermediate_out, out, dout, h, w, + dx_op, dy_op, dintermediate_op, dx, dy, + d_intermediate); } template <<>>( - x, y, intermediate_out, out, dout, pre, n, post, dx_op, dy_op, - dintermediate_op, dx, dy, dintermediate); + FusedElemwiseAndActGradBroadcast2CUDAKernel + <<>>( + x, y, intermediate_out, out, dout, pre, n, post, dx_op, dy_op, + dintermediate_op, dx, dy, dintermediate); } #endif @@ -995,8 +996,9 @@ void FusedElemwiseAndActGradComputeWithBroadcast( out->data(), dout->data(), h, w, dx_op, dy_op, dintermediate_op, dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace()), - dintermediate == nullptr ? nullptr : dintermediate->mutable_data( - ctx.GetPlace())); + dintermediate == nullptr + ? 
nullptr + : dintermediate->mutable_data(ctx.GetPlace())); #endif } else { FusedElemwiseAndActGradBroadcast1CPUdata(), dout->data(), h, w, dx_op, dy_op, dintermediate_op, dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace()), - dintermediate == nullptr ? nullptr : dintermediate->mutable_data( - ctx.GetPlace())); + dintermediate == nullptr + ? nullptr + : dintermediate->mutable_data(ctx.GetPlace())); } } else { if (platform::is_gpu_place(ctx.GetPlace())) { @@ -1022,8 +1025,9 @@ void FusedElemwiseAndActGradComputeWithBroadcast( dintermediate_op, dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace()), - dintermediate == nullptr ? nullptr : dintermediate->mutable_data( - ctx.GetPlace())); + dintermediate == nullptr + ? nullptr + : dintermediate->mutable_data(ctx.GetPlace())); #endif } else { FusedElemwiseAndActGradBroadcast2CPUmutable_data(ctx.GetPlace()), dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace()), - dintermediate == nullptr ? nullptr : dintermediate->mutable_data( - ctx.GetPlace())); + dintermediate == nullptr + ? nullptr + : dintermediate->mutable_data(ctx.GetPlace())); } } } diff --git a/paddle/fluid/operators/elementwise/elementwise_xpu.h b/paddle/fluid/operators/elementwise/elementwise_xpu.h index db5c94b9d1a..3f38450581e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_xpu.h +++ b/paddle/fluid/operators/elementwise/elementwise_xpu.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/place.h" #include "xpu/refactor/math.h" @@ -32,8 +33,9 @@ void XPUElementwise( const std::vector&, const std::vector&)> func) { auto x_var = ctx.InputVar("X"); - PADDLE_ENFORCE_NE(x_var, nullptr, platform::errors::InvalidArgument( - "Cannot get input Variable X")); + PADDLE_ENFORCE_NE( + x_var, nullptr, + platform::errors::InvalidArgument("Cannot get input Variable X")); PADDLE_ENFORCE_EQ( x_var->IsType(), true, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc b/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc index 3cecc52a3c4..f647bd91d5f 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc @@ -18,6 +18,7 @@ #include #include #include + #include "gtest/gtest.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" @@ -72,11 +73,12 @@ class TestElementwiseDivGradGradWithoutDout std::unique_ptr CreateTestOp() override { auto op = framework::OpRegistry::CreateOp( - this->op_type_, {{"Y", {"Y"}}, - {"Out", {"Out"}}, - {"DDX", {"DDX"}}, - {"DDY", {"DDY"}}, - {"DX", {"DX"}}}, + this->op_type_, + {{"Y", {"Y"}}, + {"Out", {"Out"}}, + {"DDX", {"DDX"}}, + {"DDY", {"DDY"}}, + {"DX", {"DX"}}}, {{"Y@GRAD", {"Y@GRAD"}}, {"DDOut", {"DDOut"}}}, {{"use_mkldnn", false}, {"axis", 0}}); return op; diff --git a/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h b/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h index 05f87e5465a..7defe4e5793 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h +++ b/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h @@ -21,6 +21,7 @@ #include #include #include + #include "paddle/fluid/framework/lod_tensor.h" #include 
"paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/operators/empty_op.cc b/paddle/fluid/operators/empty_op.cc index 9e0e4e7fe1c..0f6c308b211 100644 --- a/paddle/fluid/operators/empty_op.cc +++ b/paddle/fluid/operators/empty_op.cc @@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/op_registry.h" - #include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/infermeta/nullary.h" namespace paddle { diff --git a/paddle/fluid/operators/expand_as_op.cc b/paddle/fluid/operators/expand_as_op.cc index 093c4d8f793..cace8b5fdff 100644 --- a/paddle/fluid/operators/expand_as_op.cc +++ b/paddle/fluid/operators/expand_as_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/expand_as_op.h" + #include #include diff --git a/paddle/fluid/operators/expand_as_v2_op.cc b/paddle/fluid/operators/expand_as_v2_op.cc old mode 100755 new mode 100644 index 9361edd43bf..8cdab4c5e1a --- a/paddle/fluid/operators/expand_as_v2_op.cc +++ b/paddle/fluid/operators/expand_as_v2_op.cc @@ -10,8 +10,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/expand_as_v2_op.h" + #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/infermeta/binary.h" @@ -107,7 +109,6 @@ REGISTER_OPERATOR(expand_as_v2_grad, ops::ExpandAsV2GradOp, ops::ExpandAsV2GradNoNeedBufVarsInferer); REGISTER_OP_VERSION(expand_as_v2) - .AddCheckpoint( - R"ROC(fix expand_as_v2 and add new input [Y])ROC", - paddle::framework::compatible::OpVersionDesc().NewInput( - "Y", "Expand X according to the shape of Y")); + .AddCheckpoint(R"ROC(fix expand_as_v2 and add new input [Y])ROC", + paddle::framework::compatible::OpVersionDesc().NewInput( + "Y", "Expand X according to the shape of Y")); diff --git a/paddle/fluid/operators/expand_as_v2_op_npu.cc b/paddle/fluid/operators/expand_as_v2_op_npu.cc index 67d95e12400..28fd922d77b 100644 --- a/paddle/fluid/operators/expand_as_v2_op_npu.cc +++ b/paddle/fluid/operators/expand_as_v2_op_npu.cc @@ -30,10 +30,11 @@ class ExpandAsV2NPUKernel : public framework::OpKernel { "expand_as_v2 op must be greater than or equal to " "the rank (%d) of the input 'x'.", target_rank, rank)); - PADDLE_ENFORCE_GE(rank, 1, platform::errors::InvalidArgument( - "The rank (%d) of the input 'x' for " - "expand_as_v2 op must be positive.", - rank)); + PADDLE_ENFORCE_GE( + rank, 1, + platform::errors::InvalidArgument("The rank (%d) of the input 'x' for " + "expand_as_v2 op must be positive.", + rank)); PADDLE_ENFORCE_LE(target_rank, MAX_RANK_SUPPORTED, platform::errors::InvalidArgument( "The rank (%d) of the input 'target_tensor' for " diff --git a/paddle/fluid/operators/expand_as_v2_op_xpu.cc b/paddle/fluid/operators/expand_as_v2_op_xpu.cc index 0912b280aa6..fc3d77f3cc8 100644 --- a/paddle/fluid/operators/expand_as_v2_op_xpu.cc +++ b/paddle/fluid/operators/expand_as_v2_op_xpu.cc @@ -33,10 +33,11 @@ class ExpandAsV2XPUKernel : public framework::OpKernel { "expand_as_v2 op must be greater than or equal to " "the rank (%d) of the input 'x'.", target_rank, rank)); - 
PADDLE_ENFORCE_GE(rank, 1, platform::errors::InvalidArgument( - "The rank (%d) of the input 'x' for " - "expand_as_v2 op must be positive.", - rank)); + PADDLE_ENFORCE_GE( + rank, 1, + platform::errors::InvalidArgument("The rank (%d) of the input 'x' for " + "expand_as_v2 op must be positive.", + rank)); PADDLE_ENFORCE_LE(target_rank, MAX_RANK_SUPPORTED, platform::errors::InvalidArgument( "The rank (%d) of the input 'target_tensor' for " diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc index e45761112d4..04cdbd5a606 100644 --- a/paddle/fluid/operators/expand_op.cc +++ b/paddle/fluid/operators/expand_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/expand_op.h" + #include #include #include diff --git a/paddle/fluid/operators/expand_op.h b/paddle/fluid/operators/expand_op.h index 05cd893b057..880adad743f 100644 --- a/paddle/fluid/operators/expand_op.h +++ b/paddle/fluid/operators/expand_op.h @@ -213,12 +213,13 @@ class ExpandGradKernel : public framework::OpKernel { framework::TensorCopy(*in0, context.GetPlace(), context.device_context(), out0); } else { - PADDLE_ENFORCE_GE(dims, 1, platform::errors::InvalidArgument( - "The number of dimensions of the input " - "'Out@GRAD' for Op(expand_grad)" - " must be greater than or equal to 1, but " - "the value received is %d.", - dims)); + PADDLE_ENFORCE_GE(dims, 1, + platform::errors::InvalidArgument( + "The number of dimensions of the input " + "'Out@GRAD' for Op(expand_grad)" + " must be greater than or equal to 1, but " + "the value received is %d.", + dims)); PADDLE_ENFORCE_LE(dims, MAX_RANK_SUPPORTED, platform::errors::InvalidArgument( "The number of dimensions of the input 'Out@GRAD' " diff --git a/paddle/fluid/operators/expand_v2_op.cc b/paddle/fluid/operators/expand_v2_op.cc index 292f706cb18..6aeea745911 100644 --- a/paddle/fluid/operators/expand_v2_op.cc +++ b/paddle/fluid/operators/expand_v2_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/expand_v2_op.h" + #include #include #include diff --git a/paddle/fluid/operators/expand_v2_op_npu.cc b/paddle/fluid/operators/expand_v2_op_npu.cc index c9fe19fd091..c64bdabf599 100644 --- a/paddle/fluid/operators/expand_v2_op_npu.cc +++ b/paddle/fluid/operators/expand_v2_op_npu.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/expand_v2_op.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/expand_v2_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/expand_v2_op_xpu.cc b/paddle/fluid/operators/expand_v2_op_xpu.cc index cb2165c4e92..3d010c964bc 100644 --- a/paddle/fluid/operators/expand_v2_op_xpu.cc +++ b/paddle/fluid/operators/expand_v2_op_xpu.cc @@ -13,8 +13,8 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/expand_v2_op.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/expand_v2_op.h" namespace paddle { namespace operators { @@ -110,10 +110,11 @@ class ExpandV2XPUKernel : public framework::OpKernel { r = xpu::broadcast(dev_ctx.x_context(), x_data, out_data, x_shape, out_shape); } - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( - "XPU API(broadcast) return wrong " - "value[%d %s] in ExpandV2XPUKernel.", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(broadcast) return wrong " + "value[%d %s] in ExpandV2XPUKernel.", + r, XPUAPIErrorMsg[r])); } }; diff --git a/paddle/fluid/operators/fake_dequantize_op.cc b/paddle/fluid/operators/fake_dequantize_op.cc index 8172f441e64..5a3a1cf53de 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cc +++ b/paddle/fluid/operators/fake_dequantize_op.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fake_dequantize_op.h" + #include #include + #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/fake_dequantize_op.cu.h b/paddle/fluid/operators/fake_dequantize_op.cu.h index 9859dd4607c..50f772ec45d 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cu.h +++ b/paddle/fluid/operators/fake_dequantize_op.cu.h @@ -119,10 +119,10 @@ struct ChannelDequantizeFunctor { quant_stride *= in_dims[i]; } - DequantizeOneScaleQuantAxisN< - T><<>>( - in_data, scale_factor, max_range, num, in_dims[quant_axis], - quant_stride, out_data); + DequantizeOneScaleQuantAxisN + <<>>( + in_data, scale_factor, max_range, num, in_dims[quant_axis], + quant_stride, out_data); } else if (scale_num == 2) { // Not need to consider quant_axis int num = in->numel(); diff --git a/paddle/fluid/operators/fake_dequantize_op.h b/paddle/fluid/operators/fake_dequantize_op.h index aad2c2c7d98..e623a638922 100644 --- a/paddle/fluid/operators/fake_dequantize_op.h +++ b/paddle/fluid/operators/fake_dequantize_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/ddim.h" diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc index ac72f23d46e..855c78d2998 100644 --- a/paddle/fluid/operators/fake_quantize_op.cc +++ b/paddle/fluid/operators/fake_quantize_op.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fake_quantize_op.h" + #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/transform.h" @@ -832,7 +834,7 @@ REGISTER_OP_VERSION(moving_average_abs_max_scale) "Delete output in order to make the inference model not " "save moving_average_abs_max_scale operator. 
This will " "make the quantitative model be correctly applied in inference.")) - .AddCheckpoint( - R"ROC(Incompatible upgrade of output [Out])ROC", - paddle::framework::compatible::OpVersionDesc().NewOutput( - "Out", "In order to support dygraph qat, add output again.")); + .AddCheckpoint(R"ROC(Incompatible upgrade of output [Out])ROC", + paddle::framework::compatible::OpVersionDesc().NewOutput( + "Out", + "In order to support dygraph qat, add output again.")); diff --git a/paddle/fluid/operators/fake_quantize_op.cu.h b/paddle/fluid/operators/fake_quantize_op.cu.h index a6130c272d7..580521183cb 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu.h +++ b/paddle/fluid/operators/fake_quantize_op.cu.h @@ -17,6 +17,7 @@ limitations under the License. */ #endif // PADDLE_FLUID_OPERATORS_FAKE_QUANTIZE_OP_CU_H_ #include + #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/fake_quantize_op.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" @@ -80,10 +81,10 @@ struct FindAbsMaxFunctor { framework::Tensor max; T* max_data = max.mutable_data(phi::make_ddim({grid}), ctx.GetPlace()); - FindAbsMaxKernel<<>>( - in, num, max_data); - FindAbsMaxKernel<<<1, block, 1024 * sizeof(T), ctx.stream()>>>( - max_data, grid, out); + FindAbsMaxKernel + <<>>(in, num, max_data); + FindAbsMaxKernel + <<<1, block, 1024 * sizeof(T), ctx.stream()>>>(max_data, grid, out); } }; @@ -176,9 +177,9 @@ struct FindChannelAbsMaxFunctor { int cout = in_dims[0]; int grid = cout; int block = 1024; - FindChannelAbsMaxKernelQuantAxis0< - T><<>>( - in_data, num, cout, out_abs_max); + FindChannelAbsMaxKernelQuantAxis0 + <<>>(in_data, num, cout, + out_abs_max); } else if (quant_axis == 1) { int cin = in_dims[0]; int cout = in_dims[1]; @@ -193,17 +194,17 @@ struct FindChannelAbsMaxFunctor { for (int i = 0; i < cin / max_threads; i++) { int block = max_threads; - FindChannelAbsMaxKernelQuantAxis1< - T><<>>( - in_data, num, cin, cout, out_abs_max); + FindChannelAbsMaxKernelQuantAxis1 + <<>>( + in_data, num, cin, cout, out_abs_max); in_data += num / cin; } int block = cin % max_threads; if (block > 0) { - FindChannelAbsMaxKernelQuantAxis1< - T><<>>( - in_data, num, in_dims[0], in_dims[1], out_abs_max); + FindChannelAbsMaxKernelQuantAxis1 + <<>>( + in_data, num, in_dims[0], in_dims[1], out_abs_max); } } } @@ -549,16 +550,16 @@ struct ChannelClipFakeQuantDequantFunctor { if (quant_axis == 0) { int grid = in_dims[0]; int block = 1024; - ChannelClipAndQuantDequantKernelQuantAxis0< - T><<>>(in_data, scale_data, bin_cnt, - num, in_dims[0], out_data); + ChannelClipAndQuantDequantKernelQuantAxis0 + <<>>(in_data, scale_data, bin_cnt, num, + in_dims[0], out_data); } else if (quant_axis == 1) { int grid = in_dims[0] * in_dims[1]; int block = 1024; - ChannelClipAndQuantDequantKernelQuantAxis1< - T><<>>( - in_data, scale_data, bin_cnt, num, in_dims[0], in_dims[1], out_data); + ChannelClipAndQuantDequantKernelQuantAxis1 + <<>>(in_data, scale_data, bin_cnt, num, + in_dims[0], in_dims[1], out_data); } } }; diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h index dc3f081cc9e..182db11ed84 100644 --- a/paddle/fluid/operators/fake_quantize_op.h +++ b/paddle/fluid/operators/fake_quantize_op.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index 6e646f0d4bf..68ef8f3c2be 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fc_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/fc_op.h b/paddle/fluid/operators/fc_op.h index 47c71286035..1c76c2c36b8 100644 --- a/paddle/fluid/operators/fc_op.h +++ b/paddle/fluid/operators/fc_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/fc_functor.h" diff --git a/paddle/fluid/operators/fill_any_like_op_xpu.cc b/paddle/fluid/operators/fill_any_like_op_xpu.cc index ec4ba6e926c..a07fbe5a7a5 100644 --- a/paddle/fluid/operators/fill_any_like_op_xpu.cc +++ b/paddle/fluid/operators/fill_any_like_op_xpu.cc @@ -15,7 +15,6 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/framework/op_registry.h" - #include "paddle/phi/kernels/full_kernel.h" namespace paddle { diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index 07593a70f05..d6726b99813 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fill_constant_op.h" + #include + #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fill_constant_op_npu.cc b/paddle/fluid/operators/fill_constant_op_npu.cc index edd8613ba52..a121eb8cc84 100644 --- a/paddle/fluid/operators/fill_constant_op_npu.cc +++ b/paddle/fluid/operators/fill_constant_op_npu.cc @@ -84,9 +84,10 @@ class FillConstantNPUKernel : public framework::OpKernel { const auto &dev_ctx = ctx.template device_context(); auto op_func = [&shape, &value]( - const std::vector &inputs, const std::vector &outputs, - const NPUAttributeMap &attrs, - const platform::NPUDeviceContext &dev_ctx) { + const std::vector &inputs, + const std::vector &outputs, + const NPUAttributeMap &attrs, + const platform::NPUDeviceContext &dev_ctx) { Tensor tensor_value; tensor_value.mutable_data({1}, dev_ctx.GetPlace()); FillNpuTensorWithConstant(&tensor_value, diff --git a/paddle/fluid/operators/fill_diagonal_tensor_op.h b/paddle/fluid/operators/fill_diagonal_tensor_op.h index ebb980b66af..5bee72f5268 100644 --- a/paddle/fluid/operators/fill_diagonal_tensor_op.h +++ b/paddle/fluid/operators/fill_diagonal_tensor_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/fill_op.cc b/paddle/fluid/operators/fill_op.cc index 521ddd4ec12..e934b794f8b 100644 --- a/paddle/fluid/operators/fill_op.cc +++ b/paddle/fluid/operators/fill_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/fill_op.h" + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/fill_op.h b/paddle/fluid/operators/fill_op.h index c5cbffbf5c6..7f7e0f2b31a 100644 --- a/paddle/fluid/operators/fill_op.h +++ b/paddle/fluid/operators/fill_op.h @@ -14,9 +14,9 @@ limitations under the License. */ #pragma once +#include #include -#include #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/fill_zeros_like_op.cc b/paddle/fluid/operators/fill_zeros_like_op.cc index 2d340829332..518d8414c50 100644 --- a/paddle/fluid/operators/fill_zeros_like_op.cc +++ b/paddle/fluid/operators/fill_zeros_like_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fill_zeros_like_op.h" + #include "paddle/fluid/platform/complex.h" namespace paddle { diff --git a/paddle/fluid/operators/fill_zeros_like_op.cu.cc b/paddle/fluid/operators/fill_zeros_like_op.cu.cc index 4cb0887c1f3..91809b8cd11 100644 --- a/paddle/fluid/operators/fill_zeros_like_op.cu.cc +++ b/paddle/fluid/operators/fill_zeros_like_op.cu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fill_zeros_like_op.h" + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" diff --git a/paddle/fluid/operators/filter_by_instag_op.cc b/paddle/fluid/operators/filter_by_instag_op.cc index 02ea2d59ae3..cb1e3083320 100644 --- a/paddle/fluid/operators/filter_by_instag_op.cc +++ b/paddle/fluid/operators/filter_by_instag_op.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/operators/filter_by_instag_op.h" #include + #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" #include "paddle/fluid/framework/var_type_inference.h" diff --git a/paddle/fluid/operators/filter_by_instag_op.cu b/paddle/fluid/operators/filter_by_instag_op.cu index 7870efba4e7..75680a61b30 100644 --- a/paddle/fluid/operators/filter_by_instag_op.cu +++ b/paddle/fluid/operators/filter_by_instag_op.cu @@ -20,6 +20,7 @@ #include #include + #include #include #include @@ -30,11 +31,10 @@ #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/operators/filter_by_instag_op.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/operators/filter_by_instag_op.h" - #if defined(PADDLE_WITH_CUDA) namespace cg = cooperative_groups; #endif @@ -277,7 +277,7 @@ __global__ void filter_copy_fuse_kernel( T* dst = out_data + output_start_idx * x1_embed_size; const T* src_start = x1_data + x1_lods_data[p] * x1_embed_size; const T* src_end = x1_data + x1_lods_data[p + 1] * x1_embed_size; - for (const T *j = src_start; j != src_end; dst++, j++) { + for (const T* j = src_start; j != src_end; dst++, j++) { *dst = *j; } } @@ -306,7 +306,7 @@ __global__ void copy_grad_kernel(const size_t N, const int ins_per_thread, const T* src_end = out_grad_data + (map_data[p * 3] + map_data[p * 3 + 2]) * x1_embed_size; - for (const T *j = src_start; j != src_end; dst++, j++) { + for (const T* j = src_start; j != src_end; dst++, j++) { *dst = *j; } } diff --git 
a/paddle/fluid/operators/filter_by_instag_op.h b/paddle/fluid/operators/filter_by_instag_op.h index 3abc980ceaa..6172fef9b4b 100644 --- a/paddle/fluid/operators/filter_by_instag_op.h +++ b/paddle/fluid/operators/filter_by_instag_op.h @@ -20,6 +20,7 @@ #include #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/mixed_vector.h" diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc index d1ac573b844..2e767c37051 100644 --- a/paddle/fluid/operators/flatten_op.cc +++ b/paddle/fluid/operators/flatten_op.cc @@ -13,10 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/flatten_op.h" + #include #include #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h index cacd30cad8a..6a91cd8b941 100644 --- a/paddle/fluid/operators/flatten_op.h +++ b/paddle/fluid/operators/flatten_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/operators/flip_op.cc b/paddle/fluid/operators/flip_op.cc index e1ee1a86a2f..b00cbf5c4fc 100644 --- a/paddle/fluid/operators/flip_op.cc +++ b/paddle/fluid/operators/flip_op.cc @@ -93,10 +93,9 @@ REGISTER_OPERATOR(flip, ops::FlipOp, ops::FlipOpMaker, ops::FlipOpInferVarType, FlipInferShapeFunctor); /* ========================== register checkpoint ===========================*/ -REGISTER_OP_VERSION(flip) - .AddCheckpoint( - R"ROC(Upgrade flip, add new attr [axis] and delete attr [dims].)ROC", - paddle::framework::compatible::OpVersionDesc() - .NewAttr("axis", "The added attr 'axis' doesn't set default value.", - paddle::none) - .DeleteAttr("dims", "The attr 'dims' is deleted.")); +REGISTER_OP_VERSION(flip).AddCheckpoint( + R"ROC(Upgrade flip, add new attr [axis] and delete attr [dims].)ROC", + paddle::framework::compatible::OpVersionDesc() + .NewAttr("axis", "The added attr 'axis' doesn't set default value.", + paddle::none) + .DeleteAttr("dims", "The attr 'dims' is deleted.")); diff --git a/paddle/fluid/operators/fold_op.h b/paddle/fluid/operators/fold_op.h index c0aa47a0b4f..fd1a7558b71 100644 --- a/paddle/fluid/operators/fold_op.h +++ b/paddle/fluid/operators/fold_op.h @@ -16,6 +16,7 @@ limitations under the License. 
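The flip_op.cc hunk above illustrates the chained-call rule: a registration macro followed by a single method call now stays on the macro's line, with the long arguments wrapped underneath rather than breaking before .AddCheckpoint. Same shape for a hypothetical op (only "my_op" and the attr text are invented; the AddCheckpoint/NewAttr calls mirror the ones in the hunk):

    REGISTER_OP_VERSION(my_op).AddCheckpoint(
        R"ROC(Upgrade my_op, add new attr [axis].)ROC",
        paddle::framework::compatible::OpVersionDesc().NewAttr(
            "axis", "The added attr 'axis' doesn't set default value.",
            paddle::none));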
*/ #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/im2col.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/frame_op.cc b/paddle/fluid/operators/frame_op.cc index 2ff9beb36f2..00c98cae10e 100644 --- a/paddle/fluid/operators/frame_op.cc +++ b/paddle/fluid/operators/frame_op.cc @@ -33,10 +33,11 @@ class FrameOp : public framework::OperatorWithKernel { const int x_rank = x_dims.size(); PADDLE_ENFORCE_GE( - x_rank, 1, platform::errors::InvalidArgument( - "Input(X) of FrameOp should be a tensor which contains " - "at least 1 dimension, but got rank %s.", - x_rank)); + x_rank, 1, + platform::errors::InvalidArgument( + "Input(X) of FrameOp should be a tensor which contains " + "at least 1 dimension, but got rank %s.", + x_rank)); PADDLE_ENFORCE_GT(hop_length, 0, platform::errors::InvalidArgument( "Attribute(hop_length) of FrameOp should be greater " diff --git a/paddle/fluid/operators/fsp_op.cc b/paddle/fluid/operators/fsp_op.cc index f00ec6a1e14..16ce2b43bf4 100644 --- a/paddle/fluid/operators/fsp_op.cc +++ b/paddle/fluid/operators/fsp_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fsp_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/fused/attention_layer_norm.h b/paddle/fluid/operators/fused/attention_layer_norm.h index 43491a9faf1..b960b835979 100644 --- a/paddle/fluid/operators/fused/attention_layer_norm.h +++ b/paddle/fluid/operators/fused/attention_layer_norm.h @@ -38,11 +38,10 @@ class AttnLayerNorm { auto stream = dev_ctx_.stream(); switch (GetDesiredBlockDim(feature_size_)) { - FIXED_BLOCK_DIM_CASE( - LayerNormForward, - kBlockDim><<>>( - x_data, scale_data, bias_data, y_data, mean_data, var_data, - epsilon_, feature_size_)); + FIXED_BLOCK_DIM_CASE(LayerNormForward, kBlockDim> + <<>>( + x_data, scale_data, bias_data, y_data, mean_data, + var_data, epsilon_, feature_size_)); default: PADDLE_THROW(platform::errors::InvalidArgument( "Feature_size must be larger than 1")); diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h index b059223eaf6..feac0f79530 100644 --- a/paddle/fluid/operators/fused/attn_bias_add.cu.h +++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h @@ -120,24 +120,24 @@ void LaunchBiasAddFwKernel(const platform::CUDADeviceContext& ctx, int m, int n, auto stream = ctx.stream(); switch (vec_size) { case 4: { - BroadcastKernelBinary<<>>( - in0, in1, out, use_broadcast, numel, configlists, main_tid, tail_tid, - func); + BroadcastKernelBinary + <<>>(in0, in1, out, use_broadcast, numel, + configlists, main_tid, tail_tid, + func); break; } case 2: { - BroadcastKernelBinary<<>>( - in0, in1, out, use_broadcast, numel, configlists, main_tid, tail_tid, - func); + BroadcastKernelBinary + <<>>(in0, in1, out, use_broadcast, numel, + configlists, main_tid, tail_tid, + func); break; } case 1: { - BroadcastKernelBinary<<>>( - in0, in1, out, use_broadcast, numel, configlists, main_tid, tail_tid, - func); + BroadcastKernelBinary + <<>>(in0, in1, out, use_broadcast, numel, + configlists, main_tid, tail_tid, + func); break; } default: { @@ -176,8 +176,8 @@ void Launch1DColumnReduce(gpuStream_t stream, const int max_threads, const int block = 256; const int max_blocks = std::max(max_threads / block, 1); const int grid = std::min(left_num, max_blocks); - Compute1DColumnReduceKernel<<>>( - reduce_num, 
left_num, d_out, d_bias); + Compute1DColumnReduceKernel + <<>>(reduce_num, left_num, d_out, d_bias); } void SetConfigForColumnReduce(const int max_threads, const int reduce_num, @@ -273,8 +273,8 @@ void Launch2DColumnReduce(const platform::CUDADeviceContext& dev_ctx, const auto& stream = dev_ctx.stream(); if (!should_reduce_again) { - BiasAddBwSinglePassKernel<<>>(d_out, reduce_num, - left_num, d_bias); + BiasAddBwSinglePassKernel + <<>>(d_out, reduce_num, left_num, d_bias); } else { framework::Tensor tmp_sum; tmp_sum.Resize({grid.y, left_num}); diff --git a/paddle/fluid/operators/fused/attn_gemm.h b/paddle/fluid/operators/fused/attn_gemm.h index 304aad16ad0..a85b2f99bb1 100644 --- a/paddle/fluid/operators/fused/attn_gemm.h +++ b/paddle/fluid/operators/fused/attn_gemm.h @@ -14,12 +14,10 @@ limitations under the License. */ #pragma once -#include "paddle/fluid/platform/float16.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/elementwise_functor.h" - #include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cc b/paddle/fluid/operators/fused/conv_fusion_op.cc index 671e94061cb..490d92880c9 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cc +++ b/paddle/fluid/operators/fused/conv_fusion_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu index 8191c85f2a1..9ca9f8aaf74 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cu +++ b/paddle/fluid/operators/fused/conv_fusion_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
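The attn_bias_add.cu.h hunks show the single most frequent CUDA change in this series: when a launch overflows, the break now falls between the kernel name (with its template arguments) and the <<<...>>> configuration, instead of inside the argument list. A self-contained sketch with a made-up kernel, not one from the patch:

    template <typename T>
    __global__ void AddBiasSketch(const T* in, const T* bias, T* out, int n) {
      // bias is assumed to have n elements in this simplified sketch.
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) out[i] = in[i] + bias[i];
    }

    void LaunchAddBias(const float* in, const float* bias, float* out, int n,
                       cudaStream_t stream) {
      dim3 block(256);
      dim3 grid((n + block.x - 1) / block.x);
      AddBiasSketch<float>               // name and template args stay put
          <<<grid, block, 0, stream>>>(  // launch config opens the next line
              in, bias, out, n);
    }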
*/ #include + #include "paddle/fluid/framework/conv_search_cache.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h" diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc index 516b10fa021..09fa3a247e6 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc +++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc @@ -182,19 +182,20 @@ void ComputeBatchNormForward(const platform::CUDADeviceContext &ctx, std::string data_layout = "NHWC"; attrs.insert({"data_layout", data_layout}); - auto op = framework::OpRegistry::CreateOp( - "batch_norm", {{"X", {"X"}}, - {"Scale", {"Scale"}}, - {"Bias", {"Bias"}}, - {"Mean", {"Mean"}}, - {"Variance", {"Variance"}}}, - {{"Y", {"Y"}}, - {"MeanOut", {"Mean"}}, - {"VarianceOut", {"Variance"}}, - {"SavedMean", {"SavedMean"}}, - {"SavedVariance", {"SavedVariance"}}, - {"ReserveSpace", {"ReserveSpace"}}}, - attrs); + auto op = + framework::OpRegistry::CreateOp("batch_norm", + {{"X", {"X"}}, + {"Scale", {"Scale"}}, + {"Bias", {"Bias"}}, + {"Mean", {"Mean"}}, + {"Variance", {"Variance"}}}, + {{"Y", {"Y"}}, + {"MeanOut", {"Mean"}}, + {"VarianceOut", {"Variance"}}, + {"SavedMean", {"SavedMean"}}, + {"SavedVariance", {"SavedVariance"}}, + {"ReserveSpace", {"ReserveSpace"}}}, + attrs); op->Run(scope, ctx.GetPlace()); paddle::framework::TensorCopySync(*y, platform::CPUPlace(), cpu_y); @@ -314,8 +315,9 @@ void ComputeFusedBNAddReluBackward( attrs.insert({"epsilon", epsilon}); attrs.insert({"act_type", act_type}); - auto op = framework::OpRegistry::CreateOp( - "fused_bn_add_activation_grad", {{"X", {"X"}}, + auto op = + framework::OpRegistry::CreateOp("fused_bn_add_activation_grad", + {{"X", {"X"}}, {"Y", {"Y"}}, {"Y@GRAD", {"Y@GRAD"}}, {"Scale", {"Scale"}}, @@ -323,11 +325,11 @@ void ComputeFusedBNAddReluBackward( {"SavedMean", {"SavedMean"}}, {"SavedVariance", {"SavedVariance"}}, {"ReserveSpace", {"ReserveSpace"}}}, - {{"X@GRAD", {"X@GRAD"}}, - {"Z@GRAD", {"Z@GRAD"}}, - {"Scale@GRAD", {"Scale@GRAD"}}, - {"Bias@GRAD", {"Bias@GRAD"}}}, - attrs); + {{"X@GRAD", {"X@GRAD"}}, + {"Z@GRAD", {"Z@GRAD"}}, + {"Scale@GRAD", {"Scale@GRAD"}}, + {"Bias@GRAD", {"Bias@GRAD"}}}, + attrs); op->Run(scope, ctx.GetPlace()); paddle::framework::TensorCopySync(*dx, platform::CPUPlace(), cpu_dx); diff --git a/paddle/fluid/operators/fused/cudnn_fusion_helper.h b/paddle/fluid/operators/fused/cudnn_fusion_helper.h index 13fad0b7cbb..a8f700c2119 100644 --- a/paddle/fluid/operators/fused/cudnn_fusion_helper.h +++ b/paddle/fluid/operators/fused/cudnn_fusion_helper.h @@ -15,6 +15,7 @@ limitations under the License. 
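The cudnn_bn_add_relu_test.cc hunks show how a call mixing a string literal with nested braced initializers is handled once it overflows: the assignment breaks after '=', and every {name, {vars}} entry lands on its own aligned line. A reduced sketch using standard containers; CreateOp and VarNameMap here are local stand-ins so the snippet compiles on its own, not the framework's types:

    #include <map>
    #include <string>
    #include <vector>

    using VarNameMap = std::map<std::string, std::vector<std::string>>;

    // Local stand-in for the real OpRegistry::CreateOp.
    int CreateOp(const std::string& type, const VarNameMap& in,
                 const VarNameMap& out) {
      return static_cast<int>(type.size() + in.size() + out.size());
    }

    int Sketch() {
      auto op =
          CreateOp("batch_norm",
                   {{"X", {"X"}}, {"Scale", {"Scale"}}, {"Bias", {"Bias"}}},
                   {{"Y", {"Y"}}, {"MeanOut", {"Mean"}}});
      return op;
    }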
*/ #pragma once #include + #include "paddle/fluid/framework/operator_kernel_configs.h" #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index 5881322007a..f4443bba3fd 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -167,9 +167,10 @@ void ComputeConv2DBackward(const platform::CUDADeviceContext &ctx, attrs.insert({"workspace_size_MB", 512}); auto op = framework::OpRegistry::CreateOp( - "conv2d_grad", {{"Input", {"Input"}}, - {"Filter", {"Filter"}}, - {"Output@GRAD", {"Output@GRAD"}}}, + "conv2d_grad", + {{"Input", {"Input"}}, + {"Filter", {"Filter"}}, + {"Output@GRAD", {"Output@GRAD"}}}, {{"Input@GRAD", {"Input@GRAD"}}, {"Filter@GRAD", {"Filter@GRAD"}}}, attrs); op->Run(scope, ctx.GetPlace()); diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h index 38f9aff226e..ce95b0a320c 100644 --- a/paddle/fluid/operators/fused/fmha_ref.h +++ b/paddle/fluid/operators/fused/fmha_ref.h @@ -186,8 +186,9 @@ class FMHARef { if (dropout_param_.dropout_prob_) { DropoutFwGPUKernelDriver( static_cast(dev_ctx_), - dropout_param_.is_test_, static_cast( - dropout_param_.dropout_implementation_), + dropout_param_.is_test_, + static_cast( + dropout_param_.dropout_implementation_), dropout_param_.dropout_prob_, dropout_param_.is_upscale_in_train_, dropout_param_.is_fix_seed_, dropout_param_.seed_val_, static_cast(*softmax_out_tensor), dropout_param_.seed_, diff --git a/paddle/fluid/operators/fused/fused_attention_op.cc b/paddle/fluid/operators/fused/fused_attention_op.cc index a1adec9641a..06ede8e2c7b 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_attention_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -88,12 +89,13 @@ class FusedAttentionOp : public framework::OperatorWithKernel { // y: qkv's weight: [3, num_head, dim_head, dim_embed] auto x_dim = ctx->GetInputDim("X"); auto y_dim = ctx->GetInputDim("QKVW"); - PADDLE_ENFORCE_EQ(x_dim.size(), 3, platform::errors::InvalidArgument( - "The dimensions of x must be 3" - "(batch_size, seq_len, dim_embed)," - "but received dimensions of" - "Input is [%d]", - x_dim.size())); + PADDLE_ENFORCE_EQ( + x_dim.size(), 3, + platform::errors::InvalidArgument("The dimensions of x must be 3" + "(batch_size, seq_len, dim_embed)," + "but received dimensions of" + "Input is [%d]", + x_dim.size())); PADDLE_ENFORCE_EQ(y_dim.size(), 4, platform::errors::InvalidArgument( "The dimensions of qkv_weight must be 4" diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu index f25bd539928..73fdd29fd62 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -13,21 +13,21 @@ See the License for the specific language governing permissions and limitations under the License. 
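frame_op.cc and fused_attention_op.cc above repeat the PADDLE_ENFORCE_* pattern: the break moves to just after the macro's opening parenthesis, the compared operands share one line, and the platform::errors factory gets its own indented block. The shape, with illustrative operands and message text:

    PADDLE_ENFORCE_EQ(
        x_dims.size(), 3,
        platform::errors::InvalidArgument(
            "The rank of Input(X) must be 3, but received [%d].",
            x_dims.size()));

One caveat the reflow makes easier to spot: the split messages are adjacent string literals, so pieces like "must be 3" "(batch_size, ..." concatenate with no space in between; this patch only reindents them and leaves the wording as is.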
*/ #include + #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/fused/attention_layer_norm.h" +#include "paddle/fluid/operators/fused/attn_gemm.h" +#include "paddle/fluid/operators/fused/fmha_ref.h" +#include "paddle/fluid/operators/fused/fused_dropout_helper.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" - #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/fluid/operators/fused/attention_layer_norm.h" -#include "paddle/fluid/operators/fused/attn_gemm.h" -#include "paddle/fluid/operators/fused/fmha_ref.h" -#include "paddle/fluid/operators/fused/fused_dropout_helper.h" - #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -463,11 +463,13 @@ class FusedAttentionGradKernel : public framework::OpKernel { auto *bias_dropout_residual_out_data = bias_dropout_residual_out->data(); auto *d_ln_2_scale_data = - (d_ln_2_scale == nullptr ? nullptr : d_ln_2_scale->mutable_data( - ctx.GetPlace())); + (d_ln_2_scale == nullptr + ? nullptr + : d_ln_2_scale->mutable_data(ctx.GetPlace())); auto *d_ln_2_bias_data = - (d_ln_2_bias == nullptr ? nullptr : d_ln_2_bias->mutable_data( - ctx.GetPlace())); + (d_ln_2_bias == nullptr + ? nullptr + : d_ln_2_bias->mutable_data(ctx.GetPlace())); auto *d_bias_dropout_residual_out_data = d_bias_dropout_residual_out->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc index 781f51d70ec..56f9afdbe90 100644 --- a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc +++ b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu index 71a2c9728cc..35a48611a74 100644 --- a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu +++ b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/fused/fused_dropout_helper.h" diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cc b/paddle/fluid/operators/fused/fused_bn_activation_op.cc index 1b3521f1496..464856003f0 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.cc +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cc @@ -13,9 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. 
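The d_ln_2_scale/d_ln_2_bias hunk in fused_attention_op.cu shows how long conditional expressions now wrap, with '?' and ':' starting the continuation lines instead of trailing them. A reduced, compilable stand-in (TensorSketch and its mutable_data are simplified placeholders for the real framework API):

    struct TensorSketch {
      float buf[16];
      float* mutable_data() { return buf; }  // placeholder for the real API
    };

    float* GradOrNull(TensorSketch* d_scale) {
      return (d_scale == nullptr
                  ? nullptr
                  : d_scale->mutable_data());
    }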
*/ #include "paddle/fluid/operators/fused/fused_bn_activation_op.h" + #include #include #include + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/op_registry.h" @@ -70,20 +72,22 @@ void FusedBatchNormActOp::InferShape(framework::InferShapeContext *ctx) const { const auto x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_GE(x_dims.size(), 2, platform::errors::PreconditionNotMet( - "ShapeError: the dimension of input " - "X must greater than or equal to 2." - "But received: the shape of input X " - "= [%s], the dimension of input X =" - "[%d]", - x_dims, x_dims.size())); - PADDLE_ENFORCE_LE(x_dims.size(), 5, platform::errors::PreconditionNotMet( - "ShapeError: the dimension of input " - "X must smaller than or equal to 5." - "But received: the shape of input X " - "= [%s], the dimension of input X =" - "[%d]", - x_dims, x_dims.size())); + PADDLE_ENFORCE_GE( + x_dims.size(), 2, + platform::errors::PreconditionNotMet("ShapeError: the dimension of input " + "X must greater than or equal to 2." + "But received: the shape of input X " + "= [%s], the dimension of input X =" + "[%d]", + x_dims, x_dims.size())); + PADDLE_ENFORCE_LE( + x_dims.size(), 5, + platform::errors::PreconditionNotMet("ShapeError: the dimension of input " + "X must smaller than or equal to 5." + "But received: the shape of input X " + "= [%s], the dimension of input X =" + "[%d]", + x_dims, x_dims.size())); const int64_t C = x_dims[x_dims.size() - 1]; @@ -140,22 +144,26 @@ framework::OpKernelType FusedBatchNormActOp::GetExpectedKernelType( if (input_data_type == framework::proto::VarType::FP64) { bn_param_type = framework::proto::VarType::FP64; } - PADDLE_ENFORCE_EQ(bn_param_type, framework::TransToProtoVarType( - ctx.Input("Scale")->dtype()), - platform::errors::PreconditionNotMet( - "Scale input should be of float type")); - PADDLE_ENFORCE_EQ(bn_param_type, framework::TransToProtoVarType( - ctx.Input("Bias")->dtype()), - platform::errors::PreconditionNotMet( - "Bias input should be of float type")); - PADDLE_ENFORCE_EQ(bn_param_type, framework::TransToProtoVarType( - ctx.Input("Mean")->dtype()), - platform::errors::PreconditionNotMet( - "Mean input should be of float type")); - PADDLE_ENFORCE_EQ(bn_param_type, framework::TransToProtoVarType( - ctx.Input("Variance")->dtype()), - platform::errors::PreconditionNotMet( - "Variance input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType(ctx.Input("Scale")->dtype()), + platform::errors::PreconditionNotMet( + "Scale input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType(ctx.Input("Bias")->dtype()), + platform::errors::PreconditionNotMet( + "Bias input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType(ctx.Input("Mean")->dtype()), + platform::errors::PreconditionNotMet( + "Mean input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType(ctx.Input("Variance")->dtype()), + platform::errors::PreconditionNotMet( + "Variance input should be of float type")); framework::LibraryType library = framework::LibraryType::kPlain; framework::DataLayout layout = framework::DataLayout::kAnyLayout; diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_activation_op.cu index 9e709c9a01a..0ebe21dfc60 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.cu +++ 
b/paddle/fluid/operators/fused/fused_bn_activation_op.cu @@ -16,6 +16,7 @@ #include #include #include + #include "cub/cub.cuh" #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/activation_op.h" @@ -181,8 +182,9 @@ class FusedBatchNormActKernel ctx.GetPlace()), variance_out->template mutable_data>( ctx.GetPlace()), - epsilon, saved_mean->template mutable_data>( - ctx.GetPlace()), + epsilon, + saved_mean->template mutable_data>( + ctx.GetPlace()), saved_variance->template mutable_data>( ctx.GetPlace()), activation_desc_, workspace_ptr, workspace_size, reserve_space_ptr, @@ -343,10 +345,12 @@ class FusedBatchNormActGradKernel /*dBnScaleBiasDesc=*/bn_param_desc_, /*bnScaleData=*/scale->template data>(), /*bnBiasData=*/bias->template data>(), - /*dBnScaleData=*/d_scale - ->template mutable_data>(ctx.GetPlace()), - /*dBnBiasData=*/d_bias - ->template mutable_data>(ctx.GetPlace()), + /*dBnScaleData=*/ + d_scale->template mutable_data>( + ctx.GetPlace()), + /*dBnBiasData=*/ + d_bias->template mutable_data>( + ctx.GetPlace()), /*epsilon=*/epsilon, /*savedMean=*/saved_mean_data, /*savedInvVariance=*/saved_var_data, diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.h b/paddle/fluid/operators/fused/fused_bn_activation_op.h index b8404e4c655..da9bca4fc22 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.h +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/grad_op_desc_maker.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc index d667fafb835..5d06ac19f9e 100644 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc @@ -13,9 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fused/fused_bn_add_activation_op.h" + #include #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -52,20 +54,22 @@ void FusedBatchNormAddActOp::InferShape( "of input X = [%s], and the shape of " "input Y = [%s]", x_dims, z_dims)); - PADDLE_ENFORCE_GE(x_dims.size(), 2, platform::errors::InvalidArgument( - "ShapeError: the dimensions of input " - "must greater than or equal to 2." - "But received: the shape of input " - "= [%s], the dimension of input = " - "[%d]", - x_dims, x_dims.size())); - PADDLE_ENFORCE_LE(x_dims.size(), 5, platform::errors::InvalidArgument( - "ShapeError: the dimensions of input " - "must smaller than or equal to 5." - "But received: the shape of input " - "= [%s], the dimension of input = " - "[%d]", - x_dims, x_dims.size())); + PADDLE_ENFORCE_GE( + x_dims.size(), 2, + platform::errors::InvalidArgument("ShapeError: the dimensions of input " + "must greater than or equal to 2." + "But received: the shape of input " + "= [%s], the dimension of input = " + "[%d]", + x_dims, x_dims.size())); + PADDLE_ENFORCE_LE( + x_dims.size(), 5, + platform::errors::InvalidArgument("ShapeError: the dimensions of input " + "must smaller than or equal to 5." 
+ "But received: the shape of input " + "= [%s], the dimension of input = " + "[%d]", + x_dims, x_dims.size())); const int64_t C = x_dims[x_dims.size() - 1]; diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu index 421c1bacb66..2f7fc616012 100644 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu @@ -16,6 +16,7 @@ #include #include #include + #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/fused/fused_bn_add_activation_op.h" @@ -160,8 +161,9 @@ class FusedBatchNormAddActKernel ctx.GetPlace()), variance_out->template mutable_data>( ctx.GetPlace()), - epsilon, saved_mean->template mutable_data>( - ctx.GetPlace()), + epsilon, + saved_mean->template mutable_data>( + ctx.GetPlace()), saved_variance->template mutable_data>( ctx.GetPlace()), activation_desc_, workspace_ptr, workspace_size, reserve_space_ptr, diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.h b/paddle/fluid/operators/fused/fused_bn_add_activation_op.h index d5e5ae9bda6..07d2e4564b6 100644 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.h +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/grad_op_desc_maker.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/operators/fused/fused_dropout_act_bias.h b/paddle/fluid/operators/fused/fused_dropout_act_bias.h old mode 100755 new mode 100644 index 9f5a1bad047..f7af7deff53 --- a/paddle/fluid/operators/fused/fused_dropout_act_bias.h +++ b/paddle/fluid/operators/fused/fused_dropout_act_bias.h @@ -109,15 +109,15 @@ void LaunchDropoutActBias(Functor act_functor, const uint64_t seed, const int real_vec_size = cols % VecSize == 0 ? 
VecSize : 1; const auto config = Get1DBlocksAnd2DGrids(ctx, rows, cols, real_vec_size); if (cols % VecSize == 0) { - FusedDropoutActBias<<< - config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( - act_functor, seed, rows, cols, increment, dropout_prob, - is_upscale_in_train, is_test, src, bias, dst, mask_data); + FusedDropoutActBias + <<>>( + act_functor, seed, rows, cols, increment, dropout_prob, + is_upscale_in_train, is_test, src, bias, dst, mask_data); } else { - FusedDropoutActBias<<< - config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( - act_functor, seed, rows, cols, increment, dropout_prob, - is_upscale_in_train, is_test, src, bias, dst, mask_data); + FusedDropoutActBias + <<>>( + act_functor, seed, rows, cols, increment, dropout_prob, + is_upscale_in_train, is_test, src, bias, dst, mask_data); } } @@ -231,28 +231,28 @@ void LaunchDropoutActBiasGrad(Functor act_functor, const T *dout, dim3 block_dim(threads, 128, 1); dim3 grid_dim(blocks, 1, 1); if (cols % VecSize == 0) { - FusedDropoutActBiasGrad< - T, MaskType, 8, 128, VecSize, - Functor><<>>( - act_functor, dout, mask, src, bias, factor, rows, cols, dx, dbias); + FusedDropoutActBiasGrad + <<>>(act_functor, dout, mask, + src, bias, factor, rows, + cols, dx, dbias); } else { - FusedDropoutActBiasGrad< - T, MaskType, 8, 128, 1, - Functor><<>>( - act_functor, dout, mask, src, bias, factor, rows, cols, dx, dbias); + FusedDropoutActBiasGrad + <<>>(act_functor, dout, mask, + src, bias, factor, rows, + cols, dx, dbias); } } else { const uint64_t n = rows * cols; platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(ctx, n / real_vec_size); if (n % VecSize == 0) { - FusedDropoutActGrad<<< - config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( - act_functor, dout, mask, src, factor, n, dx); + FusedDropoutActGrad + <<>>( + act_functor, dout, mask, src, factor, n, dx); } else { - FusedDropoutActGrad<<< - config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( - act_functor, dout, mask, src, factor, n, dx); + FusedDropoutActGrad + <<>>( + act_functor, dout, mask, src, factor, n, dx); } } } diff --git a/paddle/fluid/operators/fused/fused_dropout_helper.h b/paddle/fluid/operators/fused/fused_dropout_helper.h index c352f08ec2b..6dc1c446bd7 100644 --- a/paddle/fluid/operators/fused/fused_dropout_helper.h +++ b/paddle/fluid/operators/fused/fused_dropout_helper.h @@ -30,7 +30,7 @@ namespace operators { * The DropoutParam will be used in the fused_dropout_act_bias, * fused_residual_dropout_bias(pre_layer_norm=ture) or * fused_layernorm_residual_dropout_bias(pre_layer_norm=false). -*/ + */ struct DropoutParam { uint64_t seed; float dropout_prob; @@ -232,8 +232,8 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper { using U = LayerNormParamType; switch (GetDesiredBlockDim(this->cols_)) { FIXED_BLOCK_DIM_CASE( - LayerNormForward< - T, U, kBlockDim><<rows_, kBlockDim, 0, ctx.stream()>>>( + LayerNormForward + <<rows_, kBlockDim, 0, ctx.stream()>>>( src, gamma, beta, out, mean, variance, epsilon_, this->cols_)); } } diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc index 3e69bf08067..a43562b2972 100644 --- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc +++ b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
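attention_layer_norm.h earlier and fused_dropout_helper.h here both route LayerNormForward through FIXED_BLOCK_DIM_CASE, a switch that turns a runtime block size into a compile-time template argument before launching. A simplified, self-contained version of that dispatch; the row-copy body and the inline block-size choice are stand-ins for the real layer-norm reduction and GetDesiredBlockDim:

    template <int kBlockDim>
    __global__ void RowKernelSketch(const float* x, float* y, int cols) {
      const float* row_in = x + blockIdx.x * cols;
      float* row_out = y + blockIdx.x * cols;
      for (int c = threadIdx.x; c < cols; c += kBlockDim) {
        row_out[c] = row_in[c];  // real kernel computes mean/var per row
      }
    }

    void DispatchSketch(const float* x, float* y, int rows, int cols,
                        cudaStream_t stream) {
      switch (cols <= 512 ? 512 : 1024) {  // stand-in for GetDesiredBlockDim
        case 512:
          RowKernelSketch<512><<<rows, 512, 0, stream>>>(x, y, cols);
          break;
        case 1024:
          RowKernelSketch<1024><<<rows, 1024, 0, stream>>>(x, y, cols);
          break;
      }
    }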
*/ #include "paddle/fluid/operators/fused/fused_elemwise_activation_op.h" + #include #include diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.h b/paddle/fluid/operators/fused/fused_elemwise_activation_op.h index 5404cdeab01..3ce54968355 100644 --- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.h +++ b/paddle/fluid/operators/fused/fused_elemwise_activation_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" @@ -412,8 +413,9 @@ class FusedElemwiseActivationGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { auto in_y = ctx.Input("Y"); - PADDLE_ENFORCE_NE(in_y, nullptr, platform::errors::InvalidArgument( - "Input(Y) should not be nullptr.")); + PADDLE_ENFORCE_NE( + in_y, nullptr, + platform::errors::InvalidArgument("Input(Y) should not be nullptr.")); auto in_out = ctx.Input("Out"); PADDLE_ENFORCE_NE( in_out, nullptr, @@ -449,15 +451,17 @@ class FusedElemwiseActivationGradKernel : public framework::OpKernel { " so the number of 'Out' should be two.")); } else { if (!InputXCanBeAbsent(functor_list)) { - PADDLE_ENFORCE_NE(in_x, nullptr, platform::errors::InvalidArgument( - "Input(X) should not be null.")); + PADDLE_ENFORCE_NE( + in_x, nullptr, + platform::errors::InvalidArgument("Input(X) should not be null.")); } } // Get in_x if (ctx.HasInput("X")) { - PADDLE_ENFORCE_NE(in_x, nullptr, platform::errors::InvalidArgument( - "Input(X) should not be null.")); + PADDLE_ENFORCE_NE( + in_x, nullptr, + platform::errors::InvalidArgument("Input(X) should not be null.")); } else { // If functor_list contains elementwise_add, the backward doesn't use // in_x, in_y and in_out. diff --git a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cc b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cc index 6746b3b8e84..951189269c7 100644 --- a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/errors.h" diff --git a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu index 13f1c6808ae..f0cb2edb670 100644 --- a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu +++ b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu @@ -13,7 +13,9 @@ // limitations under the License. #include + #include + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/malloc.h" diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc index 7308f307792..625bfe36e38 100644 --- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.h" + #include + #include "paddle/fluid/platform/cpu_info.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/cpu_vec.h" @@ -100,10 +102,11 @@ void FusedEmbeddingFCLSTMOp::InferShape( platform::errors::InvalidArgument( "The rank of Input(Bias) should be 2, but received value is:%d.", b_dims.size())); - PADDLE_ENFORCE_EQ(b_dims[0], 1, platform::errors::InvalidArgument( - "The first dimension of Input(Bias) " - "should be 1, but received value is:%d.", - b_dims[0])); + PADDLE_ENFORCE_EQ(b_dims[0], 1, + platform::errors::InvalidArgument( + "The first dimension of Input(Bias) " + "should be 1, but received value is:%d.", + b_dims[0])); PADDLE_ENFORCE_EQ( b_dims[1], (ctx->Attrs().Get("use_peepholes") ? 7 : 4) * frame_size, platform::errors::InvalidArgument( @@ -237,21 +240,21 @@ This operator fuse the X into LSTM, more details can refer to LSTM op. template class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { public: -#define INIT_VEC_FUNC \ - std::function act_gate, act_cell, act_cand; \ - auto& act_gate_str = ctx.Attr("gate_activation"); \ - auto& act_cell_str = ctx.Attr("cell_activation"); \ - auto& act_cand_str = ctx.Attr("candidate_activation"); \ - if (platform::MayIUse(platform::avx)) { \ - phi::funcs::VecActivations act_functor; \ - act_gate = act_functor(act_gate_str); \ - act_cell = act_functor(act_cell_str); \ - act_cand = act_functor(act_cand_str); \ - } else { \ - phi::funcs::VecActivations act_functor; \ - act_gate = act_functor(act_gate_str); \ - act_cell = act_functor(act_cell_str); \ - act_cand = act_functor(act_cand_str); \ +#define INIT_VEC_FUNC \ + std::function act_gate, act_cell, act_cand; \ + auto& act_gate_str = ctx.Attr("gate_activation"); \ + auto& act_cell_str = ctx.Attr("cell_activation"); \ + auto& act_cand_str = ctx.Attr("candidate_activation"); \ + if (platform::MayIUse(platform::avx)) { \ + phi::funcs::VecActivations act_functor; \ + act_gate = act_functor(act_gate_str); \ + act_cell = act_functor(act_cell_str); \ + act_cand = act_functor(act_cand_str); \ + } else { \ + phi::funcs::VecActivations act_functor; \ + act_gate = act_functor(act_gate_str); \ + act_cell = act_functor(act_cell_str); \ + act_cand = act_functor(act_cand_str); \ } #define INIT_BASE_INPUT_OUTPUT \ diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc index ec3a76e316e..cb3bf585775 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h" + #include + #include "paddle/fluid/framework/var_type_inference.h" namespace paddle { diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu index 04d3730a77d..2c0184fea46 100644 --- a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu +++ b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu @@ -179,22 +179,20 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel { if (with_relu) { switch (platform::RoundToPowerOfTwo(N)) { CUDA_LAUNCH_KERNEL_HELPER( - InplaceAddReluAddLayerNormKernel< - T, true, - kPowerOfTwoDim><<>>( - y_data, bias_0_data, bias_1_data, scale_data, out_data, - mean_data, variance_data, M, N, epsilon)); + InplaceAddReluAddLayerNormKernel + <<>>(y_data, bias_0_data, bias_1_data, scale_data, + out_data, mean_data, variance_data, M, N, + epsilon)); } } else { switch (platform::RoundToPowerOfTwo(N)) { CUDA_LAUNCH_KERNEL_HELPER( - InplaceAddReluAddLayerNormKernel< - T, false, - kPowerOfTwoDim><<>>( - y_data, bias_0_data, bias_1_data, scale_data, out_data, - mean_data, variance_data, M, N, epsilon)); + InplaceAddReluAddLayerNormKernel + <<>>(y_data, bias_0_data, bias_1_data, scale_data, + out_data, mean_data, variance_data, M, N, + epsilon)); } } } diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cc b/paddle/fluid/operators/fused/fused_feedforward_op.cc index 8e15232acda..d3cc1b91276 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cc +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/matmul_v2_op.h" diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu index 2eb9885286d..675ec29da67 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cu +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu @@ -14,11 +14,10 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/matmul_v2_op.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - #include "paddle/fluid/operators/fused/fused_dropout_helper.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" +#include "paddle/fluid/operators/matmul_v2_op.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" @@ -387,20 +386,19 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { !pre_layer_norm ? context.Input("Ln2Bias") : nullptr; auto* d_x = context.Output(framework::GradVarName("X")); - auto* d_ln1_scale = pre_layer_norm - ? context.Output( - framework::GradVarName("Ln1Scale")) - : nullptr; - auto* d_ln1_bias = pre_layer_norm - ? context.Output( - framework::GradVarName("Ln1Bias")) - : nullptr; - auto* d_ln2_scale = - pre_layer_norm ? nullptr : context.Output( - framework::GradVarName("Ln2Scale")); - auto* d_ln2_bias = - pre_layer_norm ? nullptr : context.Output( - framework::GradVarName("Ln2Bias")); + auto* d_ln1_scale = pre_layer_norm ? context.Output( + framework::GradVarName("Ln1Scale")) + : nullptr; + auto* d_ln1_bias = pre_layer_norm ? 
context.Output( + framework::GradVarName("Ln1Bias")) + : nullptr; + auto* d_ln2_scale = pre_layer_norm + ? nullptr + : context.Output( + framework::GradVarName("Ln2Scale")); + auto* d_ln2_bias = pre_layer_norm ? nullptr + : context.Output( + framework::GradVarName("Ln2Bias")); auto* d_linear1_weight = context.Output( framework::GradVarName("Linear1Weight")); auto* d_linear1_bias = context.Output( diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cc b/paddle/fluid/operators/fused/fused_gate_attention_op.cc index ba9dbd82e3d..0bbeabd5fc9 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cu b/paddle/fluid/operators/fused/fused_gate_attention_op.cu index b1badf72557..8f375a22cc0 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cu @@ -374,9 +374,9 @@ class FusedGateAttentionOpKernel : public framework::OpKernel { v_transpose_out, qkv_transpose_out, softmax_out, fmha_out, &config); // 3. Gating Linear - Tensor *fmha_or_gate_out = - !has_gating ? fmha_out : ComputeGatingLinearForward(ctx, config, - query, fmha_out); + Tensor *fmha_or_gate_out = !has_gating ? fmha_out + : ComputeGatingLinearForward( + ctx, config, query, fmha_out); // 4. Output Linear ComputeOutputLinearForward(ctx, config, fmha_or_gate_out); diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc index 7cb6777e5a7..978daa3be85 100644 --- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc @@ -14,6 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fused/fused_gemm_epilogue_op.h" + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -369,8 +370,9 @@ class FusedGemmEpilogueOpGradMaker : public framework::SingleGradOpMaker { protected: void Apply(GradOpPtr op) const override { const auto& act_type = this->template Attr("activation"); - PADDLE_ENFORCE_EQ(act_type, "none", phi::errors::InvalidArgument( - "The activation should be none.")); + PADDLE_ENFORCE_EQ( + act_type, "none", + phi::errors::InvalidArgument("The activation should be none.")); op->SetType(this->ForwardOpType() + "_grad"); op->SetInput("X", this->Input("X")); diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.h b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.h index 8ff41b2c961..b00bdfe5660 100644 --- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.h +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.h @@ -16,9 +16,11 @@ limitations under the License. 
*/ #pragma once #include + #include #include #include + #include "gflags/gflags.h" #include "paddle/fluid/platform/dynload/cublasLt.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h index 9d7d34ebdc9..f72f73438c0 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -441,11 +441,10 @@ void LaunchLayernormResidualDropoutBias( // call layernorm forward switch (GetDesiredBlockDim(cols)) { FIXED_BLOCK_DIM_CASE( - LayerNormForward< - T, U, kBlockDim, - ScaleBiasWithSameTypeX><<>>( - dst, scale, layernorm_bias, layernorm_dst, mean, var, epsilon, - cols)); + LayerNormForward + <<>>(dst, scale, layernorm_bias, + layernorm_dst, mean, var, + epsilon, cols)); default: PADDLE_THROW(platform::errors::InvalidArgument( "Product from begin_norm_axis to end must be larger than 1")); @@ -468,11 +467,11 @@ void LaunchLayernormResidualDropoutBias( static_cast(std::ceil(rows / static_cast(ROWS_PER_CTA))); \ fused_fast_ln_fwd_kernel< \ T, U, LayerNormScaleBiasT, uint8_t, \ - VecSize, WARPS_M, WARPS_N, BYTES_PER_LDG, \ - cols><<>>( \ - rows, cols, seed, dropout_prob, is_upscale_in_train, is_test, \ - increment, epsilon, src, residual, bias, scale, layernorm_bias, \ - mask_data, mean, var, dst, layernorm_dst); \ + VecSize, WARPS_M, WARPS_N, BYTES_PER_LDG, cols> \ + <<>>( \ + rows, cols, seed, dropout_prob, is_upscale_in_train, is_test, \ + increment, epsilon, src, residual, bias, scale, layernorm_bias, \ + mask_data, mean, var, dst, layernorm_dst); \ } break #define LAUNCH_FUSED_FAST_LN_KERNEL \ @@ -494,12 +493,11 @@ void LaunchLayernormResidualDropoutBias( const int VecSize = MAX_CACHE_BYTES / sizeof(T); if (cols % VecSize != 0) { int blockDim = GetDesiredBlockDim(cols); - FusedLayernormResidualDropoutBias< - T, uint8_t, 1, U, - ScaleBiasWithSameTypeX><<>>( - rows, cols, seed, dropout_prob, is_upscale_in_train, is_test, increment, - epsilon, src, residual, bias, scale, layernorm_bias, mask_data, dst, - layernorm_dst, mean, var); + FusedLayernormResidualDropoutBias + <<>>( + rows, cols, seed, dropout_prob, is_upscale_in_train, is_test, + increment, epsilon, src, residual, bias, scale, layernorm_bias, + mask_data, dst, layernorm_dst, mean, var); } else { if (can_call_fast_ln_kernel) { switch (cols) { @@ -512,12 +510,12 @@ void LaunchLayernormResidualDropoutBias( } } else { int blockDim = GetDesiredBlockDim(cols / VecSize); - FusedLayernormResidualDropoutBias< - T, uint8_t, VecSize, U, - ScaleBiasWithSameTypeX><<>>( - rows, cols, seed, dropout_prob, is_upscale_in_train, is_test, - increment, epsilon, src, residual, bias, scale, layernorm_bias, - mask_data, dst, layernorm_dst, mean, var); + FusedLayernormResidualDropoutBias + <<>>( + rows, cols, seed, dropout_prob, is_upscale_in_train, is_test, + increment, epsilon, src, residual, bias, scale, layernorm_bias, + mask_data, dst, layernorm_dst, mean, var); } } } diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cc b/paddle/fluid/operators/fused/fused_multi_transformer_op.cc index 98602e4edd0..63627db49d6 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cc +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -62,12 +63,13 @@ class FusedMultiTransformerOp : public framework::OperatorWithKernel { // y: qkv's weight: [3, num_head, dim_head, dim_embed] auto x_dim = ctx->GetInputDim("X"); auto y_dim = ctx->GetInputsDim("QKVW")[0]; - PADDLE_ENFORCE_EQ(x_dim.size(), 3, platform::errors::InvalidArgument( - "The dimensions of x must be 3" - "(batch_size, seq_len, dim_embed)," - "but received dimensions of" - "Input is [%d]", - x_dim.size())); + PADDLE_ENFORCE_EQ( + x_dim.size(), 3, + platform::errors::InvalidArgument("The dimensions of x must be 3" + "(batch_size, seq_len, dim_embed)," + "but received dimensions of" + "Input is [%d]", + x_dim.size())); PADDLE_ENFORCE_EQ(y_dim.size(), 4, platform::errors::InvalidArgument( "The dimensions of qkv_weight must be 4" diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu index c13c287f4af..01c5b79fff1 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu @@ -18,18 +18,18 @@ limitations under the License. */ #include #include + #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" - -#include "paddle/phi/kernels/funcs/math_function.h" - #include "paddle/fluid/operators/fused/attention_layer_norm.h" #include "paddle/fluid/operators/fused/attn_gemm.h" #include "paddle/fluid/operators/fused/fmha_ref.h" #include "paddle/fluid/operators/fused/fused_dropout_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" @@ -861,10 +861,9 @@ inline size_t smem_size_in_bytes( size_t smem_sz = smem_size_in_bytes(params, Dh, THDS_PER_VALUE, \ THDS_PER_BLOCK, pad_active_groups); \ dim3 grid(params.num_head, params.batch_size); \ - masked_multihead_attention_kernel< \ - T, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE, \ - THDS_PER_BLOCK><<>>( \ - params, pad_active_groups) + masked_multihead_attention_kernel \ + <<>>(params, pad_active_groups) template void fmha_launch_kernel(const Masked_multihead_attention_params ¶ms, diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h index 1d3085a013f..0cc31e6fc32 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h @@ -153,16 +153,15 @@ void LaunchResidualDropoutBias(const uint32_t rows, const uint32_t cols, const int real_vec_size = cols % VecSize == 0 ? 
VecSize : 1; auto config = Get1DBlocksAnd2DGrids(ctx, rows, cols, real_vec_size); if (cols % VecSize == 0) { - FusedResidualDropoutBias<<< - config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( - rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual, - bias, mask_data, dst, increment, is_test); + FusedResidualDropoutBias + <<>>( + rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual, + bias, mask_data, dst, increment, is_test); } else { - FusedResidualDropoutBias< - T, uint8_t, - 1><<>>( - rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual, - bias, mask_data, dst, increment, is_test); + FusedResidualDropoutBias + <<>>( + rows, cols, seed, dropout_prob, is_upscale_in_train, src, residual, + bias, mask_data, dst, increment, is_test); } } @@ -263,27 +262,26 @@ void LaunchResidualDropoutBiasGrad(const T *dout, const MaskType *mask, dim3 block_dim(threads, 128, 1); dim3 grid_dim(blocks, 1, 1); if (cols % VecSize == 0) { - FusedResidualDropoutBiasGrad< - T, MaskType, 8, 128, - VecSize><<>>( - dout, mask, factor, rows, cols, dx, dbias); + FusedResidualDropoutBiasGrad + <<>>(dout, mask, factor, rows, + cols, dx, dbias); } else { - FusedResidualDropoutBiasGrad<<>>( - dout, mask, factor, rows, cols, dx, dbias); + FusedResidualDropoutBiasGrad + <<>>(dout, mask, factor, rows, + cols, dx, dbias); } } else { const uint64_t n = rows * cols; platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(ctx, n / real_vec_size); if (n % VecSize == 0) { - FusedResidualDropoutGrad<<< - config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( - dout, mask, factor, n, dx); + FusedResidualDropoutGrad + <<>>( + dout, mask, factor, n, dx); } else { - FusedResidualDropoutGrad<<< - config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>( - dout, mask, factor, n, dx); + FusedResidualDropoutGrad + <<>>( + dout, mask, factor, n, dx); } } } diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cc b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cc index 23b82ac5d96..e316f58b3f7 100644 --- a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cc +++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fused/fused_seqpool_cvm_op.h" + #include namespace paddle { namespace operators { @@ -34,9 +35,10 @@ class FusedSeqpoolCVMOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( cvm_dims.size(), 2UL, platform::errors::InvalidArgument("Input(CVM)'s rank should be 2.")); - PADDLE_ENFORCE_EQ(cvm_dims[1], 2UL, platform::errors::InvalidArgument( - "The 2nd dimension of " - "Input(CVM) should be 2.")); + PADDLE_ENFORCE_EQ( + cvm_dims[1], 2UL, + platform::errors::InvalidArgument("The 2nd dimension of " + "Input(CVM) should be 2.")); auto ins_dims = ctx->GetInputsDim("X"); const int cvm_offset = ctx->Attrs().Get("cvm_offset"); diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu index 3770a536a8f..2b6b7d49345 100644 --- a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu +++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu @@ -13,6 +13,7 @@ // limitations under the License. 
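LaunchResidualDropoutBias above, like LaunchDropoutActBias earlier, uses one dispatch idiom throughout: check whether cols divides by the vector width, then instantiate the kernel with VecSize or fall back to a scalar VecSize of 1. A reduced, self-contained version of the idiom; dropout, bias, and the Get1DBlocksAnd2DGrids helper are dropped to keep the sketch short:

    template <typename T, int VecSize>
    __global__ void ResidualAddSketch(const T* src, const T* residual, T* dst,
                                      int n) {
      int i = (blockIdx.x * blockDim.x + threadIdx.x) * VecSize;
      if (i + VecSize <= n) {
    #pragma unroll
        for (int k = 0; k < VecSize; ++k) {
          dst[i + k] = src[i + k] + residual[i + k];
        }
      }
    }

    void LaunchSketch(const float* src, const float* residual, float* dst,
                      int rows, int cols, cudaStream_t stream) {
      const int n = rows * cols;
      const int kVecSize = 4;
      const int real_vec = (cols % kVecSize == 0) ? kVecSize : 1;
      dim3 block(256);
      dim3 grid((n / real_vec + block.x - 1) / block.x);
      if (cols % kVecSize == 0) {
        ResidualAddSketch<float, kVecSize>
            <<<grid, block, 0, stream>>>(src, residual, dst, n);
      } else {
        ResidualAddSketch<float, 1>
            <<<grid, block, 0, stream>>>(src, residual, dst, n);
      }
    }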
#include + #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/operators/fused/fused_seqpool_cvm_op.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.h b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.h index 6042772adb0..e3bc424f259 100644 --- a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.h +++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" diff --git a/paddle/fluid/operators/fused/fused_softmax_mask.cu.h b/paddle/fluid/operators/fused/fused_softmax_mask.cu.h index 11f1011dec3..4c00f778ced 100644 --- a/paddle/fluid/operators/fused/fused_softmax_mask.cu.h +++ b/paddle/fluid/operators/fused/fused_softmax_mask.cu.h @@ -114,10 +114,9 @@ __global__ void FusedSoftmaxMaskVecKernel(T* dst, const T* src, const T* mask, } } -#define SOFTMAX_MASK_KERNEL(VEC_SIZE, ELEMENTS) \ - FusedSoftmaxMaskVecKernel<<>>( \ - dst, src, mask, seq_len) +#define SOFTMAX_MASK_KERNEL(VEC_SIZE, ELEMENTS) \ + FusedSoftmaxMaskVecKernel \ + <<>>(dst, src, mask, seq_len) // FIXME(wangxi): It is found that the performance of VEC_SIZE=2 is better // than that of =4 and =8. Further analysis of the kernel is needed later. diff --git a/paddle/fluid/operators/fused/fused_transformer_op.cc b/paddle/fluid/operators/fused/fused_transformer_op.cc index 9e5fc42fc76..d11171eb2d0 100644 --- a/paddle/fluid/operators/fused/fused_transformer_op.cc +++ b/paddle/fluid/operators/fused/fused_transformer_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fused/fused_transformer_op.h" + #include namespace paddle { @@ -157,5 +158,5 @@ void FusedMHA::ComputeForward(T* output, T* softmax_mask) {} template void FusedMHA::ComputeBackward(const T* grad_output, T* softmax_mask, T* grad_x) {} -} -} \ No newline at end of file +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_transformer_op.h b/paddle/fluid/operators/fused/fused_transformer_op.h index 2d2d390d243..a2d5862abf0 100644 --- a/paddle/fluid/operators/fused/fused_transformer_op.h +++ b/paddle/fluid/operators/fused/fused_transformer_op.h @@ -151,5 +151,5 @@ class FusedTransformerEncoderLayer { std::string act_method; }; -} -} +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cc b/paddle/fluid/operators/fused/fusion_conv_inception_op.cc index eeeb004003c..802cd18e1db 100644 --- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cc +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cc @@ -14,6 +14,7 @@ limitations under the License. 
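The fused_transformer_op hunks fix two things at once: the missing end-of-file newline and the bare closing braces, which now carry namespace comments. The resulting shape:

    namespace paddle {
    namespace operators {
    // ... definitions ...
    }  // namespace operators
    }  // namespace paddle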
*/ #include #include + #include "paddle/fluid/framework/op_registry.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" @@ -35,8 +36,9 @@ class ConvInceptionFusionOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( in_dims.size(), 4, platform::errors::InvalidArgument("Conv intput should be 4-D tensor.")); - PADDLE_ENFORCE_EQ(w_dims.size(), 4, platform::errors::InvalidArgument( - "There should be 4 filters.")); + PADDLE_ENFORCE_EQ( + w_dims.size(), 4, + platform::errors::InvalidArgument("There should be 4 filters.")); PADDLE_ENFORCE_EQ(w_dims[0][1], in_dims[1], platform::errors::InvalidArgument( "Invalid fileter channel number %d, which should be " diff --git a/paddle/fluid/operators/fused/fusion_group_op.cu.cc b/paddle/fluid/operators/fused/fusion_group_op.cu.cc index 94949f56331..c592bbe7d3e 100644 --- a/paddle/fluid/operators/fused/fusion_group_op.cu.cc +++ b/paddle/fluid/operators/fused/fusion_group_op.cu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fused/fusion_group_op.h" + #include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/fused/fusion_group_op.h b/paddle/fluid/operators/fused/fusion_group_op.h index 5e5f2c60ffb..f71355b85d9 100644 --- a/paddle/fluid/operators/fused/fusion_group_op.h +++ b/paddle/fluid/operators/fused/fusion_group_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_code.h" diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index afbd5380a83..fd05155bc2c 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -13,9 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fused/fusion_gru_op.h" + #include // for memcpy #include #include + #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index 3dada660aef..f2e6f099b4b 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fused/fusion_lstm_op.h" + #include + #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/fc_functor.h" diff --git a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc index bed5125b995..c9d6d42efac 100644 --- a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc +++ b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc @@ -13,8 +13,10 @@ * limitations under the License. 
*/ #include "paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.h" + #include #include + #include "paddle/fluid/operators/jit/kernels.h" namespace paddle { @@ -24,10 +26,11 @@ void FusionRepeatedFCReluOp::InferShape( framework::InferShapeContext* ctx) const { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusionRepeatedFCRelu"); auto sz = ctx->Inputs("W").size(); - PADDLE_ENFORCE_GT(sz, 1UL, platform::errors::InvalidArgument( - "Inputs(W) of FusionRepeatedFCReluOp should " - "be greater than 1, but received value is %d.", - sz)); + PADDLE_ENFORCE_GT(sz, 1UL, + platform::errors::InvalidArgument( + "Inputs(W) of FusionRepeatedFCReluOp should " + "be greater than 1, but received value is %d.", + sz)); PADDLE_ENFORCE_EQ( ctx->Inputs("Bias").size(), sz, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc index ee28a548056..b99b53de9c4 100644 --- a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.h" + #include // for min, max #include + #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/fc_functor.h" diff --git a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc index 58613173ad2..7341d1f864d 100644 --- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.h" + #include + #include "paddle/fluid/platform/cpu_info.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/cpu_vec.h" @@ -48,8 +50,9 @@ void FusionSeqExpandConcatFCOp::InferShape( for (size_t i = 1; i < ins_dims.size(); ++i) { sum += ins_dims[i][1]; } - PADDLE_ENFORCE_EQ(sum, w_dims[0], platform::errors::InvalidArgument( - "FC height should be sum of all inputs " + PADDLE_ENFORCE_EQ( + sum, w_dims[0], + platform::errors::InvalidArgument("FC height should be sum of all inputs " "width, but received FC height is: %d, " "sum of all inputs width is: %d.", w_dims[0], sum)); diff --git a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc index e574d67e398..1d487ef3dab 100644 --- a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc @@ -13,8 +13,10 @@ * limitations under the License. 
*/ #include "paddle/fluid/operators/fused/fusion_seqpool_concat_op.h" + #include #include + #include "paddle/fluid/operators/jit/kernels.h" namespace paddle { @@ -29,17 +31,19 @@ void FusionSeqPoolConcatOp::InferShape( ctx->Inputs("X").size())); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "FusionSeqPoolConcat"); int axis = ctx->Attrs().Get("axis"); - PADDLE_ENFORCE_EQ(axis, 1, platform::errors::InvalidArgument( - "FusionSeqPoolConcatOp only supports concat " - "axis=1 yet, but received axis value is %d", - axis)); + PADDLE_ENFORCE_EQ(axis, 1, + platform::errors::InvalidArgument( + "FusionSeqPoolConcatOp only supports concat " + "axis=1 yet, but received axis value is %d", + axis)); auto ins_dims = ctx->GetInputsDim("X"); const size_t n = ins_dims.size(); - PADDLE_ENFORCE_GT(n, 0UL, platform::errors::InvalidArgument( - "Input tensors count should be greater than 0, " - "but received value is %d.", - n)); + PADDLE_ENFORCE_GT(n, 0UL, + platform::errors::InvalidArgument( + "Input tensors count should be greater than 0, " + "but received value is %d.", + n)); if (n == 1) { LOG(WARNING) << "Only have one input, may waste memory"; } diff --git a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc index c74cc504840..d29bc00b545 100644 --- a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc @@ -13,8 +13,10 @@ * limitations under the License. */ #include "paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.h" + #include #include + #include "paddle/fluid/operators/jit/kernels.h" namespace paddle { @@ -31,20 +33,23 @@ void FusionSeqPoolCVMConcatOp::InferShape( paddle::platform::errors::InvalidArgument( "Output(Out) of FusionSeqPoolCVMConcatOp should not be null.")); int axis = ctx->Attrs().Get("axis"); - PADDLE_ENFORCE_EQ(axis, 1, paddle::platform::errors::InvalidArgument( - "FusionSeqPoolCVMConcatOp only supports " - "concat axis=1 yet, but received %d.", - axis)); + PADDLE_ENFORCE_EQ(axis, 1, + paddle::platform::errors::InvalidArgument( + "FusionSeqPoolCVMConcatOp only supports " + "concat axis=1 yet, but received %d.", + axis)); bool use_cvm = ctx->Attrs().Get("use_cvm"); - PADDLE_ENFORCE_EQ(use_cvm, true, paddle::platform::errors::InvalidArgument( - "FusionSeqPoolCVMConcatOp only supports " - "use_cvm is true yet, but received %d.", - use_cvm)); + PADDLE_ENFORCE_EQ(use_cvm, true, + paddle::platform::errors::InvalidArgument( + "FusionSeqPoolCVMConcatOp only supports " + "use_cvm is true yet, but received %d.", + use_cvm)); auto ins_dims = ctx->GetInputsDim("X"); const size_t n = ins_dims.size(); - PADDLE_ENFORCE_GT(n, 0UL, paddle::platform::errors::InvalidArgument( - "Input tensors count should > 0.")); + PADDLE_ENFORCE_GT(n, 0UL, + paddle::platform::errors::InvalidArgument( + "Input tensors count should > 0.")); if (n == 1) { LOG(WARNING) << "Only have one input, may waste memory"; } diff --git a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc index 870f72b8c7f..047fefc1eeb 100644 --- a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc +++ b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc @@ -13,8 +13,10 @@ * limitations under the License. 
*/ #include "paddle/fluid/operators/fused/fusion_squared_mat_sub_op.h" + #include #include + #include "paddle/fluid/operators/jit/kernels.h" namespace paddle { diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc index 954cd7cc7a4..bf8e9818e54 100644 --- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc +++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h" + #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc index 786f5b4e077..eb29859d8d1 100644 --- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc +++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h" + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h index 66e6c00da2d..52140c0ca46 100644 --- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h +++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/phi/core/ddim.h" namespace paddle { diff --git a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc index 0ffc4c91b85..c9956dcdd20 100644 --- a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include + #include "dnnl.hpp" #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/operator.h" @@ -31,8 +32,8 @@ using paddle::platform::CPUDeviceContext; using paddle::platform::CreateKey; using paddle::platform::MKLDNNGetDataType; using paddle::platform::MKLDNNMemDesc; -using platform::to_void_cast; using phi::vectorize; +using platform::to_void_cast; using Direction = dnnl::rnn_direction; namespace { diff --git a/paddle/fluid/operators/fused/multi_gru_op.cc b/paddle/fluid/operators/fused/multi_gru_op.cc index e7d697767fc..ad0cc0bd1cf 100644 --- a/paddle/fluid/operators/fused/multi_gru_op.cc +++ b/paddle/fluid/operators/fused/multi_gru_op.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include // for memcpy #include #include + #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/fc_functor.h" diff --git a/paddle/fluid/operators/fused/multi_gru_op.h b/paddle/fluid/operators/fused/multi_gru_op.h index ebd3faf44a8..8b064c8754f 100644 --- a/paddle/fluid/operators/fused/multi_gru_op.h +++ b/paddle/fluid/operators/fused/multi_gru_op.h @@ -19,9 +19,9 @@ limitations under the License. 
*/ namespace paddle { namespace operators { +using framework::ExecutionContext; using framework::LoDTensor; using framework::Tensor; -using framework::ExecutionContext; class MultiGRUOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/fused/multihead_matmul_op.cc b/paddle/fluid/operators/fused/multihead_matmul_op.cc index 8f2c04d5afe..79b886c3729 100644 --- a/paddle/fluid/operators/fused/multihead_matmul_op.cc +++ b/paddle/fluid/operators/fused/multihead_matmul_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/errors.h" diff --git a/paddle/fluid/operators/fused/multihead_matmul_op.cu b/paddle/fluid/operators/fused/multihead_matmul_op.cu index f0e05659c92..30155346716 100644 --- a/paddle/fluid/operators/fused/multihead_matmul_op.cu +++ b/paddle/fluid/operators/fused/multihead_matmul_op.cu @@ -13,7 +13,9 @@ // limitations under the License. #include + #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/math/bert_encoder_functor.h" @@ -105,8 +107,8 @@ void TransQKVWithBias(const int batch, const int seq_len, const int head_size, platform::errors::InvalidArgument( "head_num (%d) * head_size (%d) should <= %d", head_num, head_size, 1024 * 4)); - TransposeQkvKernel<<>>(h, input4, bias4, - output4); + TransposeQkvKernel + <<>>(h, input4, bias4, output4); } else if (head_size % 2 == 0 && scratch_size % 2 == 0) { const int h = head_size / 2; const float2 *input2 = reinterpret_cast(input); @@ -118,8 +120,8 @@ void TransQKVWithBias(const int batch, const int seq_len, const int head_size, platform::errors::InvalidArgument( "head_num (%d) * head_size (%d) should <= %d", head_num, head_size, 1024 * 2)); - TransposeQkvKernel<<>>(h, input2, bias2, - output2); + TransposeQkvKernel + <<>>(h, input2, bias2, output2); } else { const dim3 block(head_size, head_num, 1); // limit head_size * head_num to max block size(1024). @@ -127,8 +129,8 @@ void TransQKVWithBias(const int batch, const int seq_len, const int head_size, platform::errors::InvalidArgument( "head_num (%d) * head_size (%d) should <= %d", head_num, head_size, 1024)); - TransposeQkvKernel<<>>(head_size, input, - bias, output); + TransposeQkvKernel + <<>>(head_size, input, bias, output); } } diff --git a/paddle/fluid/operators/fused/resnet_unit_op.cc b/paddle/fluid/operators/fused/resnet_unit_op.cc index 6f4246aadd9..d5860fe9cf1 100644 --- a/paddle/fluid/operators/fused/resnet_unit_op.cc +++ b/paddle/fluid/operators/fused/resnet_unit_op.cc @@ -115,13 +115,14 @@ class ResNetUnitOp : public framework::OperatorWithKernel { bn_param_shape = {1, 1, 1, bn_param_shape[0]}; } framework::DDim bn_param_dims = phi::make_ddim(bn_param_shape); - PADDLE_ENFORCE_EQ(x_dims.size(), 4, platform::errors::InvalidArgument( - "The dimensions of input " - "must equal to 4." - "But received: the shape of input " - "= [%s], the dimension of input = " - "[%d]", - x_dims, x_dims.size())); + PADDLE_ENFORCE_EQ( + x_dims.size(), 4, + platform::errors::InvalidArgument("The dimensions of input " + "must equal to 4." 
+ "But received: the shape of input " + "= [%s], the dimension of input = " + "[%d]", + x_dims, x_dims.size())); PADDLE_ENFORCE_EQ(w_dims.size(), 4, platform::errors::InvalidArgument( "The dimensions of filter " @@ -180,14 +181,16 @@ class ResNetUnitOp : public framework::OperatorWithKernel { // and var tensors should be float when input tensor's dtype is float16. auto bn_param_type = framework::proto::VarType::FP32; - PADDLE_ENFORCE_EQ(bn_param_type, framework::TransToProtoVarType( - ctx.Input("ScaleX")->dtype()), - platform::errors::InvalidArgument( - "Scale input should be of float type")); - PADDLE_ENFORCE_EQ(bn_param_type, framework::TransToProtoVarType( - ctx.Input("BiasX")->dtype()), - platform::errors::InvalidArgument( - "Bias input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType(ctx.Input("ScaleX")->dtype()), + platform::errors::InvalidArgument( + "Scale input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType(ctx.Input("BiasX")->dtype()), + platform::errors::InvalidArgument( + "Bias input should be of float type")); framework::LibraryType library = framework::LibraryType::kPlain; framework::DataLayout layout = framework::DataLayout::kAnyLayout; return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, diff --git a/paddle/fluid/operators/fused/skip_layernorm_op.cc b/paddle/fluid/operators/fused/skip_layernorm_op.cc index 442f359c0da..6ac6f51e4ce 100644 --- a/paddle/fluid/operators/fused/skip_layernorm_op.cc +++ b/paddle/fluid/operators/fused/skip_layernorm_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/errors.h" diff --git a/paddle/fluid/operators/fused/skip_layernorm_op.cu b/paddle/fluid/operators/fused/skip_layernorm_op.cu index e755ea33755..66a164ff31b 100644 --- a/paddle/fluid/operators/fused/skip_layernorm_op.cu +++ b/paddle/fluid/operators/fused/skip_layernorm_op.cu @@ -13,7 +13,9 @@ // limitations under the License. #include + #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/math/bert_encoder_functor.h" diff --git a/paddle/fluid/operators/fused_softmax_mask_op.cc b/paddle/fluid/operators/fused_softmax_mask_op.cc index a4138002833..a33070d94b9 100644 --- a/paddle/fluid/operators/fused_softmax_mask_op.cc +++ b/paddle/fluid/operators/fused_softmax_mask_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fused_softmax_mask_op.h" + #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/fused_softmax_mask_op.cu b/paddle/fluid/operators/fused_softmax_mask_op.cu index c4ab4de8a64..b68a6907d7a 100644 --- a/paddle/fluid/operators/fused_softmax_mask_op.cu +++ b/paddle/fluid/operators/fused_softmax_mask_op.cu @@ -40,6 +40,7 @@ limitations under the License. 
*/ #include #include #include + #include #include diff --git a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cc b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cc index c737ba361e0..eefca7b6ab5 100644 --- a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cc +++ b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cc @@ -11,6 +11,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.h" + #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu index d4c5b887705..4ee90eb3184 100644 --- a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu +++ b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu @@ -39,6 +39,7 @@ limitations under the License. */ #include #include #include + #include #include @@ -395,49 +396,49 @@ class SoftmaxMaskFuseUpperTriangleKernel : public framework::OpKernel { switch (pow2_index) { case 5: // 32 - SoftmaxMaskFuseUpperTriangleGPUKernel< - T, 5><<>>(x_data, y_data, batch_count, - key_seq_len); + SoftmaxMaskFuseUpperTriangleGPUKernel + <<>>(x_data, y_data, batch_count, + key_seq_len); break; case 6: // 64 - SoftmaxMaskFuseUpperTriangleGPUKernel< - T, 6><<>>(x_data, y_data, batch_count, - key_seq_len); + SoftmaxMaskFuseUpperTriangleGPUKernel + <<>>(x_data, y_data, batch_count, + key_seq_len); break; case 7: // 128 - SoftmaxMaskFuseUpperTriangleGPUKernel< - T, 7><<>>(x_data, y_data, batch_count, - key_seq_len); + SoftmaxMaskFuseUpperTriangleGPUKernel + <<>>(x_data, y_data, batch_count, + key_seq_len); break; case 8: // 256 - SoftmaxMaskFuseUpperTriangleGPUKernel< - T, 8><<>>(x_data, y_data, batch_count, - key_seq_len); + SoftmaxMaskFuseUpperTriangleGPUKernel + <<>>(x_data, y_data, batch_count, + key_seq_len); break; case 9: // 512 - SoftmaxMaskFuseUpperTriangleGPUKernel< - T, 9><<>>(x_data, y_data, batch_count, - key_seq_len); + SoftmaxMaskFuseUpperTriangleGPUKernel + <<>>(x_data, y_data, batch_count, + key_seq_len); break; case 10: // 1024 - SoftmaxMaskFuseUpperTriangleGPUKernel< - T, 10><<>>(x_data, y_data, batch_count, - key_seq_len); + SoftmaxMaskFuseUpperTriangleGPUKernel + <<>>(x_data, y_data, batch_count, + key_seq_len); break; case 11: // 2048 - SoftmaxMaskFuseUpperTriangleGPUKernel< - T, 11><<>>(x_data, y_data, batch_count, - key_seq_len); + SoftmaxMaskFuseUpperTriangleGPUKernel + <<>>(x_data, y_data, batch_count, + key_seq_len); break; case 12: // 4096 - SoftmaxMaskFuseUpperTriangleGPUKernel< - T, 12><<>>(x_data, y_data, batch_count, - key_seq_len); + SoftmaxMaskFuseUpperTriangleGPUKernel + <<>>(x_data, y_data, batch_count, + key_seq_len); break; case 13: // 8192 - SoftmaxMaskFuseUpperTriangleGPUKernel< - T, 13><<>>(x_data, y_data, batch_count, - key_seq_len); + SoftmaxMaskFuseUpperTriangleGPUKernel + <<>>(x_data, y_data, batch_count, + key_seq_len); break; default: break; @@ -483,58 +484,58 @@ class SoftmaxMaskFuseUpperTriangleGradKernel : public framework::OpKernel { switch (pow2_index) { case 5: // 32 - SoftmaxMaskFuseUpperTriangleGradGPUKernel< - T, 5><<>>(grad_y_data, grad_x_data, - softmax_rst_data, batch_count, - key_seq_len); + SoftmaxMaskFuseUpperTriangleGradGPUKernel + <<>>(grad_y_data, 
grad_x_data, + softmax_rst_data, batch_count, + key_seq_len); break; case 6: // 64 - SoftmaxMaskFuseUpperTriangleGradGPUKernel< - T, 6><<>>(grad_y_data, grad_x_data, - softmax_rst_data, batch_count, - key_seq_len); + SoftmaxMaskFuseUpperTriangleGradGPUKernel + <<>>(grad_y_data, grad_x_data, + softmax_rst_data, batch_count, + key_seq_len); break; case 7: // 128 - SoftmaxMaskFuseUpperTriangleGradGPUKernel< - T, 7><<>>(grad_y_data, grad_x_data, - softmax_rst_data, batch_count, - key_seq_len); + SoftmaxMaskFuseUpperTriangleGradGPUKernel + <<>>(grad_y_data, grad_x_data, + softmax_rst_data, batch_count, + key_seq_len); break; case 8: // 256 - SoftmaxMaskFuseUpperTriangleGradGPUKernel< - T, 8><<>>(grad_y_data, grad_x_data, - softmax_rst_data, batch_count, - key_seq_len); + SoftmaxMaskFuseUpperTriangleGradGPUKernel + <<>>(grad_y_data, grad_x_data, + softmax_rst_data, batch_count, + key_seq_len); break; case 9: // 512 - SoftmaxMaskFuseUpperTriangleGradGPUKernel< - T, 9><<>>(grad_y_data, grad_x_data, - softmax_rst_data, batch_count, - key_seq_len); + SoftmaxMaskFuseUpperTriangleGradGPUKernel + <<>>(grad_y_data, grad_x_data, + softmax_rst_data, batch_count, + key_seq_len); break; case 10: // 1024 - SoftmaxMaskFuseUpperTriangleGradGPUKernel< - T, 10><<>>(grad_y_data, grad_x_data, - softmax_rst_data, - batch_count, key_seq_len); + SoftmaxMaskFuseUpperTriangleGradGPUKernel + <<>>(grad_y_data, grad_x_data, + softmax_rst_data, batch_count, + key_seq_len); break; case 11: // 2048 - SoftmaxMaskFuseUpperTriangleGradGPUKernel< - T, 11><<>>(grad_y_data, grad_x_data, - softmax_rst_data, - batch_count, key_seq_len); + SoftmaxMaskFuseUpperTriangleGradGPUKernel + <<>>(grad_y_data, grad_x_data, + softmax_rst_data, batch_count, + key_seq_len); break; case 12: // 4096 - SoftmaxMaskFuseUpperTriangleGradGPUKernel< - T, 12><<>>(grad_y_data, grad_x_data, - softmax_rst_data, - batch_count, key_seq_len); + SoftmaxMaskFuseUpperTriangleGradGPUKernel + <<>>(grad_y_data, grad_x_data, + softmax_rst_data, batch_count, + key_seq_len); break; case 13: // 8192 - SoftmaxMaskFuseUpperTriangleGradGPUKernel< - T, 13><<>>(grad_y_data, grad_x_data, - softmax_rst_data, - batch_count, key_seq_len); + SoftmaxMaskFuseUpperTriangleGradGPUKernel + <<>>(grad_y_data, grad_x_data, + softmax_rst_data, batch_count, + key_seq_len); break; default: break; diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index 9f2b48a24b4..d44dd324d6c 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -153,7 +153,7 @@ REGISTER_OPERATOR(gather_grad, ops::GatherGradOp, ops::GatherGradNoNeedBufferVarInferer, GatherGradInferShapeFunctor); -REGISTER_OP_VERSION(gather) - .AddCheckpoint(R"ROC(upgrad gather, add a new input [Axis])ROC", - paddle::framework::compatible::OpVersionDesc().NewInput( - "Axis", "Specify the axis of gather operation.")); +REGISTER_OP_VERSION(gather).AddCheckpoint( + R"ROC(upgrad gather, add a new input [Axis])ROC", + paddle::framework::compatible::OpVersionDesc().NewInput( + "Axis", "Specify the axis of gather operation.")); diff --git a/paddle/fluid/operators/gather_op_xpu.cc b/paddle/fluid/operators/gather_op_xpu.cc index 9dd8f58d242..327eec2a6ca 100644 --- a/paddle/fluid/operators/gather_op_xpu.cc +++ b/paddle/fluid/operators/gather_op_xpu.cc @@ -168,10 +168,11 @@ class GatherGradOpXPUKernel : public framework::OpKernel { r = xpu::cast_v2(dev_ctx.x_context(), index->data(), index_int_ptr_l3, index->numel()); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, 
platform::errors::External( - "XPU API(cast_v2) return wrong " - "value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(cast_v2) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); r = xpu::gather_grad( dev_ctx.x_context(), diff --git a/paddle/fluid/operators/gather_scatter_kernel.cu b/paddle/fluid/operators/gather_scatter_kernel.cu index f97eb3d5e9d..6c4a7a01f3f 100644 --- a/paddle/fluid/operators/gather_scatter_kernel.cu +++ b/paddle/fluid/operators/gather_scatter_kernel.cu @@ -132,10 +132,11 @@ struct gpu_gather_scatter_functor { int64_t grid = (n + block - 1) / block; auto stream = reinterpret_cast(ctx).stream(); - GatherScatterGPUKernel<<>>( - self_data, dim, index_data, src_data, inner_dim_size, select_dim_size, - replaced_select_dim_size, outer_dim_size, index_size, reduce_op); + GatherScatterGPUKernel + <<>>(self_data, dim, index_data, src_data, + inner_dim_size, select_dim_size, + replaced_select_dim_size, outer_dim_size, + index_size, reduce_op); } }; // struct gpu_gather_scatter_functor diff --git a/paddle/fluid/operators/gather_test.cc b/paddle/fluid/operators/gather_test.cc index c962dd06523..676143bf011 100644 --- a/paddle/fluid/operators/gather_test.cc +++ b/paddle/fluid/operators/gather_test.cc @@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/phi/kernels/funcs/gather.h" + #include #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/place.h" -#include "paddle/phi/kernels/funcs/gather.h" TEST(Gather, GatherData) { paddle::framework::Tensor* src = new paddle::framework::Tensor(); diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index deac932d59b..1e89091b202 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/operators/gaussian_random_op_xpu.cc b/paddle/fluid/operators/gaussian_random_op_xpu.cc index 5a1ac46f615..2ffc90fbd8c 100644 --- a/paddle/fluid/operators/gaussian_random_op_xpu.cc +++ b/paddle/fluid/operators/gaussian_random_op_xpu.cc @@ -15,6 +15,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include + #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/gelu_op.cc b/paddle/fluid/operators/gelu_op.cc index 3be2606bfc9..080ceaa45e3 100644 --- a/paddle/fluid/operators/gelu_op.cc +++ b/paddle/fluid/operators/gelu_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/operators/gelu_op_xpu.cc b/paddle/fluid/operators/gelu_op_xpu.cc index 559d2448ad9..408638f7d2c 100644 --- a/paddle/fluid/operators/gelu_op_xpu.cc +++ b/paddle/fluid/operators/gelu_op_xpu.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor.h" diff --git a/paddle/fluid/operators/graph_khop_sampler_op.cc b/paddle/fluid/operators/graph_khop_sampler_op.cc index c83ee258406..edf7d20c6d5 100644 --- a/paddle/fluid/operators/graph_khop_sampler_op.cc +++ b/paddle/fluid/operators/graph_khop_sampler_op.cc @@ -19,10 +19,11 @@ namespace operators { void InputShapeCheck(const framework::DDim& dims, std::string tensor_name) { if (dims.size() == 2) { - PADDLE_ENFORCE_EQ(dims[1], 1, platform::errors::InvalidArgument( - "The last dim of %s should be 1 when it " - "is 2D, but we get %d", - tensor_name, dims[1])); + PADDLE_ENFORCE_EQ(dims[1], 1, + platform::errors::InvalidArgument( + "The last dim of %s should be 1 when it " + "is 2D, but we get %d", + tensor_name, dims[1])); } else { PADDLE_ENFORCE_EQ( dims.size(), 1, diff --git a/paddle/fluid/operators/graph_khop_sampler_op.cu b/paddle/fluid/operators/graph_khop_sampler_op.cu index df977b43512..a63fdc89e24 100644 --- a/paddle/fluid/operators/graph_khop_sampler_op.cu +++ b/paddle/fluid/operators/graph_khop_sampler_op.cu @@ -26,6 +26,7 @@ limitations under the License. */ #include #include #include + #include #ifdef PADDLE_WITH_HIP @@ -217,15 +218,16 @@ void SampleNeighbors(const framework::ExecutionContext& ctx, const T* src, constexpr int TILE_SIZE = BLOCK_WARPS * 16; const dim3 block(WARP_SIZE, BLOCK_WARPS); const dim3 grid((bs + TILE_SIZE - 1) / TILE_SIZE); - GraphSampleNeighborsCUDAKernel<<< - grid, block, 0, - reinterpret_cast(ctx.device_context()) - .stream()>>>( - 0, k, bs, thrust::raw_pointer_cast(inputs->data()), src, dst_count, - src_eids, thrust::raw_pointer_cast(outputs->data()), - thrust::raw_pointer_cast(outputs_eids->data()), - thrust::raw_pointer_cast(output_ptr.data()), - thrust::raw_pointer_cast(output_idxs.data()), return_eids); + GraphSampleNeighborsCUDAKernel + <<( + ctx.device_context()) + .stream()>>>( + 0, k, bs, thrust::raw_pointer_cast(inputs->data()), src, dst_count, + src_eids, thrust::raw_pointer_cast(outputs->data()), + thrust::raw_pointer_cast(outputs_eids->data()), + thrust::raw_pointer_cast(output_ptr.data()), + thrust::raw_pointer_cast(output_idxs.data()), return_eids); // 5. Get inputs = outputs - inputs: if (!is_last_layer) { @@ -264,19 +266,19 @@ void FillHashTable(const framework::ExecutionContext& ctx, const T* input, int grid_tmp = (num_input + block - 1) / block; int grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; // 1. Insert data into keys and values. - BuildHashTable< - T><<( - ctx.device_context()) - .stream()>>>( + BuildHashTable<<( + ctx.device_context()) + .stream()>>>( input, num_input, len_hashtable, thrust::raw_pointer_cast(keys->data()), thrust::raw_pointer_cast(key_index->data())); // 2. Get item index count. thrust::device_vector item_count(num_input + 1, 0); - GetItemIndexCount< - T><<( - ctx.device_context()) - .stream()>>>( + GetItemIndexCount<<( + ctx.device_context()) + .stream()>>>( input, thrust::raw_pointer_cast(item_count.data()), num_input, len_hashtable, thrust::raw_pointer_cast(keys->data()), thrust::raw_pointer_cast(key_index->data())); @@ -287,16 +289,16 @@ void FillHashTable(const framework::ExecutionContext& ctx, const T* input, unique_items->resize(total_unique_items); // 3. Get unique items. 
- FillUniqueItems< - T><<( - ctx.device_context()) - .stream()>>>( - input, num_input, len_hashtable, - thrust::raw_pointer_cast(unique_items->data()), - thrust::raw_pointer_cast(item_count.data()), - thrust::raw_pointer_cast(keys->data()), - thrust::raw_pointer_cast(values->data()), - thrust::raw_pointer_cast(key_index->data())); + FillUniqueItems + <<( + ctx.device_context()) + .stream()>>>(input, num_input, len_hashtable, + thrust::raw_pointer_cast(unique_items->data()), + thrust::raw_pointer_cast(item_count.data()), + thrust::raw_pointer_cast(keys->data()), + thrust::raw_pointer_cast(values->data()), + thrust::raw_pointer_cast(key_index->data())); } template @@ -337,23 +339,23 @@ void ReindexFunc(const framework::ExecutionContext& ctx, int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int64_t grid_tmp = (outputs->size() + block - 1) / block; int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; - ReindexSrcOutput< - T><<( - ctx.device_context()) - .stream()>>>( + ReindexSrcOutput<<( + ctx.device_context()) + .stream()>>>( thrust::raw_pointer_cast(outputs->data()), outputs->size(), size, thrust::raw_pointer_cast(keys.data()), thrust::raw_pointer_cast(values.data())); int grid_ = (bs + block - 1) / block; - ReindexInputNodes<<( - ctx.device_context()) - .stream()>>>( - thrust::raw_pointer_cast(orig_nodes->data()), bs, - thrust::raw_pointer_cast(reindex_nodes->data()), size, - thrust::raw_pointer_cast(keys.data()), - thrust::raw_pointer_cast(values.data())); + ReindexInputNodes + <<( + ctx.device_context()) + .stream()>>>(thrust::raw_pointer_cast(orig_nodes->data()), bs, + thrust::raw_pointer_cast(reindex_nodes->data()), size, + thrust::raw_pointer_cast(keys.data()), + thrust::raw_pointer_cast(values.data())); } template @@ -532,15 +534,16 @@ class GraphKhopSamplerOpCUDAKernel : public framework::OpKernel { const dim3 block(WARP_SIZE, BLOCK_WARPS); const dim3 grid((unique_dst_size + TILE_SIZE - 1) / TILE_SIZE); - GetDstEdgeCUDAKernel<<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>( - unique_dst_size, - thrust::raw_pointer_cast(unique_dst_merge_reindex.data()), - thrust::raw_pointer_cast(dst_sample_counts_merge.data()), - thrust::raw_pointer_cast(dst_ptr.data()), - thrust::raw_pointer_cast(dst_merge.data())); + GetDstEdgeCUDAKernel + <<( + ctx.device_context()) + .stream()>>>( + unique_dst_size, + thrust::raw_pointer_cast(unique_dst_merge_reindex.data()), + thrust::raw_pointer_cast(dst_sample_counts_merge.data()), + thrust::raw_pointer_cast(dst_ptr.data()), + thrust::raw_pointer_cast(dst_merge.data())); // 8. Give operator's outputs. auto* out_src = ctx.Output("Out_Src"); diff --git a/paddle/fluid/operators/graph_khop_sampler_op.h b/paddle/fluid/operators/graph_khop_sampler_op.h index d7121cb5493..1005a6ab11c 100644 --- a/paddle/fluid/operators/graph_khop_sampler_op.h +++ b/paddle/fluid/operators/graph_khop_sampler_op.h @@ -15,10 +15,12 @@ limitations under the License. */ #pragma once #include + #include #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc index f6a1e20a1a1..4d989ed1f2e 100644 --- a/paddle/fluid/operators/group_norm_op.cc +++ b/paddle/fluid/operators/group_norm_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/group_norm_op.h" + #include #include #include diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index bb8031b0cc4..84eb2fbc7d3 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -322,9 +322,9 @@ class GroupNormKernel ScalarGetMeanAndVarNCHW<<>>( x_data, mean_data, temp_var_data, size); } else { - VectorizedGetMeanAndVarNCHW< - T, AccT, vec_size><<>>( - x_data, mean_data, temp_var_data, size); + VectorizedGetMeanAndVarNCHW + <<>>(x_data, mean_data, + temp_var_data, size); } } else { set_zero(dev_ctx, mean, static_cast(0)); @@ -613,16 +613,16 @@ class GroupNormGradKernel } block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize); dim3 blocks(block_size_nchw); - ScalarGetDsDbCUDAKernel< - T><<>>( - imsize, x_data, dy_data, ds_data, db_data); + ScalarGetDsDbCUDAKernel + <<>>( + imsize, x_data, dy_data, ds_data, db_data); if (d_scale || d_bias) { const int block = 256; - GetScaleBiasGradientCUDAKernel< - T><<<(C + block - 1) / block, block, 0, dev_ctx.stream()>>>( - x_dims[0], C, groups, epsilon, mean_data, var_data, ds_data, - db_data, d_scale_data, d_bias_data); + GetScaleBiasGradientCUDAKernel + <<<(C + block - 1) / block, block, 0, dev_ctx.stream()>>>( + x_dims[0], C, groups, epsilon, mean_data, var_data, ds_data, + db_data, d_scale_data, d_bias_data); } if (d_x_data != nullptr) { @@ -639,10 +639,10 @@ class GroupNormGradKernel T* p2_data = p2.data(); T* p3_data = p3.data(); - GetBackwardParamsCUDAKernel<<< - dim3(x_dims[0], groups), block_dims, 0, dev_ctx.stream()>>>( - imsize, groups, group_size, epsilon, mean_data, var_data, - scale_data, ds_data, db_data, p1_data, p2_data, p3_data); + GetBackwardParamsCUDAKernel + <<>>( + imsize, groups, group_size, epsilon, mean_data, var_data, + scale_data, ds_data, db_data, p1_data, p2_data, p3_data); GetXGradientCUDAKernel<<>>( imsize, C, group_size, groups, p1_data, p2_data, p3_data, x_data, dy_data, d_x_data); diff --git a/paddle/fluid/operators/group_norm_op.h b/paddle/fluid/operators/group_norm_op.h index 2d80ab89471..28a3ad2a8e1 100644 --- a/paddle/fluid/operators/group_norm_op.h +++ b/paddle/fluid/operators/group_norm_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/group_norm_op_npu.cc b/paddle/fluid/operators/group_norm_op_npu.cc index 8de8647186e..dfc509941bc 100644 --- a/paddle/fluid/operators/group_norm_op_npu.cc +++ b/paddle/fluid/operators/group_norm_op_npu.cc @@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/group_norm_op.h" #include + +#include "paddle/fluid/operators/group_norm_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc index 58cbdfda347..21ad5914c5d 100644 --- a/paddle/fluid/operators/gru_op.cc +++ b/paddle/fluid/operators/gru_op.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/gru_op.h" + #include #include + #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h" #include "paddle/phi/kernels/funcs/detail/gru_kernel.h" diff --git a/paddle/fluid/operators/gru_op.h b/paddle/fluid/operators/gru_op.h index 852655034c8..4cc6c65983f 100644 --- a/paddle/fluid/operators/gru_op.h +++ b/paddle/fluid/operators/gru_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/detail/activation_functions.h" diff --git a/paddle/fluid/operators/gru_unit_op.cc b/paddle/fluid/operators/gru_unit_op.cc index 8998c51f0df..b6d9ef50f83 100644 --- a/paddle/fluid/operators/gru_unit_op.cc +++ b/paddle/fluid/operators/gru_unit_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/gru_unit_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/gru_unit_op.h b/paddle/fluid/operators/gru_unit_op.h index 291f5f4ad26..2dd1515919b 100644 --- a/paddle/fluid/operators/gru_unit_op.h +++ b/paddle/fluid/operators/gru_unit_op.h @@ -77,9 +77,9 @@ class GRUUnitKernel : public framework::OpKernel { // calculate unactivated gate outputs if (bias) { auto b = framework::EigenMatrix::From(*bias); - g.device(place) = x + - b.reshape(Eigen::array({{1, frame_size * 3}})) - .broadcast(Eigen::array({{batch_size, 1}})); + g.device(place) = + x + b.reshape(Eigen::array({{1, frame_size * 3}})) + .broadcast(Eigen::array({{batch_size, 1}})); } else { g.device(place) = x; } diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc index cce80518354..f72fe9282ab 100644 --- a/paddle/fluid/operators/hinge_loss_op.cc +++ b/paddle/fluid/operators/hinge_loss_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/hinge_loss_op.h" + #include #include #include diff --git a/paddle/fluid/operators/huber_loss_op_xpu.cc b/paddle/fluid/operators/huber_loss_op_xpu.cc index ccddec27795..2fafd186215 100644 --- a/paddle/fluid/operators/huber_loss_op_xpu.cc +++ b/paddle/fluid/operators/huber_loss_op_xpu.cc @@ -39,10 +39,11 @@ class HuberLossXPUKernel : public framework::OpKernel { ctx.template device_context(); int r = xpu::huber_loss(dev_ctx.x_context(), in0_data, in1_data, residual_data, out_data, in0->numel(), 1, delta); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( - "XPU API(huber_loss) return wrong " - "value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(huber_loss) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); } }; diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc index d248857b8f4..107384742bb 100644 --- a/paddle/fluid/operators/im2sequence_op.cc +++ b/paddle/fluid/operators/im2sequence_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/im2sequence_op.h" + #include #include #include diff --git a/paddle/fluid/operators/im2sequence_op.h b/paddle/fluid/operators/im2sequence_op.h index b0c4b9b4a99..218161fd00a 100644 --- a/paddle/fluid/operators/im2sequence_op.h +++ b/paddle/fluid/operators/im2sequence_op.h @@ -15,6 +15,7 @@ #pragma once #include #include + #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/index_impl.cu.h b/paddle/fluid/operators/index_impl.cu.h index bb26e2f445e..d8417e42e1b 100644 --- a/paddle/fluid/operators/index_impl.cu.h +++ b/paddle/fluid/operators/index_impl.cu.h @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -73,16 +74,16 @@ void IndexKernel(const KPDevice &dev_ctx, Tensor *out, Functor func) { size_t main_offset = (numel / (vec_size * block)) * vec_size * block; switch (vec_size) { case 4: - VectorizedIndexKernel<<>>( - out_data, numel, main_offset, func); + VectorizedIndexKernel + <<>>(out_data, numel, main_offset, func); break; case 2: - VectorizedIndexKernel<<>>( - out_data, numel, main_offset, func); + VectorizedIndexKernel + <<>>(out_data, numel, main_offset, func); break; case 1: - VectorizedIndexKernel<<>>( - out_data, numel, main_offset, func); + VectorizedIndexKernel + <<>>(out_data, numel, main_offset, func); break; default: { PADDLE_THROW(paddle::platform::errors::Unimplemented( diff --git a/paddle/fluid/operators/index_sample_op.cc b/paddle/fluid/operators/index_sample_op.cc index d17c6368c75..15fc0f6d14f 100644 --- a/paddle/fluid/operators/index_sample_op.cc +++ b/paddle/fluid/operators/index_sample_op.cc @@ -13,11 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include "paddle/fluid/framework/no_need_buffer_vars_inference.h" -#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/no_need_buffer_vars_inference.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/enforce.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/binary.h" namespace paddle { diff --git a/paddle/fluid/operators/index_select_op.h b/paddle/fluid/operators/index_select_op.h index 684829be269..c82aaab0fe1 100644 --- a/paddle/fluid/operators/index_select_op.h +++ b/paddle/fluid/operators/index_select_op.h @@ -14,6 +14,7 @@ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc index d420d0319bf..6cb8d664d80 100644 --- a/paddle/fluid/operators/inplace_abn_op.cc +++ b/paddle/fluid/operators/inplace_abn_op.cc @@ -13,9 +13,11 @@ // limitations under the License. 
#include "paddle/fluid/operators/inplace_abn_op.h" + #include #include #include + #include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/phi/kernels/batch_norm_grad_kernel.h" #include "paddle/phi/kernels/batch_norm_kernel.h" @@ -38,18 +40,21 @@ class InplaceABNOp : public paddle::operators::BatchNormOp { if (input_data_type == framework::proto::VarType::FP64) { bn_param_type = framework::proto::VarType::FP64; } - PADDLE_ENFORCE_EQ(bn_param_type, framework::TransToProtoVarType( - ctx.Input("Scale")->dtype()), - platform::errors::InvalidArgument( - "Scale input should be of float type")); - PADDLE_ENFORCE_EQ(bn_param_type, framework::TransToProtoVarType( - ctx.Input("Bias")->dtype()), - platform::errors::InvalidArgument( - "Bias input should be of float type")); - PADDLE_ENFORCE_EQ(bn_param_type, framework::TransToProtoVarType( - ctx.Input("Mean")->dtype()), - platform::errors::InvalidArgument( - "Mean input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType(ctx.Input("Scale")->dtype()), + platform::errors::InvalidArgument( + "Scale input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType(ctx.Input("Bias")->dtype()), + platform::errors::InvalidArgument( + "Bias input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType(ctx.Input("Mean")->dtype()), + platform::errors::InvalidArgument( + "Mean input should be of float type")); PADDLE_ENFORCE_EQ( bn_param_type, framework::TransToProtoVarType(ctx.Input("Variance")->dtype()), @@ -209,8 +214,9 @@ class InplaceABNKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); auto* y = ctx.Output("Y"); - PADDLE_ENFORCE_EQ(x, y, platform::errors::InvalidArgument( - "X and Y not inplaced in inplace mode")); + PADDLE_ENFORCE_EQ(x, y, + platform::errors::InvalidArgument( + "X and Y not inplaced in inplace mode")); auto activation = GetInplaceABNActivationType(ctx.Attr("activation")); auto& place = *ctx.template device_context().eigen_device(); diff --git a/paddle/fluid/operators/inplace_abn_op.cu b/paddle/fluid/operators/inplace_abn_op.cu index 6476023fcd2..7245629e565 100644 --- a/paddle/fluid/operators/inplace_abn_op.cu +++ b/paddle/fluid/operators/inplace_abn_op.cu @@ -28,8 +28,9 @@ class InplaceABNKernel void Compute(const framework::ExecutionContext& ctx) const override { auto* y = ctx.Output("Y"); auto* x = ctx.Input("X"); - PADDLE_ENFORCE_EQ(x, y, platform::errors::InvalidArgument( - "X and Y not inplaced in inplace mode")); + PADDLE_ENFORCE_EQ(x, y, + platform::errors::InvalidArgument( + "X and Y not inplaced in inplace mode")); auto activation = GetInplaceABNActivationType(ctx.Attr("activation")); auto& place = *ctx.template device_context().eigen_device(); diff --git a/paddle/fluid/operators/inplace_abn_op.h b/paddle/fluid/operators/inplace_abn_op.h index 94240497858..275209911d1 100644 --- a/paddle/fluid/operators/inplace_abn_op.h +++ b/paddle/fluid/operators/inplace_abn_op.h @@ -14,6 +14,7 @@ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/activation_op.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/instance_norm_op.cc b/paddle/fluid/operators/instance_norm_op.cc index de92de453a3..21ccf777051 100644 --- a/paddle/fluid/operators/instance_norm_op.cc +++ b/paddle/fluid/operators/instance_norm_op.cc @@ -13,9 +13,11 @@ See 
the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/instance_norm_op.h" + #include #include #include + #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -38,16 +40,18 @@ framework::OpKernelType InstanceNormOp::GetExpectedKernelType( in_param_type = framework::proto::VarType::FP64; } if (ctx.HasInput("Scale")) { - PADDLE_ENFORCE_EQ(in_param_type, framework::TransToProtoVarType( - ctx.Input("Scale")->dtype()), - platform::errors::InvalidArgument( - "Scale input should be of float type")); + PADDLE_ENFORCE_EQ( + in_param_type, + framework::TransToProtoVarType(ctx.Input("Scale")->dtype()), + platform::errors::InvalidArgument( + "Scale input should be of float type")); } if (ctx.HasInput("Bias")) { - PADDLE_ENFORCE_EQ(in_param_type, framework::TransToProtoVarType( - ctx.Input("Bias")->dtype()), - platform::errors::InvalidArgument( - "Bias input should be of float type")); + PADDLE_ENFORCE_EQ( + in_param_type, + framework::TransToProtoVarType(ctx.Input("Bias")->dtype()), + platform::errors::InvalidArgument( + "Bias input should be of float type")); } return framework::OpKernelType(input_data_type, ctx.GetPlace()); diff --git a/paddle/fluid/operators/instance_norm_op.h b/paddle/fluid/operators/instance_norm_op.h index 265e4acef0d..3f99cdf10c6 100644 --- a/paddle/fluid/operators/instance_norm_op.h +++ b/paddle/fluid/operators/instance_norm_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index fda168c94e1..3c746d7c08a 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -10,9 +10,11 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/interpolate_op.h" + #include #include #include + #include "paddle/fluid/framework/op_registry.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -112,11 +114,12 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) { PADDLE_ENFORCE_EQ("bilinear" == interp_method || "nearest" == interp_method || "bicubic" == interp_method, - true, platform::errors::InvalidArgument( - "Interpolation method can only be \"bilinear\" " - "or \"nearest\" or \"bicubic\" when " - "Input(X) dimension is 4, but got method is %s.", - interp_method)); + true, + platform::errors::InvalidArgument( + "Interpolation method can only be \"bilinear\" " + "or \"nearest\" or \"bicubic\" when " + "Input(X) dimension is 4, but got method is %s.", + interp_method)); const DataLayout data_layout = framework::StringToDataLayout( ctx->Attrs().Get("data_layout")); diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu index 8a63c9a3946..729eba43d72 100644 --- a/paddle/fluid/operators/interpolate_op.cu +++ b/paddle/fluid/operators/interpolate_op.cu @@ -11,6 +11,7 @@ #include #include + #include "paddle/fluid/operators/interpolate_op.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" @@ -860,9 +861,10 @@ static void Interpolate1DCUDAFwd(const framework::ExecutionContext& ctx, out_w = size_data[0]; } } - PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument( - "out_w in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); + PADDLE_ENFORCE_GT(out_w, 0, + platform::errors::InvalidArgument( + "out_w in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); framework::DDim dim_out; if (data_layout == DataLayout::kNCHW) { dim_out = {n, c, out_w}; @@ -942,12 +944,14 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, out_w = size_data[1]; } } - PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument( - "out_h in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument( - "out_w in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); + PADDLE_ENFORCE_GT(out_h, 0, + platform::errors::InvalidArgument( + "out_h in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT(out_w, 0, + platform::errors::InvalidArgument( + "out_w in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); framework::DDim dim_out; if (data_layout == DataLayout::kNCHW) { @@ -984,21 +988,21 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); if ("nearest" == interp_method) { - KeNearestNeighborInterpFw< - T><<>>( - input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, - out_chw, c, ratio_h, ratio_w, align_corners, data_layout); + KeNearestNeighborInterpFw + <<>>( + input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, + out_chw, c, ratio_h, ratio_w, align_corners, data_layout); } else if ("bilinear" == interp_method) { KeBilinearInterpFw<<>>( input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode, data_layout); } else if ("bicubic" == interp_method) { - KeBicubicInterpFw<<>>( - input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, - out_chw, c, ratio_h, ratio_w, 
align_corners, data_layout); + KeBicubicInterpFw + <<>>( + input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, + out_chw, c, ratio_h, ratio_w, align_corners, data_layout); } } @@ -1051,15 +1055,18 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, out_w = size_data[2]; } } - PADDLE_ENFORCE_GT(out_d, 0, platform::errors::InvalidArgument( - "out_d in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument( - "out_h in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument( - "out_w in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); + PADDLE_ENFORCE_GT(out_d, 0, + platform::errors::InvalidArgument( + "out_d in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT(out_h, 0, + platform::errors::InvalidArgument( + "out_h in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT(out_w, 0, + platform::errors::InvalidArgument( + "out_w in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); framework::DDim dim_out; if (data_layout == DataLayout::kNCHW) { @@ -1271,11 +1278,11 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); if ("nearest" == interp_method) { - KeNearestNeighborInterpBw< - T><<>>( - input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w, - n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); + KeNearestNeighborInterpBw + <<>>( + input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, + out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); } else if ("bilinear" == interp_method) { KeBilinearInterpBw<<>>( @@ -1283,10 +1290,10 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode, data_layout); } else if ("bicubic" == interp_method) { - KeBicubicInterpBw<<>>( - input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w, - n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); + KeBicubicInterpBw + <<>>( + input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, + out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); } } diff --git a/paddle/fluid/operators/interpolate_op.h b/paddle/fluid/operators/interpolate_op.h index 57b5eb553cc..18caed22b48 100644 --- a/paddle/fluid/operators/interpolate_op.h +++ b/paddle/fluid/operators/interpolate_op.h @@ -13,6 +13,7 @@ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -808,9 +809,10 @@ static void Interpolate1DCPUFwd(const framework::ExecutionContext& ctx, out_w = out_size_data[0]; } } - PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument( - "out_w in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); + PADDLE_ENFORCE_GT(out_w, 0, + platform::errors::InvalidArgument( + "out_w in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); framework::DDim dim_out; if (data_layout == DataLayout::kNCHW) { dim_out = {n, c, out_w}; @@ -876,12 +878,14 @@ static void Interpolate2DCPUFwd(const framework::ExecutionContext& ctx, out_w = out_size_data[1]; } } - PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument( - "out_h in 
Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument( - "out_w in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); + PADDLE_ENFORCE_GT(out_h, 0, + platform::errors::InvalidArgument( + "out_h in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT(out_w, 0, + platform::errors::InvalidArgument( + "out_w in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); framework::DDim dim_out; if (data_layout == DataLayout::kNCHW) { dim_out = {n, c, out_h, out_w}; @@ -964,15 +968,18 @@ static void Interpolate3DCPUFwd(const framework::ExecutionContext& ctx, out_w = out_size_data[2]; } } - PADDLE_ENFORCE_GT(out_d, 0, platform::errors::InvalidArgument( - "out_d in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument( - "out_h in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument( - "out_w in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); + PADDLE_ENFORCE_GT(out_d, 0, + platform::errors::InvalidArgument( + "out_d in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT(out_h, 0, + platform::errors::InvalidArgument( + "out_h in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT(out_w, 0, + platform::errors::InvalidArgument( + "out_w in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); framework::DDim dim_out; if (data_layout == DataLayout::kNCHW) { diff --git a/paddle/fluid/operators/interpolate_op_npu.cc b/paddle/fluid/operators/interpolate_op_npu.cc old mode 100755 new mode 100644 index f83f149b87c..0cbac393af5 --- a/paddle/fluid/operators/interpolate_op_npu.cc +++ b/paddle/fluid/operators/interpolate_op_npu.cc @@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/interpolate_op.h" #include #include + +#include "paddle/fluid/operators/interpolate_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/interpolate_op_xpu.cc b/paddle/fluid/operators/interpolate_op_xpu.cc index 9576dc84524..09780505ac2 100644 --- a/paddle/fluid/operators/interpolate_op_xpu.cc +++ b/paddle/fluid/operators/interpolate_op_xpu.cc @@ -111,14 +111,16 @@ class InterpolateXPUKernel : public framework::OpKernel { out_w = out_size_data[1]; } } - PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument( - "out_h in Attr(out_shape) of " - "Op(interpolate) " - "should be greater than 0.")); - PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument( - "out_w in Attr(out_shape) of " - "Op(interpolate) " - "should be greater than 0.")); + PADDLE_ENFORCE_GT( + out_h, 0, + platform::errors::InvalidArgument("out_h in Attr(out_shape) of " + "Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT( + out_w, 0, + platform::errors::InvalidArgument("out_w in Attr(out_shape) of " + "Op(interpolate) " + "should be greater than 0.")); framework::DDim dim_out; if (data_layout == DataLayout::kNCHW) { dim_out = {n, c, out_h, out_w}; diff --git a/paddle/fluid/operators/interpolate_v2_op.cc b/paddle/fluid/operators/interpolate_v2_op.cc index d0d7b7694fc..6bac35ee1d4 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cc +++ b/paddle/fluid/operators/interpolate_v2_op.cc @@ -40,10 +40,11 @@ static void Interpolate1DInferShapeCheck(framework::InferShapeContext* ctx) { const DataLayout data_layout = framework::StringToDataLayout( ctx->Attrs().Get("data_layout")); for (int i = 0; i < dim_x.size(); ++i) { - PADDLE_ENFORCE_NE(dim_x[i], 0, platform::errors::InvalidArgument( - "The shape of input(x) should be larged " - "than 0, bug received shape[%d] is %d ", - i, dim_x[i])); + PADDLE_ENFORCE_NE(dim_x[i], 0, + platform::errors::InvalidArgument( + "The shape of input(x) should be larged " + "than 0, bug received shape[%d] is %d ", + i, dim_x[i])); } if (ctx->HasInputs("SizeTensor")) { // top prority size @@ -144,10 +145,11 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) { ctx->Attrs().Get("data_layout")); for (int i = 0; i < dim_x.size(); ++i) { - PADDLE_ENFORCE_NE(dim_x[i], 0, platform::errors::InvalidArgument( - "The shape of input(x) should be larged " - "than 0, bug received shape[%d] is %d ", - i, dim_x[i])); + PADDLE_ENFORCE_NE(dim_x[i], 0, + platform::errors::InvalidArgument( + "The shape of input(x) should be larged " + "than 0, bug received shape[%d] is %d ", + i, dim_x[i])); } if (ctx->HasInputs("SizeTensor")) { @@ -263,10 +265,11 @@ static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) { ctx->Attrs().Get("data_layout")); for (int i = 0; i < dim_x.size(); ++i) { - PADDLE_ENFORCE_NE(dim_x[i], 0, platform::errors::InvalidArgument( - "The shape of input(x) should be larged " - "than 0, bug received shape[%d] is %d ", - i, dim_x[i])); + PADDLE_ENFORCE_NE(dim_x[i], 0, + platform::errors::InvalidArgument( + "The shape of input(x) should be larged " + "than 0, bug received shape[%d] is %d ", + i, dim_x[i])); } if (ctx->HasInputs("SizeTensor")) { diff --git a/paddle/fluid/operators/interpolate_v2_op_npu.cc b/paddle/fluid/operators/interpolate_v2_op_npu.cc index 615b5ea142b..97f39aa4902 100644 --- a/paddle/fluid/operators/interpolate_v2_op_npu.cc +++ b/paddle/fluid/operators/interpolate_v2_op_npu.cc @@ -12,9 +12,8 @@ WITHOUT 
WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/platform/device/npu/npu_op_runner.h" - #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/phi/kernels/funcs/interpolate_function.h" namespace paddle { diff --git a/paddle/fluid/operators/interpolate_v2_op_xpu.cc b/paddle/fluid/operators/interpolate_v2_op_xpu.cc index 9cbfc951583..9d52c9a865e 100644 --- a/paddle/fluid/operators/interpolate_v2_op_xpu.cc +++ b/paddle/fluid/operators/interpolate_v2_op_xpu.cc @@ -114,14 +114,16 @@ class InterpolateV2XPUKernel : public framework::OpKernel { out_w = out_size_data[1]; } } - PADDLE_ENFORCE_GT(out_h, 0, platform::errors::InvalidArgument( - "out_h in Attr(out_shape) of " - "Op(interpolate) " - "should be greater than 0.")); - PADDLE_ENFORCE_GT(out_w, 0, platform::errors::InvalidArgument( - "out_w in Attr(out_shape) of " - "Op(interpolate) " - "should be greater than 0.")); + PADDLE_ENFORCE_GT( + out_h, 0, + platform::errors::InvalidArgument("out_h in Attr(out_shape) of " + "Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT( + out_w, 0, + platform::errors::InvalidArgument("out_w in Attr(out_shape) of " + "Op(interpolate) " + "should be greater than 0.")); framework::DDim dim_out; if (data_layout == DataLayout::kNCHW) { dim_out = {n, c, out_h, out_w}; diff --git a/paddle/fluid/operators/inverse_op.cc b/paddle/fluid/operators/inverse_op.cc index f5b817a0e11..c4f3fbb2ca7 100644 --- a/paddle/fluid/operators/inverse_op.cc +++ b/paddle/fluid/operators/inverse_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/inverse_op.h" + #include #include diff --git a/paddle/fluid/operators/isfinite_op.cc b/paddle/fluid/operators/isfinite_op.cc index 2e770f98525..456c1c2d44f 100644 --- a/paddle/fluid/operators/isfinite_op.cc +++ b/paddle/fluid/operators/isfinite_op.cc @@ -120,15 +120,16 @@ namespace ops = paddle::operators; paddle::framework::EmptyGradOpMaker, \ paddle::framework::EmptyGradOpMaker) -#define REGISTER_OVERFLOW_CPU_KERNEL(op_type, functor) \ - REGISTER_OP_CPU_KERNEL( \ - op_type, ops::OverflowKernel, \ - ops::OverflowKernel, \ - ops::OverflowKernel, \ - ops::OverflowKernel, \ + ops::OverflowKernel, \ + ops::OverflowKernel, \ + ops::OverflowKernel); REGISTER_OP_MAKER(isinf, "isinf(X)"); diff --git a/paddle/fluid/operators/isfinite_op.cu b/paddle/fluid/operators/isfinite_op.cu index e233e371364..d1437d5b44d 100644 --- a/paddle/fluid/operators/isfinite_op.cu +++ b/paddle/fluid/operators/isfinite_op.cu @@ -18,8 +18,9 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( - isinf, ops::OverflowKernel, + isinf, + ops::OverflowKernel, ops::OverflowKernel, ops::OverflowKernel); REGISTER_OP_CUDA_KERNEL( - isfinite, ops::OverflowKernel, + isfinite, + ops::OverflowKernel, ops::OverflowKernel, ops::OverflowKernel(upper - lower), (n - 1))); PADDLE_ENFORCE_GT( - n, 0, paddle::platform::errors::InvalidArgument( - "The Sgd size should be larger than 0. But the n is %d.", n)); + n, 0, + paddle::platform::errors::InvalidArgument( + "The Sgd size should be larger than 0. 
But the n is %d.", n)); std::vector all, out; for (int i = 0; i < n; ++i) { all.push_back(i); diff --git a/paddle/fluid/operators/jit/gen/act.cc b/paddle/fluid/operators/jit/gen/act.cc index 677e9979399..5a73e3c56d5 100644 --- a/paddle/fluid/operators/jit/gen/act.cc +++ b/paddle/fluid/operators/jit/gen/act.cc @@ -122,9 +122,8 @@ bool VTanhCreator::CanBeUsed(const int& d) const { } size_t VReluCreator::CodeSize(const int& d) const { - return 96 /* init size */ + - (d / YMM_FLOAT_BLOCK + 3) * 4 /* instructions */ * - 8 /* average bytes for each instruction */; + return 96 /* init size */ + (d / YMM_FLOAT_BLOCK + 3) * 4 /* instructions */ * + 8 /* average bytes for each instruction */; } size_t VSquareCreator::CodeSize(const int& d) const { diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h index bd84368a573..24434c5993b 100644 --- a/paddle/fluid/operators/jit/gen/jitcode.h +++ b/paddle/fluid/operators/jit/gen/jitcode.h @@ -16,6 +16,7 @@ #include #include + #include "paddle/fluid/operators/jit/gen_base.h" #include "paddle/fluid/platform/cpu_info.h" diff --git a/paddle/fluid/operators/jit/gen/matmul.cc b/paddle/fluid/operators/jit/gen/matmul.cc index 3b2139c9ed0..9c859229c5a 100644 --- a/paddle/fluid/operators/jit/gen/matmul.cc +++ b/paddle/fluid/operators/jit/gen/matmul.cc @@ -122,20 +122,23 @@ class MatMulCreator : public JitCodeCreator { std::unique_ptr CreateJitCode( const matmul_attr_t& attr) const override { PADDLE_ENFORCE_GT( - attr.m, 0, platform::errors::InvalidArgument( - "The attribute m (first matrix's row) of MatMul should " - "be larger than 0. But it is %d.", - attr.m)); + attr.m, 0, + platform::errors::InvalidArgument( + "The attribute m (first matrix's row) of MatMul should " + "be larger than 0. But it is %d.", + attr.m)); PADDLE_ENFORCE_GT( - attr.n, 0, platform::errors::InvalidArgument( - "The attribute n (first matrix's col) of MatMul should " - "be larger than 0. But it is %d.", - attr.n)); + attr.n, 0, + platform::errors::InvalidArgument( + "The attribute n (first matrix's col) of MatMul should " + "be larger than 0. But it is %d.", + attr.n)); PADDLE_ENFORCE_GT( - attr.k, 0, platform::errors::InvalidArgument( - "The attribute k (second matrix's col) of MatMul should " - "be larger than 0. But it is %d.", - attr.k)); + attr.k, 0, + platform::errors::InvalidArgument( + "The attribute k (second matrix's col) of MatMul should " + "be larger than 0. But it is %d.", + attr.k)); return make_unique(attr, CodeSize(attr)); } }; diff --git a/paddle/fluid/operators/jit/gen/matmul.h b/paddle/fluid/operators/jit/gen/matmul.h index eb7328d7e06..af626326340 100644 --- a/paddle/fluid/operators/jit/gen/matmul.h +++ b/paddle/fluid/operators/jit/gen/matmul.h @@ -15,6 +15,7 @@ #pragma once #include // for malloc and free + #include #include @@ -33,10 +34,11 @@ class MatMulJitCode : public JitCode { size_t code_size = 256 * 1024, void* code_ptr = nullptr) : JitCode(code_size, code_ptr), m_(attr.m), n_(attr.n), k_(attr.k) { - PADDLE_ENFORCE_EQ(m_, 1, platform::errors::Unimplemented( - "Jitcode of matmul only support m==1 (first " - "matrix's row) now. But m is %d.", - m_)); + PADDLE_ENFORCE_EQ(m_, 1, + platform::errors::Unimplemented( + "Jitcode of matmul only support m==1 (first " + "matrix's row) now. 
But m is %d.", + m_)); this->genCode(); } diff --git a/paddle/fluid/operators/jit/gen/seqpool.cc b/paddle/fluid/operators/jit/gen/seqpool.cc index 52fdf04f3f6..4788050a14c 100644 --- a/paddle/fluid/operators/jit/gen/seqpool.cc +++ b/paddle/fluid/operators/jit/gen/seqpool.cc @@ -62,22 +62,23 @@ class SeqPoolCreator : public JitCodeCreator { return platform::MayIUse(platform::avx); } size_t CodeSize(const seq_pool_attr_t& attr) const override { - return 96 + - ((attr.w / YMM_FLOAT_BLOCK + 4 /* for rest */) * - 4 /* load, mul and save */ + - 256) * - 16; + return 96 + ((attr.w / YMM_FLOAT_BLOCK + 4 /* for rest */) * + 4 /* load, mul and save */ + + 256) * + 16; } std::unique_ptr CreateJitCode( const seq_pool_attr_t& attr) const override { - PADDLE_ENFORCE_GT(attr.w, 0, platform::errors::InvalidArgument( - "The attribute width of SeqPool should " - "be larger than 0. But it is %d.", - attr.w)); - PADDLE_ENFORCE_GT(attr.h, 0, platform::errors::InvalidArgument( - "The attribute height of SeqPool should " - "be larger than 0. But it is %d.", - attr.h)); + PADDLE_ENFORCE_GT(attr.w, 0, + platform::errors::InvalidArgument( + "The attribute width of SeqPool should " + "be larger than 0. But it is %d.", + attr.w)); + PADDLE_ENFORCE_GT(attr.h, 0, + platform::errors::InvalidArgument( + "The attribute height of SeqPool should " + "be larger than 0. But it is %d.", + attr.h)); return make_unique(attr, CodeSize(attr)); } }; diff --git a/paddle/fluid/operators/jit/gen_base.cc b/paddle/fluid/operators/jit/gen_base.cc index 5baafa11cfe..2a3c347c16a 100644 --- a/paddle/fluid/operators/jit/gen_base.cc +++ b/paddle/fluid/operators/jit/gen_base.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/operators/jit/gen_base.h" #include + #include "paddle/fluid/memory/allocation/cpu_allocator.h" // for posix_memalign #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/operators/jit/gen_base.h b/paddle/fluid/operators/jit/gen_base.h index c22a7f3ec92..761c52b7d7c 100644 --- a/paddle/fluid/operators/jit/gen_base.h +++ b/paddle/fluid/operators/jit/gen_base.h @@ -17,8 +17,8 @@ #include // for unique_ptr #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/operators/jit/kernel_base.h" DECLARE_bool(dump_jitcode); diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc index 46da6fba2e9..07d69658632 100644 --- a/paddle/fluid/operators/jit/helper.cc +++ b/paddle/fluid/operators/jit/helper.cc @@ -13,7 +13,9 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/helper.h" + #include + #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -112,10 +114,11 @@ void pack_weights(const float* src, float* dst, int n, int k) { int block, rest; const auto groups = packed_groups(n, k, &block, &rest); std::for_each(groups.begin(), groups.end(), [&](int i) { - PADDLE_ENFORCE_GT(i, 0, platform::errors::InvalidArgument( - "Each element of groups should be larger than " - "0. However the element: %d doesn't satify.", - i)); + PADDLE_ENFORCE_GT(i, 0, + platform::errors::InvalidArgument( + "Each element of groups should be larger than " + "0. 
However the element: %d doesn't satify.", + i)); }); int sum = std::accumulate(groups.begin(), groups.end(), 0); std::memset(dst, 0, k * sum * block * sizeof(float)); diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index 9a48d9c3c8d..0389828b495 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -14,6 +14,7 @@ #pragma once #include + #include "paddle/fluid/operators/jit/macro.h" #include "paddle/fluid/platform/macros.h" diff --git a/paddle/fluid/operators/jit/kernel_key.cc b/paddle/fluid/operators/jit/kernel_key.cc index 4f652002bc7..528aec9ace1 100644 --- a/paddle/fluid/operators/jit/kernel_key.cc +++ b/paddle/fluid/operators/jit/kernel_key.cc @@ -13,6 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/kernel_key.h" + #include // XXH64: 13.8 GB/s namespace paddle { diff --git a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc index 7e1f7ab8bf8..f11a690523b 100644 --- a/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc +++ b/paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc @@ -13,7 +13,9 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h" + #include + #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" diff --git a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc index 61d8c50c568..ef8fe6963c0 100644 --- a/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc +++ b/paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc @@ -13,7 +13,9 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/more/intrinsic/layer_norm.h" + #include + #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" diff --git a/paddle/fluid/operators/jit/more/mix/mix.cc b/paddle/fluid/operators/jit/more/mix/mix.cc index a4459cee5b8..f0008d4152f 100644 --- a/paddle/fluid/operators/jit/more/mix/mix.cc +++ b/paddle/fluid/operators/jit/more/mix/mix.cc @@ -13,6 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/more/mix/mix.h" + #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/jit/registry.h" diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc index 75ebddb1259..16bf045aa66 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.cc +++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc @@ -13,6 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/more/mkl/mkl.h" + #include "paddle/fluid/operators/jit/refer/refer.h" #include "paddle/fluid/operators/jit/registry.h" #include "paddle/fluid/platform/cpu_info.h" diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h index 5f3c29ad5ef..ad04b4618cb 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -117,10 +117,11 @@ void EmbSeqPool(const T* table, const int64_t* idx, T* out, "The idx shoud be lower than the attribute table_height of " "EmbSeqPool. But %dth of idx is %d and table_height is %d.", i, idx[i], attr->table_height)); - PADDLE_ENFORCE_GE(idx[i], 0, platform::errors::InvalidArgument( - "The idx shoud be equal to or larger than " - "the 0. 
But %dth of idx is %d.", - i, idx[i])); + PADDLE_ENFORCE_GE(idx[i], 0, + platform::errors::InvalidArgument( + "The idx shoud be equal to or larger than " + "the 0. But %dth of idx is %d.", + i, idx[i])); }; for (int64_t w = 0; w != attr->index_width; ++w) { @@ -204,11 +205,12 @@ void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows, "less than the attribute. But %dth of rows " "is %d and grad_width is %d.", i, h_idx, attr->param_height)); - PADDLE_ENFORCE_GE(h_idx, 0, platform::errors::InvalidArgument( - "The rows of Sgd should be " - "larger than 0. But %dth of rows " - "is %d.", - i, h_idx)); + PADDLE_ENFORCE_GE( + h_idx, 0, + platform::errors::InvalidArgument("The rows of Sgd should be " + "larger than 0. But %dth of rows " + "is %d.", + i, h_idx)); VAXPY(scalar, grad + i * width, out + h_idx * width, width); } } else { @@ -220,11 +222,12 @@ void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows, "less than the attribute. But %dth of rows " "is %d and grad_width is %d.", i, h_idx, attr->param_height)); - PADDLE_ENFORCE_GE(h_idx, 0, platform::errors::InvalidArgument( - "The rows of Sgd should be " - "larger than 0. But %dth of rows " - "is %d.", - i, h_idx)); + PADDLE_ENFORCE_GE( + h_idx, 0, + platform::errors::InvalidArgument("The rows of Sgd should be " + "larger than 0. But %dth of rows " + "is %d.", + i, h_idx)); VScal(&scalar, grad + i * width, out + h_idx * width, width); VAdd(param + h_idx * width, out + h_idx * width, out + h_idx * width, width); diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc index 779d4c172b8..9919f2d46dd 100644 --- a/paddle/fluid/operators/jit/refer/refer.cc +++ b/paddle/fluid/operators/jit/refer/refer.cc @@ -13,6 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/operators/jit/refer/refer.h" + #include "paddle/fluid/operators/jit/registry.h" namespace refer = paddle::operators::jit::refer; diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index 79b2e174efc..3f1e5b3235b 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -481,10 +481,11 @@ void EmbSeqPool(const T* table, const int64_t* idx, T* out, "The idx shoud be lower than the attribute table_height of " "EmbSeqPool. But %dth of idx is %d and table_height is %d.", i, idx[i], attr->table_height)); - PADDLE_ENFORCE_GE(idx[i], 0, platform::errors::InvalidArgument( - "The idx shoud be equal to or larger than " - "the 0. But %dth of idx is %d.", - i, idx[i])); + PADDLE_ENFORCE_GE(idx[i], 0, + platform::errors::InvalidArgument( + "The idx shoud be equal to or larger than " + "the 0. But %dth of idx is %d.", + i, idx[i])); }; for (int64_t w = 0; w != attr->index_width; ++w) { @@ -539,11 +540,12 @@ void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows, "less than the attribute. But %dth of rows " "is %d and grad_width is %d.", i, h_idx, attr->param_height)); - PADDLE_ENFORCE_GE(h_idx, 0, platform::errors::InvalidArgument( - "The rows of Sgd should be " - "larger than 0. But %dth of rows " - "is %d.", - i, h_idx)); + PADDLE_ENFORCE_GE( + h_idx, 0, + platform::errors::InvalidArgument("The rows of Sgd should be " + "larger than 0. 
But %dth of rows " + "is %d.", + i, h_idx)); for (int64_t j = 0; j < attr->grad_width; ++j) { out[h_idx * attr->grad_width + j] = param[h_idx * attr->grad_width + j] - diff --git a/paddle/fluid/operators/jit/registry.h b/paddle/fluid/operators/jit/registry.h index 567a9032369..15d5e605b01 100644 --- a/paddle/fluid/operators/jit/registry.h +++ b/paddle/fluid/operators/jit/registry.h @@ -18,6 +18,7 @@ #include #include #include // for std::move + #include "paddle/fluid/operators/jit/kernel_base.h" #include "paddle/fluid/operators/jit/kernel_pool.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 74f2d62c64d..27e816248ab 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -286,48 +286,48 @@ void TestKernelLSTM() { ref(&step, &attr); VLOG(10) << attr; - auto verifier = []( - const typename KernelTuple::func_type tgt, - const std::vector& xsrc, const std::vector& wp, - const std::vector& ct_1, const std::vector& ct_ref, - const std::vector& ht_ref, - const typename KernelTuple::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(ct_ref.size(), ht_ref.size()); - EXPECT_EQ(ct_1.size(), ht_ref.size()); - EXPECT_EQ(xsrc.size(), 4 * ht_ref.size()); - EXPECT_EQ(wp.size(), 3 * ht_ref.size()); - - // x could be changed after compute, so copy to save src - int d = ht_ref.size(); - std::vector x(xsrc.size()), ct(ct_ref.size()), - ht(ht_ref.size()); - std::vector checked(2 * d); - std::copy(xsrc.begin(), xsrc.end(), x.begin()); - - const T* ct_1_data = ct_1.data(); - const T* wp_data = wp.data(); - const T* ct_ref_data = ct_ref.data(); - const T* ht_ref_data = ht_ref.data(); - T* x_data = x.data(); - T* ct_data = ct.data(); - T* ht_data = ht.data(); - T* checked_data = checked.data(); - - jit::lstm_t step; - step.gates = x_data; - step.ct_1 = ct_1_data; - step.ct = ct_data; - step.ht = ht_data; - if (attr.use_peephole) { - step.wp = wp_data; - step.checked = checked_data; - } - - tgt(&step, &attr); - ExpectEQ(ct_data, ct_ref_data, d); - ExpectEQ(ht_data, ht_ref_data, d); - }; + auto verifier = + [](const typename KernelTuple::func_type tgt, + const std::vector& xsrc, const std::vector& wp, + const std::vector& ct_1, const std::vector& ct_ref, + const std::vector& ht_ref, + const typename KernelTuple::attr_type& attr) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(ct_ref.size(), ht_ref.size()); + EXPECT_EQ(ct_1.size(), ht_ref.size()); + EXPECT_EQ(xsrc.size(), 4 * ht_ref.size()); + EXPECT_EQ(wp.size(), 3 * ht_ref.size()); + + // x could be changed after compute, so copy to save src + int d = ht_ref.size(); + std::vector x(xsrc.size()), ct(ct_ref.size()), + ht(ht_ref.size()); + std::vector checked(2 * d); + std::copy(xsrc.begin(), xsrc.end(), x.begin()); + + const T* ct_1_data = ct_1.data(); + const T* wp_data = wp.data(); + const T* ct_ref_data = ct_ref.data(); + const T* ht_ref_data = ht_ref.data(); + T* x_data = x.data(); + T* ct_data = ct.data(); + T* ht_data = ht.data(); + T* checked_data = checked.data(); + + jit::lstm_t step; + step.gates = x_data; + step.ct_1 = ct_1_data; + step.ct = ct_data; + step.ht = ht_data; + if (attr.use_peephole) { + step.wp = wp_data; + step.checked = checked_data; + } + + tgt(&step, &attr); + ExpectEQ(ct_data, ct_ref_data, d); + ExpectEQ(ht_data, ht_ref_data, d); + }; TestAllImpls(attr, verifier, xsrc, wp, ct_1, ct_ref, ht_ref, attr); } @@ -484,41 +484,42 @@ void TestKernelLayerNorm() { ref(x_data, outref_data, mean_data, var_data, scale_data, 
bias_data, left, epsilon, right); - auto verifier = []( - const typename KernelTuple::func_type tgt, const std::vector& x_, - const std::vector& outref_, const std::vector& mean_, - const std::vector& var_, const std::vector& scale, - const std::vector& bias, const int& left, const float& epsilon, - const typename KernelTuple::attr_type& right) { - EXPECT_TRUE(tgt != nullptr); - std::vector outtgt(outref_.size()); - std::vector x(x_.size()); - std::vector mean(mean_.size()); - std::vector var(var_.size()); - std::vector outref(outref_.size()); - std::copy(x_.begin(), x_.end(), x.begin()); - std::copy(mean_.begin(), mean_.end(), mean.begin()); - std::copy(var_.begin(), var_.end(), var.begin()); - std::copy(outref_.begin(), outref_.end(), outref.begin()); - - EXPECT_EQ(x.size(), static_cast(left * right)); - EXPECT_EQ(outref.size(), static_cast(left * right)); - EXPECT_EQ(mean.size(), static_cast(left)); - EXPECT_EQ(var.size(), static_cast(left)); - EXPECT_EQ(scale.size(), static_cast(right)); - EXPECT_EQ(bias.size(), static_cast(right)); - - const T* scale_data = scale.data(); - const T* bias_data = bias.data(); - T* x_data = x.data(); - T* mean_data = mean.data(); - T* var_data = var.data(); - T* outref_data = outref.data(); - T* outtgt_data = outtgt.data(); - tgt(x_data, outtgt_data, mean_data, var_data, scale_data, bias_data, - left, epsilon, right); - ExpectEQ(outtgt_data, outref_data, left * right); - }; + auto verifier = + [](const typename KernelTuple::func_type tgt, + const std::vector& x_, const std::vector& outref_, + const std::vector& mean_, const std::vector& var_, + const std::vector& scale, const std::vector& bias, + const int& left, const float& epsilon, + const typename KernelTuple::attr_type& right) { + EXPECT_TRUE(tgt != nullptr); + std::vector outtgt(outref_.size()); + std::vector x(x_.size()); + std::vector mean(mean_.size()); + std::vector var(var_.size()); + std::vector outref(outref_.size()); + std::copy(x_.begin(), x_.end(), x.begin()); + std::copy(mean_.begin(), mean_.end(), mean.begin()); + std::copy(var_.begin(), var_.end(), var.begin()); + std::copy(outref_.begin(), outref_.end(), outref.begin()); + + EXPECT_EQ(x.size(), static_cast(left * right)); + EXPECT_EQ(outref.size(), static_cast(left * right)); + EXPECT_EQ(mean.size(), static_cast(left)); + EXPECT_EQ(var.size(), static_cast(left)); + EXPECT_EQ(scale.size(), static_cast(right)); + EXPECT_EQ(bias.size(), static_cast(right)); + + const T* scale_data = scale.data(); + const T* bias_data = bias.data(); + T* x_data = x.data(); + T* mean_data = mean.data(); + T* var_data = var.data(); + T* outref_data = outref.data(); + T* outtgt_data = outtgt.data(); + tgt(x_data, outtgt_data, mean_data, var_data, scale_data, + bias_data, left, epsilon, right); + ExpectEQ(outtgt_data, outref_data, left * right); + }; TestAllImpls(right, verifier, x, outref, mean, var, scale, bias, left, epsilon, right); @@ -548,11 +549,12 @@ void TestKernelCRFDecoding() { ref(seq_len, (const T*)x.data(), (const T*)w.data(), alpharef.data(), trackref.data(), tag_num); - auto verifier = []( - const typename KernelTuple::func_type tgt, const int& seq_len, - const std::vector& x, const std::vector& w, - const std::vector& alpharef, const std::vector& trackref, - const typename KernelTuple::attr_type& tag_num) { + auto verifier = [](const typename KernelTuple::func_type tgt, + const int& seq_len, const std::vector& x, + const std::vector& w, + const std::vector& alpharef, + const std::vector& trackref, + const typename KernelTuple::attr_type& 
tag_num) { constexpr int state_trans_base_idx = 2; EXPECT_TRUE(tgt != nullptr); EXPECT_EQ(x.size(), static_cast(seq_len * tag_num)); @@ -878,12 +880,13 @@ void TestKernelAdam() { mom2.data(), param.data(), mom1_out.data(), mom2_out.data(), param_out.data()); - auto verifier = []( - const typename KernelTuple::func_type tgt, T beta1, T beta2, T lr, T eps, - int64_t numel, const std::vector& grad, const std::vector& mom1, - const std::vector& mom2, const std::vector& param, - const std::vector& ref_mom1_out, const std::vector& ref_mom2_out, - const std::vector& ref_param_out) { + auto verifier = [](const typename KernelTuple::func_type tgt, T beta1, + T beta2, T lr, T eps, int64_t numel, + const std::vector& grad, const std::vector& mom1, + const std::vector& mom2, const std::vector& param, + const std::vector& ref_mom1_out, + const std::vector& ref_mom2_out, + const std::vector& ref_param_out) { EXPECT_TRUE(tgt != nullptr); EXPECT_EQ(param.size(), static_cast(numel)); EXPECT_EQ(grad.size(), static_cast(numel)); @@ -944,30 +947,31 @@ void TestKernelAdamW() { grad.data(), mom1.data(), mom2.data(), param.data(), mom1_out.data(), mom2_out.data(), param_out.data()); - auto verifier = []( - const typename KernelTuple::func_type tgt, T beta1, T beta2, T lr, T eps, - T old_lr, T lr_ratio, T coeff, int64_t numel, const std::vector& grad, - const std::vector& mom1, const std::vector& mom2, - const std::vector& param, const std::vector& ref_mom1_out, - const std::vector& ref_mom2_out, const std::vector& ref_param_out) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(param.size(), static_cast(numel)); - EXPECT_EQ(grad.size(), static_cast(numel)); - EXPECT_EQ(mom1.size(), static_cast(numel)); - EXPECT_EQ(mom2.size(), static_cast(numel)); - - std::vector jit_mom1_out(ref_mom1_out.size()); - std::vector jit_mom2_out(ref_mom2_out.size()); - std::vector jit_param_out(ref_param_out.size()); - - tgt(beta1, beta2, -lr, eps, old_lr, lr_ratio, coeff, numel, grad.data(), - mom1.data(), mom2.data(), param.data(), jit_mom1_out.data(), - jit_mom2_out.data(), jit_param_out.data()); - - ExpectEQ(ref_mom1_out.data(), jit_mom1_out.data(), numel); - ExpectEQ(ref_mom2_out.data(), jit_mom2_out.data(), numel); - ExpectEQ(ref_param_out.data(), jit_param_out.data(), numel); - }; + auto verifier = + [](const typename KernelTuple::func_type tgt, T beta1, T beta2, T lr, + T eps, T old_lr, T lr_ratio, T coeff, int64_t numel, + const std::vector& grad, const std::vector& mom1, + const std::vector& mom2, const std::vector& param, + const std::vector& ref_mom1_out, const std::vector& ref_mom2_out, + const std::vector& ref_param_out) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(param.size(), static_cast(numel)); + EXPECT_EQ(grad.size(), static_cast(numel)); + EXPECT_EQ(mom1.size(), static_cast(numel)); + EXPECT_EQ(mom2.size(), static_cast(numel)); + + std::vector jit_mom1_out(ref_mom1_out.size()); + std::vector jit_mom2_out(ref_mom2_out.size()); + std::vector jit_param_out(ref_param_out.size()); + + tgt(beta1, beta2, -lr, eps, old_lr, lr_ratio, coeff, numel, grad.data(), + mom1.data(), mom2.data(), param.data(), jit_mom1_out.data(), + jit_mom2_out.data(), jit_param_out.data()); + + ExpectEQ(ref_mom1_out.data(), jit_mom1_out.data(), numel); + ExpectEQ(ref_mom2_out.data(), jit_mom2_out.data(), numel); + ExpectEQ(ref_param_out.data(), jit_param_out.data(), numel); + }; TestAllImpls( 1, verifier, beta1, beta2, learning_rate, eps, old_lr, lr_ratio, coeff, @@ -988,8 +992,9 @@ void TestKernelSgd() { "and n-1 is %d.", static_cast(upper - lower), 
n - 1)); PADDLE_ENFORCE_GT( - n, 0, paddle::platform::errors::InvalidArgument( - "The Sgd size should be larger than 0. But the n is %d.", n)); + n, 0, + paddle::platform::errors::InvalidArgument( + "The Sgd size should be larger than 0. But the n is %d.", n)); std::vector all, out; for (int i = 0; i < n; ++i) { all.push_back(i); @@ -1031,11 +1036,12 @@ void TestKernelSgd() { grad_w); } - auto verifier = []( - const typename KernelTuple::func_type tgt, const T lr, - const std::vector& param, const std::vector& grad, - const std::vector& rows, const std::vector& oref, - const typename KernelTuple::attr_type& attr) { + auto verifier = [](const typename KernelTuple::func_type tgt, + const T lr, const std::vector& param, + const std::vector& grad, + const std::vector& rows, + const std::vector& oref, + const typename KernelTuple::attr_type& attr) { EXPECT_TRUE(tgt != nullptr); EXPECT_EQ(param.size(), static_cast(attr.param_height * attr.param_width)); diff --git a/paddle/fluid/operators/kernel_primitives/kernel_primitives.h b/paddle/fluid/operators/kernel_primitives/kernel_primitives.h index 169befc88f2..82de4c82d11 100644 --- a/paddle/fluid/operators/kernel_primitives/kernel_primitives.h +++ b/paddle/fluid/operators/kernel_primitives/kernel_primitives.h @@ -19,4 +19,4 @@ namespace paddle { namespace operators { namespace kernel_primitives = phi::kps; } -} +} // namespace paddle diff --git a/paddle/fluid/operators/kldiv_loss_op.cc b/paddle/fluid/operators/kldiv_loss_op.cc index 67c1942ea0b..8597c21b3ec 100644 --- a/paddle/fluid/operators/kldiv_loss_op.cc +++ b/paddle/fluid/operators/kldiv_loss_op.cc @@ -11,6 +11,7 @@ #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/infermeta/binary.h" diff --git a/paddle/fluid/operators/kldiv_loss_op_npu.cc b/paddle/fluid/operators/kldiv_loss_op_npu.cc index eac181489aa..41499f3f7bf 100644 --- a/paddle/fluid/operators/kldiv_loss_op_npu.cc +++ b/paddle/fluid/operators/kldiv_loss_op_npu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the Licnse. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/operators/kthvalue_op.cc b/paddle/fluid/operators/kthvalue_op.cc index 4c679d30263..1ff9ab796e9 100644 --- a/paddle/fluid/operators/kthvalue_op.cc +++ b/paddle/fluid/operators/kthvalue_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc index ddd0554add5..7a6a28a33c1 100644 --- a/paddle/fluid/operators/l1_norm_op.cc +++ b/paddle/fluid/operators/l1_norm_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/l1_norm_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/label_smooth_op.cc b/paddle/fluid/operators/label_smooth_op.cc index 7e07610db28..e14e6100647 100644 --- a/paddle/fluid/operators/label_smooth_op.cc +++ b/paddle/fluid/operators/label_smooth_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
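// ---------------------------------------------------------------------------
// Illustrative sketch of the dominant rewrap in the PADDLE_ENFORCE_* hunks
// above: the checked value and bound now stay on the macro's first line, and
// the platform::errors::InvalidArgument(...) builder starts on its own
// continuation line instead of hanging behind the bound. ENFORCE_GT and
// InvalidArgument below are self-contained stand-ins, not Paddle's macros.
#include <stdexcept>
#include <string>

#define ENFORCE_GT(val, lower, err)     \
  do {                                  \
    if (!((val) > (lower))) throw(err); \
  } while (0)

inline std::invalid_argument InvalidArgument(const std::string& msg) {
  return std::invalid_argument(msg);
}

inline void CheckSgdSize(int n) {
  // Old wrapping:
  //   ENFORCE_GT(n, 0, InvalidArgument(
  //                        "The Sgd size should be larger than 0."));
  // New wrapping, as applied throughout this patch:
  ENFORCE_GT(n, 0,
             InvalidArgument("The Sgd size should be larger than 0."));
}
// ---------------------------------------------------------------------------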
*/ #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index 0c5946b4ae4..ac20a5962f3 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -661,7 +661,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( * output is [1, 1024]. * #blocks: 32 * #threads: 512 -*/ + */ // todo(@limin29): to think if there are better impl strategies template < typename U, typename ScaleT = U, int VecSize = 1, int WARPS_M = 16, @@ -783,16 +783,16 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel( } /* This function support two kinds of computations (only for float and fp16 -* type): -* -* Case-1: compute layer_norm_grad for layernorm op by setting mask_ptr and -* d_dropout_src_ptr to nullptr. Here, d_x_ptr returns the grad of layernorm -* input. -* -* Case-2: compute layer_norm_grad + residual_grad + dropout_grad for -* fused_dropout_residual_layernorm op. Here, dx_ptr returns residual_grad. -* -*/ + * type): + * + * Case-1: compute layer_norm_grad for layernorm op by setting mask_ptr and + * d_dropout_src_ptr to nullptr. Here, d_x_ptr returns the grad of layernorm + * input. + * + * Case-2: compute layer_norm_grad + residual_grad + dropout_grad for + * fused_dropout_residual_layernorm op. Here, dx_ptr returns residual_grad. + * + */ template void ln_bwd_1024_kernel_driver(const phi::GPUContext &dev_ctx, const int rows, @@ -839,19 +839,19 @@ void ln_bwd_1024_kernel_driver(const phi::GPUContext &dev_ctx, const int rows, "To compute fused_dropout_residual_ln grad, d_dropout_src_ptr " "can't be null")); } - fused_ln_bwd_1024_kernel< - true, T, U, ScaleT, MaskType, VecSize, WARPS_M, WARPS_N, - BYTES_PER_LDG><<>>( - rows, epsilon, x_ptr, scale_ptr, mean_ptr, var_ptr, dout_ptr, - dscale_temp_ptr, dbias_temp_ptr, dx_ptr, mask_ptr, factor, - d_dropout_src_ptr); + fused_ln_bwd_1024_kernel + <<>>( + rows, epsilon, x_ptr, scale_ptr, mean_ptr, var_ptr, dout_ptr, + dscale_temp_ptr, dbias_temp_ptr, dx_ptr, mask_ptr, factor, + d_dropout_src_ptr); } else { - fused_ln_bwd_1024_kernel< - false, T, U, ScaleT, MaskType, VecSize, WARPS_M, WARPS_N, - BYTES_PER_LDG><<>>( - rows, epsilon, x_ptr, scale_ptr, mean_ptr, var_ptr, dout_ptr, - dscale_temp_ptr, dbias_temp_ptr, dx_ptr); + fused_ln_bwd_1024_kernel + <<>>( + rows, epsilon, x_ptr, scale_ptr, mean_ptr, var_ptr, dout_ptr, + dscale_temp_ptr, dbias_temp_ptr, dx_ptr); } const int WARPS_M_2 = 16; const int WARPS_N_2 = 1; @@ -873,10 +873,10 @@ void ln_bwd_1024_kernel_driver(const phi::GPUContext &dev_ctx, const int rows, PADDLE_THROW(platform::errors::InvalidArgument( "Only support float and fp16 type")); } else { - ln_bwd_1024_final_kernel< - U, ScaleT, VecSize_2, WARPS_M_2, WARPS_N_2, - BYTES_PER_LDG_2><<>>( - gridx, dscale_temp_ptr, dbias_temp_ptr, dscale_ptr, dbias_ptr); + ln_bwd_1024_final_kernel + <<>>( + gridx, dscale_temp_ptr, dbias_temp_ptr, dscale_ptr, dbias_ptr); } } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -1387,16 +1387,17 @@ static void LayerNormBackward( if (gradient_flag == 0) return; if (batch_size == 1) { - LayerNormBackwardWhenBatchSizeIsOne<<< - (feature_size + kMaxBlockDim - 1) / kMaxBlockDim, kMaxBlockDim, 0, - stream>>>(x, d_y, d_x, d_scale, d_bias, mean, var, scale, epsilon, - feature_size); + LayerNormBackwardWhenBatchSizeIsOne + <<<(feature_size + kMaxBlockDim - 1) / kMaxBlockDim, kMaxBlockDim, 0, 
+ stream>>>(x, d_y, d_x, d_scale, d_bias, mean, var, scale, epsilon, + feature_size); if (d_x != nullptr) { switch (GetDesiredBlockDim(feature_size)) { - FIXED_BLOCK_DIM_CASE(LayerNormBackwardPostProcessToCalculateDX< - T, U, kBlockDim><<<1, kBlockDim, 0, stream>>>( - x, d_x, mean, var, epsilon, feature_size)); + FIXED_BLOCK_DIM_CASE( + LayerNormBackwardPostProcessToCalculateDX + <<<1, kBlockDim, 0, stream>>>(x, d_x, mean, var, epsilon, + feature_size)); } } return; @@ -1408,9 +1409,9 @@ static void LayerNormBackward( switch (block_dim) { FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE( feature_size, kMaxBlockNum, - LayerNormBackwardGradientScaleOrBias< - T, U, kBlockDim, false, false, - ScaleBiasWithSameTypeX><<>>( + LayerNormBackwardGradientScaleOrBias + <<>>( x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size, feature_size, col_offset)); } @@ -1419,9 +1420,9 @@ static void LayerNormBackward( switch (block_dim) { FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE( feature_size, kMaxBlockNum, - LayerNormBackwardGradientScaleOrBias< - T, U, kBlockDim, false, true, - ScaleBiasWithSameTypeX><<>>( + LayerNormBackwardGradientScaleOrBias + <<>>( x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size, feature_size, col_offset)); } @@ -1430,9 +1431,9 @@ static void LayerNormBackward( switch (block_dim) { FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE( feature_size, kMaxBlockNum, - LayerNormBackwardGradientAll< - T, U, kBlockDim, false, - ScaleBiasWithSameTypeX><<>>( + LayerNormBackwardGradientAll + <<>>( x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size, feature_size, col_offset)); } @@ -1440,9 +1441,9 @@ static void LayerNormBackward( case 4: // d_x != nullptr, d_scale == nullptr, d_bias == nullptr switch (GetDesiredBlockDim(feature_size)) { FIXED_BLOCK_DIM_CASE( - LayerNormBackwardGradientOnlyDX< - T, U, kBlockDim, - ScaleBiasWithSameTypeX><<>>( + LayerNormBackwardGradientOnlyDX + <<>>( x, d_y, d_x, mean, var, scale, epsilon, feature_size)); } break; @@ -1450,34 +1451,34 @@ static void LayerNormBackward( switch (block_dim) { FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE( feature_size, kMaxBlockNum, - LayerNormBackwardGradientScaleOrBias< - T, U, kBlockDim, true, false, - ScaleBiasWithSameTypeX><<>>( + LayerNormBackwardGradientScaleOrBias + <<>>( x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size, feature_size, col_offset)); } switch (GetDesiredBlockDim(feature_size)) { FIXED_BLOCK_DIM_CASE( - LayerNormBackwardPostProcessToCalculateDX< - T, U, kBlockDim><<>>( - x, d_x, mean, var, epsilon, feature_size)); + LayerNormBackwardPostProcessToCalculateDX + <<>>(x, d_x, mean, var, epsilon, + feature_size)); } break; case 6: // d_x != nullptr, d_scale != nullptr, d_bias == nullptr switch (block_dim) { FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE( feature_size, kMaxBlockNum, - LayerNormBackwardGradientScaleOrBias< - T, U, kBlockDim, true, true, - ScaleBiasWithSameTypeX><<>>( + LayerNormBackwardGradientScaleOrBias + <<>>( x, d_y, d_scale, d_bias, d_x, mean, var, scale, epsilon, batch_size, feature_size, col_offset)); } switch (GetDesiredBlockDim(feature_size)) { FIXED_BLOCK_DIM_CASE( - LayerNormBackwardPostProcessToCalculateDX< - T, U, kBlockDim><<>>( - x, d_x, mean, var, epsilon, feature_size)); + LayerNormBackwardPostProcessToCalculateDX + <<>>(x, d_x, mean, var, epsilon, + feature_size)); } break; case 7: // d_x != nullptr, d_scale != nullptr, d_bias != nullptr @@ -1511,29 +1512,30 @@ static void LayerNormBackward( U *part_grad_gamma = reinterpret_cast(part_grad_gamma_ptr->ptr()); U 
*part_grad_beta = reinterpret_cast(part_grad_beta_ptr->ptr()); - LayerNormBackwardPartGradGammaBeta< - T, U, BDIMX2, BDIMY2, VPT><<>>( - d_y, x, batch_size, feature_size, mean, var, epsilon, - part_grad_gamma, - part_grad_beta); // compute part_grad_gamma, beta + LayerNormBackwardPartGradGammaBeta + <<>>( + d_y, x, batch_size, feature_size, mean, var, epsilon, + part_grad_gamma, + part_grad_beta); // compute part_grad_gamma, beta constexpr int BDIMX3 = 32; constexpr int BDIMY3 = 8; dim3 threads3(BDIMX3, BDIMY3, 1); const dim3 blocks3((feature_size + BDIMX2 - 1) / BDIMX2, 1, 1); - LayerNormBackwardSumGradGammaBeta< - T, U, BDIMX3, BDIMY3, - ScaleBiasWithSameTypeX><<>>( - part_grad_gamma, part_grad_beta, part_size, batch_size, - feature_size, d_scale, d_bias); + LayerNormBackwardSumGradGammaBeta + <<>>(part_grad_gamma, part_grad_beta, + part_size, batch_size, + feature_size, d_scale, d_bias); constexpr int BDIMX1 = 32; constexpr int BDIMY1 = 4; dim3 threads1(BDIMX1, BDIMY1, 1); - LayerNormBackwardComputeGradInput< - T, U, BDIMX1, BDIMY1, - ScaleBiasWithSameTypeX><<>>( - d_y, x, batch_size, feature_size, mean, var, epsilon, scale, d_x); + LayerNormBackwardComputeGradInput + <<>>(d_y, x, batch_size, + feature_size, mean, var, + epsilon, scale, d_x); #ifdef PADDLE_WITH_CUDA } #endif diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc index 224ab748dab..3d1e563ef1a 100644 --- a/paddle/fluid/operators/layer_norm_op.cc +++ b/paddle/fluid/operators/layer_norm_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" #ifdef PADDLE_WITH_MKLDNN diff --git a/paddle/fluid/operators/layer_norm_op_xpu.cc b/paddle/fluid/operators/layer_norm_op_xpu.cc index 3b21a55f8df..a27952c57f7 100644 --- a/paddle/fluid/operators/layer_norm_op_xpu.cc +++ b/paddle/fluid/operators/layer_norm_op_xpu.cc @@ -88,8 +88,9 @@ class LayerNormGradXPUKernel : public framework::OpKernel { auto* dscale_data = (dscale == nullptr ? nullptr : dscale->mutable_data(ctx.GetPlace())); - auto* dbias_data = (dbias == nullptr ? nullptr : dbias->mutable_data( - ctx.GetPlace())); + auto* dbias_data = + (dbias == nullptr ? nullptr + : dbias->mutable_data(ctx.GetPlace())); auto* dx_data = (dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace())); auto& dev_ctx = ctx.template device_context(); diff --git a/paddle/fluid/operators/layout_utils.h b/paddle/fluid/operators/layout_utils.h index e304f33d045..f058afdb4ad 100644 --- a/paddle/fluid/operators/layout_utils.h +++ b/paddle/fluid/operators/layout_utils.h @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/linear_chain_crf_op.h b/paddle/fluid/operators/linear_chain_crf_op.h index 7308363b9fe..a6ef87d43e2 100644 --- a/paddle/fluid/operators/linear_chain_crf_op.h +++ b/paddle/fluid/operators/linear_chain_crf_op.h @@ -28,9 +28,10 @@ static inline T NormalizeL1(T* x, size_t len) { // Right now, we just bet that sum won't be zero. If this really happens, we // will figure out what should be done then. 
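// ---------------------------------------------------------------------------
// Illustrative CUDA sketch (kernel and variable names are hypothetical) of
// the launch-site rewrap in the layer_norm and interpolate hunks above: the
// upgraded clang-format keeps the template instantiation whole and breaks
// before the <<<...>>> launch configuration, instead of splitting the
// template argument list across lines.
#include <cuda_runtime.h>

template <typename T, int BlockDim>
__global__ void KeScale(T* data, T factor, int n) {
  int i = blockIdx.x * BlockDim + threadIdx.x;
  if (i < n) data[i] *= factor;
}

inline void LaunchScale(float* data, int n, cudaStream_t stream) {
  constexpr int kBlockDim = 256;
  const int grid = (n + kBlockDim - 1) / kBlockDim;
  // Old style:
  //   KeScale<
  //       float, kBlockDim><<<grid, kBlockDim, 0, stream>>>(data, 2.0f, n);
  // New style:
  KeScale<float, kBlockDim>
      <<<grid, kBlockDim, 0, stream>>>(data, 2.0f, n);
}
// ---------------------------------------------------------------------------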
PADDLE_ENFORCE_GT( - sum, 0., platform::errors::InvalidArgument( - "The unnormalized probabilities of all possible unfinished " - "sequences must be greater than 0.")); + sum, 0., + platform::errors::InvalidArgument( + "The unnormalized probabilities of all possible unfinished " + "sequences must be greater than 0.")); T s = 1. / sum; for (size_t i = 0; i < len; ++i) x[i] *= s; return sum; @@ -44,8 +45,8 @@ struct ScalarMul { T scalar; }; -using framework::LoDTensor; using framework::LoD; +using framework::LoDTensor; using framework::Tensor; template diff --git a/paddle/fluid/operators/linspace_op.cc b/paddle/fluid/operators/linspace_op.cc index e9375be1706..5e451d99dbc 100644 --- a/paddle/fluid/operators/linspace_op.cc +++ b/paddle/fluid/operators/linspace_op.cc @@ -77,10 +77,9 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, LinspaceInferShapeFunctor); -REGISTER_OP_VERSION(linspace) - .AddCheckpoint( - R"ROC( +REGISTER_OP_VERSION(linspace).AddCheckpoint( + R"ROC( Upgrade linspace to add a new attribute [dtype]. )ROC", - paddle::framework::compatible::OpVersionDesc().NewAttr( - "dtype", "In order to change output data type ", 5)); + paddle::framework::compatible::OpVersionDesc().NewAttr( + "dtype", "In order to change output data type ", 5)); diff --git a/paddle/fluid/operators/lite/lite_engine_op.cc b/paddle/fluid/operators/lite/lite_engine_op.cc index 7a879c1e216..0ec1c55f7ab 100644 --- a/paddle/fluid/operators/lite/lite_engine_op.cc +++ b/paddle/fluid/operators/lite/lite_engine_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/lite/lite_engine_op.h" + #include #include diff --git a/paddle/fluid/operators/lite/lite_engine_op.h b/paddle/fluid/operators/lite/lite_engine_op.h index 5d2a1683d38..240f6b06325 100644 --- a/paddle/fluid/operators/lite/lite_engine_op.h +++ b/paddle/fluid/operators/lite/lite_engine_op.h @@ -26,11 +26,10 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/inference/analysis/helper.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" - #include "paddle/fluid/inference/lite/engine.h" #include "paddle/fluid/inference/lite/tensor_utils.h" #include "paddle/fluid/inference/utils/singleton.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/lite/lite_engine_op_test.cc b/paddle/fluid/operators/lite/lite_engine_op_test.cc index 01583cea312..c38386365f3 100644 --- a/paddle/fluid/operators/lite/lite_engine_op_test.cc +++ b/paddle/fluid/operators/lite/lite_engine_op_test.cc @@ -12,6 +12,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
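// ---------------------------------------------------------------------------
// Illustrative sketch of the using-declaration sort visible in the
// linear_chain_crf_op.h hunk above (LoD now precedes LoDTensor): the
// upgraded clang-format orders the declarations lexicographically. The
// namespace and types here are simplified stand-ins.
#include <cstddef>
#include <vector>

namespace framework {
using LoD = std::vector<std::vector<std::size_t>>;
class LoDTensor {};
class Tensor {};
}  // namespace framework

// Sorted order after the patch: LoD < LoDTensor < Tensor.
using framework::LoD;
using framework::LoDTensor;
using framework::Tensor;
// ---------------------------------------------------------------------------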
*/ +#include "paddle/fluid/operators/lite/lite_engine_op.h" + #include #include "paddle/fluid/framework/block_desc.h" @@ -19,13 +21,12 @@ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/utils/singleton.h" -#include "paddle/fluid/operators/lite/lite_engine_op.h" #include "paddle/fluid/operators/lite/ut_helper.h" USE_NO_KERNEL_OP(lite_engine) -using paddle::inference::lite::AddTensorToBlockDesc; using paddle::inference::lite::AddFetchListToBlockDesc; +using paddle::inference::lite::AddTensorToBlockDesc; using paddle::inference::lite::CreateTensor; using paddle::inference::lite::serialize_params; namespace paddle { diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc index 374bfa73f21..94797b08ade 100644 --- a/paddle/fluid/operators/load_combine_op.cc +++ b/paddle/fluid/operators/load_combine_op.cc @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/load_combine_op.h" + #include #include -#include "paddle/fluid/operators/load_combine_op.h" - namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index ba19aee9b8d..196792707eb 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include - #include "paddle/fluid/operators/load_op.h" +#include + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/lod_reset_op.cc b/paddle/fluid/operators/lod_reset_op.cc index 56163096833..616aad2b976 100644 --- a/paddle/fluid/operators/lod_reset_op.cc +++ b/paddle/fluid/operators/lod_reset_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/lod_reset_op.h" + #include #include diff --git a/paddle/fluid/operators/lod_reset_op.h b/paddle/fluid/operators/lod_reset_op.h index 642c8bcd9ae..f6f7155f37c 100644 --- a/paddle/fluid/operators/lod_reset_op.h +++ b/paddle/fluid/operators/lod_reset_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc index 883e3597d8a..11edbc84a19 100644 --- a/paddle/fluid/operators/log_loss_op.cc +++ b/paddle/fluid/operators/log_loss_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/log_loss_op_npu.cc b/paddle/fluid/operators/log_loss_op_npu.cc index f103a69707a..99ccad1ca76 100644 --- a/paddle/fluid/operators/log_loss_op_npu.cc +++ b/paddle/fluid/operators/log_loss_op_npu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/operators/log_loss_op_xpu.cc b/paddle/fluid/operators/log_loss_op_xpu.cc index fee1f56ebdc..1ba0a0f3b3d 100644 --- a/paddle/fluid/operators/log_loss_op_xpu.cc +++ b/paddle/fluid/operators/log_loss_op_xpu.cc @@ -11,6 +11,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { diff --git a/paddle/fluid/operators/log_softmax_op.cc b/paddle/fluid/operators/log_softmax_op.cc index da38f906b9b..95ebeedaf79 100644 --- a/paddle/fluid/operators/log_softmax_op.cc +++ b/paddle/fluid/operators/log_softmax_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/common_infer_shape_functions.h" diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 29079b8b138..c519e0845f7 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -113,26 +113,22 @@ class LookupTableCUDAKernel : public framework::OpKernel { dim3 grids(8, 1); #ifdef PADDLE_WITH_HIP if (padding_idx == -1) - LookupTable< - T, 64, 4, 8, - false><<>>( - output, table, ids, N, K, D, padding_idx); + LookupTable + <<>>( + output, table, ids, N, K, D, padding_idx); else - LookupTable< - T, 64, 4, 8, - true><<>>( - output, table, ids, N, K, D, padding_idx); + LookupTable + <<>>( + output, table, ids, N, K, D, padding_idx); #else if (padding_idx == -1) - LookupTable< - T, 128, 8, 8, - false><<>>( - output, table, ids, N, K, D, padding_idx); + LookupTable + <<>>( + output, table, ids, N, K, D, padding_idx); else - LookupTable< - T, 128, 8, 8, - true><<>>( - output, table, ids, N, K, D, padding_idx); + LookupTable + <<>>( + output, table, ids, N, K, D, padding_idx); #endif // PADDLE_WITH_HIP } }; diff --git a/paddle/fluid/operators/lookup_table_v2_op.cc b/paddle/fluid/operators/lookup_table_v2_op.cc index 48ae080783d..65aeca1e499 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cc +++ b/paddle/fluid/operators/lookup_table_v2_op.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/lookup_table_v2_op.h" #include + #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/var_type_inference.h" diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc index c2df6dff5b5..c47ea64e24c 100644 --- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc +++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/operators/lookup_table_v2_op_xpu.cc b/paddle/fluid/operators/lookup_table_v2_op_xpu.cc index 521d3ab571e..223bf2cc867 100644 --- a/paddle/fluid/operators/lookup_table_v2_op_xpu.cc +++ b/paddle/fluid/operators/lookup_table_v2_op_xpu.cc @@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/lookup_table_v2_op.h" #include + #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/var_type_inference.h" +#include "paddle/fluid/operators/lookup_table_v2_op.h" #include "paddle/fluid/platform/device/device_wrapper.h" #ifdef PADDLE_WITH_XPU namespace paddle { diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index 88d70d9bb7d..17c5f08c66c 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -13,9 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/lrn_op.h" + #include #include #include + #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" #ifdef PADDLE_WITH_MKLDNN @@ -174,20 +176,23 @@ class LRNOp : public framework::OperatorWithKernel { OP_INOUT_CHECK(ctx->HasOutput("MidOut"), "Output", "MidOut", "LRN"); auto x_dim = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ(x_dim.size(), 4, platform::errors::InvalidArgument( - "Input(input) rank should be 4, " - "but received input rank (%d) != 4", - x_dim.size())); + PADDLE_ENFORCE_EQ( + x_dim.size(), 4, + platform::errors::InvalidArgument("Input(input) rank should be 4, " + "but received input rank (%d) != 4", + x_dim.size())); int n = ctx->Attrs().Get("n"); - PADDLE_ENFORCE_GT(n, 0UL, platform::errors::InvalidArgument( - "Argument(n) should be positive, " - "but received n(%d) not greater than 0", - n)); - PADDLE_ENFORCE_EQ(n % 2, 1UL, platform::errors::InvalidArgument( - "Argument(n) should be odd value, " - "but received n(%d) is not an odd value", - n)); + PADDLE_ENFORCE_GT(n, 0UL, + platform::errors::InvalidArgument( + "Argument(n) should be positive, " + "but received n(%d) not greater than 0", + n)); + PADDLE_ENFORCE_EQ(n % 2, 1UL, + platform::errors::InvalidArgument( + "Argument(n) should be odd value, " + "but received n(%d) is not an odd value", + n)); ctx->SetOutputDim("Out", x_dim); ctx->ShareLoD("X", /*->*/ "Out"); diff --git a/paddle/fluid/operators/lrn_op.h b/paddle/fluid/operators/lrn_op.h index f2d72d07405..671055caa16 100644 --- a/paddle/fluid/operators/lrn_op.h +++ b/paddle/fluid/operators/lrn_op.h @@ -15,6 +15,7 @@ limitations under the License. 
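// ---------------------------------------------------------------------------
// Illustrative sketch of the namespace-comment rule from the
// kernel_primitives.h hunk further up: a bare closing brace of a namespace
// gains a `// namespace <name>` trailer. The body here is a placeholder.
namespace paddle {
namespace operators {
inline int KernelPrimitivesTag() { return 0; }  // stand-in content
}  // namespace operators
}  // namespace paddle
// ---------------------------------------------------------------------------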
*/ #pragma once #include + #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -68,18 +69,21 @@ class LRNKernel : public framework::OpKernel { T beta = ctx.Attr("beta"); T k = ctx.Attr("k"); - PADDLE_ENFORCE_GE(alpha, 0UL, platform::errors::InvalidArgument( - "Argument(alpha) should >= 0.0, " - "but received alpha(%d) less than 0", - alpha)); - PADDLE_ENFORCE_GE(beta, 0UL, platform::errors::InvalidArgument( - "Argument(beta) should >= 0.0, " - "but received beta(%d) less than 0", - beta)); - PADDLE_ENFORCE_GE(k, 0UL, platform::errors::InvalidArgument( - "Argument(k) should >= 0.0, " - "but received k(%d) less than 0", - k)); + PADDLE_ENFORCE_GE( + alpha, 0UL, + platform::errors::InvalidArgument("Argument(alpha) should >= 0.0, " + "but received alpha(%d) less than 0", + alpha)); + PADDLE_ENFORCE_GE( + beta, 0UL, + platform::errors::InvalidArgument("Argument(beta) should >= 0.0, " + "but received beta(%d) less than 0", + beta)); + PADDLE_ENFORCE_GE( + k, 0UL, + platform::errors::InvalidArgument("Argument(k) should >= 0.0, " + "but received k(%d) less than 0", + k)); LRNFunctor f; f(ctx, x, out, mid, N, C, H, W, n, k, alpha, beta, data_layout); diff --git a/paddle/fluid/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc index 2ec9de3e3bb..21a0fce2893 100644 --- a/paddle/fluid/operators/lstm_op.cc +++ b/paddle/fluid/operators/lstm_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/lstm_op.h" + #include #include diff --git a/paddle/fluid/operators/lstm_op.h b/paddle/fluid/operators/lstm_op.h index 4ec3072a96d..1e1aaf3ea53 100644 --- a/paddle/fluid/operators/lstm_op.h +++ b/paddle/fluid/operators/lstm_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/detail/activation_functions.h" @@ -272,9 +273,10 @@ class LSTMGradKernel : public framework::OpKernel { phi::funcs::LoDTensor2BatchFunctor to_batch; - auto ToBatch = [&batch_gate, &to_batch]( - const DeviceContext& ctx, const framework::LoDTensor& src, - const framework::DDim& dims, framework::LoDTensor& dst) { + auto ToBatch = [&batch_gate, &to_batch](const DeviceContext& ctx, + const framework::LoDTensor& src, + const framework::DDim& dims, + framework::LoDTensor& dst) { dst.mutable_data(dims, ctx.GetPlace()); dst.set_lod(batch_gate->lod()); to_batch(ctx, src, &dst, false); diff --git a/paddle/fluid/operators/lstm_unit_op.cc b/paddle/fluid/operators/lstm_unit_op.cc index 917482589fc..235a4bd689b 100644 --- a/paddle/fluid/operators/lstm_unit_op.cc +++ b/paddle/fluid/operators/lstm_unit_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/lstm_unit_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/lstmp_op.cc b/paddle/fluid/operators/lstmp_op.cc index 562f7755591..7ecf294433e 100644 --- a/paddle/fluid/operators/lstmp_op.cc +++ b/paddle/fluid/operators/lstmp_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
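// Sketch of the lambda reflow in the lstm_op.h ToBatch hunk above: rather
// than packing parameters after the capture list, the formatter now places
// one parameter per line, aligned to the opening parenthesis (the template
// argument on mutable_data is restored schematically):
//
//   auto ToBatch = [&batch_gate, &to_batch](const DeviceContext& ctx,
//                                           const framework::LoDTensor& src,
//                                           const framework::DDim& dims,
//                                           framework::LoDTensor& dst) {
//     dst.mutable_data<T>(dims, ctx.GetPlace());
//     dst.set_lod(batch_gate->lod());
//     to_batch(ctx, src, &dst, false);
//   };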
*/ #include "paddle/fluid/operators/lstmp_op.h" + #include #include diff --git a/paddle/fluid/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h index 5d24c0b70d3..5e68259852c 100644 --- a/paddle/fluid/operators/lstmp_op.h +++ b/paddle/fluid/operators/lstmp_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/activation_op.h" @@ -371,9 +372,10 @@ class LSTMPGradKernel : public framework::OpKernel { phi::funcs::LoDTensor2BatchFunctor to_batch; - auto ToBatch = [&batch_gate, &to_batch]( - const DeviceContext& ctx, const framework::LoDTensor& src, - const framework::DDim& dims, framework::LoDTensor& dst) { + auto ToBatch = [&batch_gate, &to_batch](const DeviceContext& ctx, + const framework::LoDTensor& src, + const framework::DDim& dims, + framework::LoDTensor& dst) { dst.mutable_data(dims, ctx.GetPlace()); dst.set_lod(batch_gate->lod()); to_batch(ctx, src, &dst, false); diff --git a/paddle/fluid/operators/lstsq_op.cc b/paddle/fluid/operators/lstsq_op.cc index f060125620f..e093e4d8c01 100644 --- a/paddle/fluid/operators/lstsq_op.cc +++ b/paddle/fluid/operators/lstsq_op.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "paddle/fluid/operators/lstsq_op.h" + #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -139,4 +141,4 @@ REGISTER_OPERATOR(lstsq, ops::LstsqOp, ops::LstsqOpMaker) REGISTER_OP_CPU_KERNEL( lstsq, ops::LstsqCPUKernel, - ops::LstsqCPUKernel); \ No newline at end of file + ops::LstsqCPUKernel); diff --git a/paddle/fluid/operators/lstsq_op.cu b/paddle/fluid/operators/lstsq_op.cu index 10e2867bf29..53c78fef7b5 100644 --- a/paddle/fluid/operators/lstsq_op.cu +++ b/paddle/fluid/operators/lstsq_op.cu @@ -17,6 +17,7 @@ #include #include + #include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/operators/lstsq_op.h" #include "paddle/fluid/operators/qr_op.h" diff --git a/paddle/fluid/operators/lstsq_op.h b/paddle/fluid/operators/lstsq_op.h index 520722dafcb..7955b3b7df9 100644 --- a/paddle/fluid/operators/lstsq_op.h +++ b/paddle/fluid/operators/lstsq_op.h @@ -15,8 +15,10 @@ #pragma once #include + #include #include + #include "paddle/fluid/operators/eig_op.h" #include "paddle/fluid/operators/math/eigen_values_vectors.h" #include "paddle/fluid/operators/math/matrix_solve.h" diff --git a/paddle/fluid/operators/lu_op.cc b/paddle/fluid/operators/lu_op.cc index fc8673181c4..0894323015e 100644 --- a/paddle/fluid/operators/lu_op.cc +++ b/paddle/fluid/operators/lu_op.cc @@ -45,8 +45,9 @@ class LUOp : public framework::OperatorWithKernel { bool pivots = context->Attrs().Get("pivots"); auto x_dims = context->GetInputDim("X"); int x_rank = x_dims.size(); - PADDLE_ENFORCE_GE(x_rank, 2, platform::errors::InvalidArgument( - "the rank of input must greater than 2")); + PADDLE_ENFORCE_GE(x_rank, 2, + platform::errors::InvalidArgument( + "the rank of input must greater than 2")); context->SetOutputDim("Out", x_dims); int m = x_dims[x_rank - 1]; int n = x_dims[x_rank - 2]; diff --git a/paddle/fluid/operators/lu_unpack_op.cc b/paddle/fluid/operators/lu_unpack_op.cc index e38a4703f64..e3b4263b4ff 100644 --- a/paddle/fluid/operators/lu_unpack_op.cc +++ b/paddle/fluid/operators/lu_unpack_op.cc @@ -53,8 +53,9 @@ class LU_UnpackOp : public framework::OperatorWithKernel { auto x_dims = context->GetInputDim("X"); int x_rank = x_dims.size(); - PADDLE_ENFORCE_GE(x_rank, 2, 
platform::errors::InvalidArgument( - "the rank of input must greater than 2")); + PADDLE_ENFORCE_GE(x_rank, 2, + platform::errors::InvalidArgument( + "the rank of input must greater than 2")); // context->SetOutputDim("Out", x_dims); int m = x_dims[x_rank - 1]; diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cu b/paddle/fluid/operators/margin_cross_entropy_op.cu index fd5ba1952ca..1cef3705973 100644 --- a/paddle/fluid/operators/margin_cross_entropy_op.cu +++ b/paddle/fluid/operators/margin_cross_entropy_op.cu @@ -20,6 +20,7 @@ namespace cub = hipcub; #endif #include + #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/margin_cross_entropy_op.h" #include "paddle/fluid/operators/math/softmax_impl.h" @@ -298,16 +299,16 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { // save match_logits, used for gradient computation. if (label_type == framework::proto::VarType::INT32) { typedef int32_t LabelT; - AddMarginToPositiveLogitsKernel< - T><<>>( - logits_ptr, labels->data(), margin1, margin2, margin3, rank, - nranks, N, D, class_interval.data()); + AddMarginToPositiveLogitsKernel + <<>>( + logits_ptr, labels->data(), margin1, margin2, margin3, + rank, nranks, N, D, class_interval.data()); } else if (label_type == framework::proto::VarType::INT64) { typedef int64_t LabelT; - AddMarginToPositiveLogitsKernel< - T><<>>( - logits_ptr, labels->data(), margin1, margin2, margin3, rank, - nranks, N, D, class_interval.data()); + AddMarginToPositiveLogitsKernel + <<>>( + logits_ptr, labels->data(), margin1, margin2, margin3, + rank, nranks, N, D, class_interval.data()); } else { PADDLE_THROW(platform::errors::Unimplemented( "margin_cross_entropy label type noly support int32 and int64, " @@ -386,9 +387,9 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { #endif // step 5, (logit - logit_max) - log(sum(exp(logit - logit_max))) - LogitsMinusLogSumKernel< - T><<>>( - logits_ptr, sum_exp_logits_buff, N, D); + LogitsMinusLogSumKernel + <<>>( + logits_ptr, sum_exp_logits_buff, N, D); // step 6, prob = exp((logit - logit_max) - log(sum(exp(logit - // logit_max)))) @@ -397,16 +398,16 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { dev_ctx, loss, static_cast(0.0)); if (label_type == framework::proto::VarType::INT32) { typedef int32_t LabelT; - HardLabelSoftmaxWithCrossEntropyKernel< - T, LabelT><<>>( - loss_ptr, logits_ptr, labels->data(), rank, N, D, - class_interval.data()); + HardLabelSoftmaxWithCrossEntropyKernel + <<>>( + loss_ptr, logits_ptr, labels->data(), rank, N, D, + class_interval.data()); } else if (label_type == framework::proto::VarType::INT64) { typedef int64_t LabelT; - HardLabelSoftmaxWithCrossEntropyKernel< - T, LabelT><<>>( - loss_ptr, logits_ptr, labels->data(), rank, N, D, - class_interval.data()); + HardLabelSoftmaxWithCrossEntropyKernel + <<>>( + loss_ptr, logits_ptr, labels->data(), rank, N, D, + class_interval.data()); } #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) diff --git a/paddle/fluid/operators/margin_rank_loss_op.cc b/paddle/fluid/operators/margin_rank_loss_op.cc index b4ff8b6d8dc..31055002993 100644 --- a/paddle/fluid/operators/margin_rank_loss_op.cc +++ b/paddle/fluid/operators/margin_rank_loss_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
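// The margin_cross_entropy_op.cu hunks above reflow a label-type dispatch;
// the shape of the pattern, with a schematic launch configuration and the
// trailing arguments elided:
//
//   if (label_type == framework::proto::VarType::INT32) {
//     typedef int32_t LabelT;
//     AddMarginToPositiveLogitsKernel<T>
//         <<<NumBlocks(N), threads, 0, stream>>>(
//             logits_ptr, labels->data<LabelT>(), margin1, margin2, margin3,
//             rank, nranks, N, D, class_interval.data<int>());
//   } else if (label_type == framework::proto::VarType::INT64) {
//     typedef int64_t LabelT;
//     // same launch with the 64-bit label type
//   }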
*/ #include "paddle/fluid/operators/margin_rank_loss_op.h" + #include + #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/operators/marker_op.cu b/paddle/fluid/operators/marker_op.cu index cfa5c6dc7a9..fe61aefe0bb 100644 --- a/paddle/fluid/operators/marker_op.cu +++ b/paddle/fluid/operators/marker_op.cu @@ -48,8 +48,8 @@ class MarkerOpCUDAKernel : public framework::OpKernel { "MarkerCUDA", "marker_" + marker_role + "_" + marker_pos, platform::TracerEventType::OperatorInner, 1, platform::EventRole::kInnerOp); - SimpleMarkerKernel<<<1, 32, 0, dev_ctx.stream()>>>(in_temp, out_temp, - 32); + SimpleMarkerKernel + <<<1, 32, 0, dev_ctx.stream()>>>(in_temp, out_temp, 32); } }; diff --git a/paddle/fluid/operators/match_matrix_tensor_op.cc b/paddle/fluid/operators/match_matrix_tensor_op.cc index d32ab65509e..2ae4fbdbe10 100644 --- a/paddle/fluid/operators/match_matrix_tensor_op.cc +++ b/paddle/fluid/operators/match_matrix_tensor_op.cc @@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/match_matrix_tensor_op.h" + #include #include #include #include #include -#include "paddle/fluid/operators/match_matrix_tensor_op.h" #include "paddle/fluid/operators/search_compute.h" namespace paddle { diff --git a/paddle/fluid/operators/math.h b/paddle/fluid/operators/math.h index d4b9e35bcce..47281fb0280 100644 --- a/paddle/fluid/operators/math.h +++ b/paddle/fluid/operators/math.h @@ -14,11 +14,10 @@ #pragma once +#include "math.h" // NOLINT #include "paddle/fluid/platform/float16.h" #include "paddle/phi/core/hostdevice.h" -#include "math.h" // NOLINT - namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu index 486979aa0a8..7a21f2f6497 100644 --- a/paddle/fluid/operators/math/beam_search.cu +++ b/paddle/fluid/operators/math/beam_search.cu @@ -348,11 +348,10 @@ class BeamSearchFunctor { float* selected_scores_data = selected_scores->mutable_data(selected_dims, context.GetPlace()); int* parent_idx_data = - parent_idx - ? parent_idx->mutable_data( - {static_cast(num_seqs * beam_size)}, - context.GetPlace()) - : nullptr; + parent_idx ? 
parent_idx->mutable_data( + {static_cast(num_seqs * beam_size)}, + context.GetPlace()) + : nullptr; framework::LoD selected_lod(2); selected_lod[0].assign(abs_lod[level].begin(), abs_lod[level].end()); @@ -369,8 +368,8 @@ class BeamSearchFunctor { static_cast(beam_size)); switch (platform::RoundToPowerOfTwo(beam_size * seq_width)) { CUDA_LAUNCH_KERNEL_HELPER( - BeamSearchKernelSingle<<< - 1, kMaxThreadsPerSeq, 0, context.stream()>>>( + BeamSearchKernelSingle + <<<1, kMaxThreadsPerSeq, 0, context.stream()>>>( selected_ids_data, selected_scores_data, parent_idx_data, selected_offsets, pre_ids_data, pre_scores_data, ids_data, scores_data, seq_length, static_cast(seq_width), @@ -387,8 +386,8 @@ class BeamSearchFunctor { static_cast(beam_size)); switch (platform::RoundToPowerOfTwo(beam_size * num_seqs * 32)) { CUDA_LAUNCH_KERNEL_HELPER( - BeamSearchKernel<<< - 1, num_seqs * kMaxThreadsPerSeq, 0, context.stream()>>>( + BeamSearchKernel + <<<1, num_seqs * kMaxThreadsPerSeq, 0, context.stream()>>>( selected_ids_data, selected_scores_data, parent_idx_data, selected_offsets, pre_ids_data, pre_scores_data, ids_data, scores_data, seq_offsets, static_cast(num_seqs), diff --git a/paddle/fluid/operators/math/beam_search.h b/paddle/fluid/operators/math/beam_search.h index 4474e7ea52a..c0d39aa2d8f 100644 --- a/paddle/fluid/operators/math/beam_search.h +++ b/paddle/fluid/operators/math/beam_search.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/operators/math/beam_search_test.cc b/paddle/fluid/operators/math/beam_search_test.cc index b0547ef9d95..7cf4c867db7 100644 --- a/paddle/fluid/operators/math/beam_search_test.cc +++ b/paddle/fluid/operators/math/beam_search_test.cc @@ -15,6 +15,7 @@ limitations under the License. 
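// The single most frequent change in this patch is include regrouping, as in
// the beam_search.h hunk above: a blank line now separates angle-bracket
// (system/STL) includes from the first quoted project include. Schematic,
// with the angle-bracket header names chosen for illustration:
//
//   #include <string>
//   #include <vector>
//
//   #include "paddle/fluid/framework/lod_tensor.h"
//   #include "paddle/fluid/platform/device_context.h"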
*/ #include "paddle/fluid/operators/math/beam_search.h" #include + #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/operators/math/bert_encoder_functor.cu b/paddle/fluid/operators/math/bert_encoder_functor.cu index 0cdad6beeb9..4aba6f3c0b9 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.cu +++ b/paddle/fluid/operators/math/bert_encoder_functor.cu @@ -815,23 +815,23 @@ void SkipLayerNormFunctor::operator()(const int num, const int hidden, const int threads = 256; if (hidden % 2 == 0) { if (std::is_same::value) { - SkipLayerNormKernel2<<>>( - num, hidden / 2, reinterpret_cast(input1), - reinterpret_cast(input2), - reinterpret_cast(output), - reinterpret_cast(scale), - reinterpret_cast(bias), eps); + SkipLayerNormKernel2 + <<>>( + num, hidden / 2, reinterpret_cast(input1), + reinterpret_cast(input2), + reinterpret_cast(output), + reinterpret_cast(scale), + reinterpret_cast(bias), eps); // HIP defined __HIP_NO_HALF_CONVERSIONS__ in hip.cmake #ifndef __HIPCC__ } else if (std::is_same::value) { - SkipLayerNormKernel2<__half, __half2, - threads><<>>( - num, hidden / 2, reinterpret_cast(input1), - reinterpret_cast(input2), - reinterpret_cast<__half2 *>(output), - reinterpret_cast(scale), - reinterpret_cast(bias), eps); + SkipLayerNormKernel2<__half, __half2, threads> + <<>>( + num, hidden / 2, reinterpret_cast(input1), + reinterpret_cast(input2), + reinterpret_cast<__half2 *>(output), + reinterpret_cast(scale), + reinterpret_cast(bias), eps); #endif } else { assert(false); diff --git a/paddle/fluid/operators/math/bert_encoder_functor.h b/paddle/fluid/operators/math/bert_encoder_functor.h index 683606ec733..fd40ac540bf 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.h +++ b/paddle/fluid/operators/math/bert_encoder_functor.h @@ -17,10 +17,12 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #include + #include // NOLINT #endif #ifdef PADDLE_WITH_HIP #include + #include namespace cub = hipcub; #endif diff --git a/paddle/fluid/operators/math/bloomfilter.h b/paddle/fluid/operators/math/bloomfilter.h index fa3d37ed5f4..f16fdd135b5 100644 --- a/paddle/fluid/operators/math/bloomfilter.h +++ b/paddle/fluid/operators/math/bloomfilter.h @@ -16,11 +16,9 @@ limitations under the License. */ #define BLOOMFILTER_MAGIC_NUM_NEW 17070416 #include -#include - #include +#include #include - #include namespace paddle { diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index e51631385eb..1ea8cafd25e 100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/concat_and_split.h" - #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/concat_and_split.h b/paddle/fluid/operators/math/concat_and_split.h index b5b0aae23ac..3b6a12e2402 100644 --- a/paddle/fluid/operators/math/concat_and_split.h +++ b/paddle/fluid/operators/math/concat_and_split.h @@ -14,6 +14,7 @@ limitations under the License. 
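// Context for the bert_encoder_functor.cu hunk above: the reformatted branch
// selects a vectorized kernel when the hidden size is even, reinterpreting
// the buffers as two-element types (float2, __half2) so each thread handles
// a pair of values. Schematic of the dispatch being reflowed:
//
//   if (hidden % 2 == 0) {
//     if (std::is_same<T, float>::value) {
//       SkipLayerNormKernel2<float, float2, threads>
//           <<<block, threads, 0, stream>>>(
//               num, hidden / 2, reinterpret_cast<const float2 *>(input1),
//               reinterpret_cast<const float2 *>(input2), /* ... */ eps);
//     }
//   }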
*/ #pragma once #include + #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" diff --git a/paddle/fluid/operators/math/concat_test.cc b/paddle/fluid/operators/math/concat_test.cc index de358bf623e..542dcda963a 100644 --- a/paddle/fluid/operators/math/concat_test.cc +++ b/paddle/fluid/operators/math/concat_test.cc @@ -119,13 +119,13 @@ void ConcatCase1(DeviceContext* context) { } /** - * case 2: - * inputs: - * t_a.shape: [2, 3, 4] - * t_b.shape: [2, 4, 4] - * output: - * out.shape: [2, 7, 4] - */ + * case 2: + * inputs: + * t_a.shape: [2, 3, 4] + * t_b.shape: [2, 4, 4] + * output: + * out.shape: [2, 7, 4] + */ template void ConcatCase2(DeviceContext* context) { paddle::framework::Tensor input_a_cpu; @@ -222,13 +222,13 @@ void ConcatCase2(DeviceContext* context) { } /** - * case 3: - * inputs: - * t_a.shape: [2, 3, 5] - * t_b.shape: [2, 3, 4] - * output: - * out.shape: [2, 3, 9] - */ + * case 3: + * inputs: + * t_a.shape: [2, 3, 5] + * t_b.shape: [2, 3, 4] + * output: + * out.shape: [2, 3, 9] + */ template void ConcatCase3(DeviceContext* context) { paddle::framework::Tensor input_a_cpu; @@ -326,14 +326,14 @@ void ConcatCase3(DeviceContext* context) { } /** - * case 4: - * inputs: - * axis = 1 - * t_a.shape: [2, 3, 4] - * t_b.shape: [2, 3, 4] - * output: - * out.shape: [2, 6, 4] - */ + * case 4: + * inputs: + * axis = 1 + * t_a.shape: [2, 3, 4] + * t_b.shape: [2, 3, 4] + * output: + * out.shape: [2, 6, 4] + */ template void ConcatCase4(DeviceContext* context) { paddle::framework::Tensor input_a_cpu; diff --git a/paddle/fluid/operators/math/cross_entropy.cc b/paddle/fluid/operators/math/cross_entropy.cc index cb2f59182c1..a2b83f99856 100644 --- a/paddle/fluid/operators/math/cross_entropy.cc +++ b/paddle/fluid/operators/math/cross_entropy.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/cross_entropy.h" + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/phi/backends/cpu/cpu_context.h" diff --git a/paddle/fluid/operators/math/cross_entropy.h b/paddle/fluid/operators/math/cross_entropy.h index da7340e4eb0..e562816d6da 100644 --- a/paddle/fluid/operators/math/cross_entropy.h +++ b/paddle/fluid/operators/math/cross_entropy.h @@ -14,6 +14,7 @@ limitations under the License. 
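// The concat_test.cc hunks above change only documentation-comment
// indentation: every continuation asterisk of a block comment is aligned
// under the first, so
//
//   /**
//     * case 2:
//     */
//
// becomes
//
//   /**
//    * case 2:
//    */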
*/ #pragma once #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/phi/core/hostdevice.h" diff --git a/paddle/fluid/operators/math/eigen_values_vectors.h b/paddle/fluid/operators/math/eigen_values_vectors.h index 1ade2190bb9..22ce162a44c 100644 --- a/paddle/fluid/operators/math/eigen_values_vectors.h +++ b/paddle/fluid/operators/math/eigen_values_vectors.h @@ -42,9 +42,10 @@ static void CheckEighResult(const int batch, const int info) { "tridiagonal form did not converge to zero", batch, info)); PADDLE_ENFORCE_GE( - info, 0, platform::errors::PreconditionNotMet( - "For batch [%d]: the [%d] argument had an illegal value", - batch, info)); + info, 0, + platform::errors::PreconditionNotMet( + "For batch [%d]: the [%d] argument had an illegal value", batch, + info)); } template diff --git a/paddle/fluid/operators/math/gru_compute.cu b/paddle/fluid/operators/math/gru_compute.cu index 9b03895cdef..946a1477c3b 100644 --- a/paddle/fluid/operators/math/gru_compute.cu +++ b/paddle/fluid/operators/math/gru_compute.cu @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/operators/math/detail/gru_gpu_kernel.h" #include "paddle/fluid/operators/math/detail/gru_kernel.h" #include "paddle/fluid/operators/math/gru_compute.h" @@ -36,35 +37,35 @@ struct GRUUnitFunctor { int frame_blocks = (frame_size * 2 + tiled_size - 1) / tiled_size; threads = dim3(tiled_size, 1); grid = dim3(frame_blocks, 1); - detail::KeFastCollectiveGruGate< - T, tiled_size><<>>( - value.gate_value, value.prev_out_value, value.gate_weight, - value.reset_output_value, frame_size, active_gate); + detail::KeFastCollectiveGruGate + <<>>( + value.gate_value, value.prev_out_value, value.gate_weight, + value.reset_output_value, frame_size, active_gate); frame_blocks = (frame_size + tiled_size - 1) / tiled_size; grid = dim3(frame_blocks, 1); - detail::KeFastCollectiveGruOut< - T, tiled_size><<>>( - value.state_weight, value.prev_out_value, value.output_value, - value.gate_value, value.reset_output_value, frame_size, - active_node, origin_mode); + detail::KeFastCollectiveGruOut + <<>>( + value.state_weight, value.prev_out_value, value.output_value, + value.gate_value, value.reset_output_value, frame_size, + active_node, origin_mode); } else { constexpr int tiled_size = 16; int frame_blocks = (frame_size * 2 + tiled_size - 1) / tiled_size; threads = dim3(tiled_size, 1); grid = dim3(frame_blocks, 1); - detail::KeFastCollectiveGruGate< - T, tiled_size><<>>( - value.gate_value, value.prev_out_value, value.gate_weight, - value.reset_output_value, frame_size, active_gate); + detail::KeFastCollectiveGruGate + <<>>( + value.gate_value, value.prev_out_value, value.gate_weight, + value.reset_output_value, frame_size, active_gate); frame_blocks = (frame_size + tiled_size - 1) / tiled_size; grid = dim3(frame_blocks, 1); - detail::KeFastCollectiveGruOut< - T, tiled_size><<>>( - value.state_weight, value.prev_out_value, value.output_value, - value.gate_value, value.reset_output_value, frame_size, - active_node, origin_mode); + detail::KeFastCollectiveGruOut + <<>>( + value.state_weight, value.prev_out_value, value.output_value, + value.gate_value, value.reset_output_value, frame_size, + active_node, origin_mode); } return; } else { @@ -86,18 +87,18 @@ struct GRUUnitFunctor { if (batch_size == 1) { detail::KeGruForwardResetOutput, - /* is_batch= */ false, - T><<>>( - 
detail::forward::gru_resetOutput(), value.gate_value, - value.reset_output_value, value.prev_out_value, frame_size, - batch_size, active_gate); + /* is_batch= */ false, T> + <<>>( + detail::forward::gru_resetOutput(), value.gate_value, + value.reset_output_value, value.prev_out_value, frame_size, + batch_size, active_gate); } else { detail::KeGruForwardResetOutput, - /* is_batch= */ true, - T><<>>( - detail::forward::gru_resetOutput(), value.gate_value, - value.reset_output_value, value.prev_out_value, frame_size, - batch_size, active_gate); + /* is_batch= */ true, T> + <<>>( + detail::forward::gru_resetOutput(), value.gate_value, + value.reset_output_value, value.prev_out_value, frame_size, + batch_size, active_gate); } if (value.prev_out_value) { @@ -109,18 +110,18 @@ struct GRUUnitFunctor { if (batch_size == 1) { detail::KeGruForwardFinalOutput, - /* is_batch= */ false, - T><<>>( - detail::forward::gru_finalOutput(), value.gate_value, - value.prev_out_value, value.output_value, frame_size, batch_size, - active_node, origin_mode); + /* is_batch= */ false, T> + <<>>(detail::forward::gru_finalOutput(), + value.gate_value, value.prev_out_value, + value.output_value, frame_size, + batch_size, active_node, origin_mode); } else { detail::KeGruForwardFinalOutput, - /* is_batch= */ true, - T><<>>( - detail::forward::gru_finalOutput(), value.gate_value, - value.prev_out_value, value.output_value, frame_size, batch_size, - active_node, origin_mode); + /* is_batch= */ true, T> + <<>>(detail::forward::gru_finalOutput(), + value.gate_value, value.prev_out_value, + value.output_value, frame_size, + batch_size, active_node, origin_mode); } } }; @@ -147,19 +148,21 @@ struct GRUUnitGradFunctor { } if (batch_size == 1) { - detail::KeGruBackwardStateGrad< - detail::backward::gru_stateGrad, - /* is_batch= */ false><<>>( - detail::backward::gru_stateGrad(), value.gate_value, - grad.gate_grad, value.prev_out_value, grad.prev_out_grad, - grad.output_grad, frame_size, batch_size, active_node, origin_mode); + detail::KeGruBackwardStateGrad, + /* is_batch= */ false> + <<>>( + detail::backward::gru_stateGrad(), value.gate_value, + grad.gate_grad, value.prev_out_value, grad.prev_out_grad, + grad.output_grad, frame_size, batch_size, active_node, + origin_mode); } else { - detail::KeGruBackwardStateGrad< - detail::backward::gru_stateGrad, - /* is_batch= */ true><<>>( - detail::backward::gru_stateGrad(), value.gate_value, - grad.gate_grad, value.prev_out_value, grad.prev_out_grad, - grad.output_grad, frame_size, batch_size, active_node, origin_mode); + detail::KeGruBackwardStateGrad, + /* is_batch= */ true> + <<>>( + detail::backward::gru_stateGrad(), value.gate_value, + grad.gate_grad, value.prev_out_value, grad.prev_out_grad, + grad.output_grad, frame_size, batch_size, active_node, + origin_mode); } auto blas = phi::funcs::GetBlas(context); @@ -179,19 +182,19 @@ struct GRUUnitGradFunctor { } if (batch_size == 1) { - detail::KeGruBackwardResetGrad< - detail::backward::gru_resetGrad, - /* is_batch= */ false><<>>( - detail::backward::gru_resetGrad(), value.gate_value, - grad.gate_grad, value.prev_out_value, grad.prev_out_grad, - grad.reset_output_grad, frame_size, batch_size, active_gate); + detail::KeGruBackwardResetGrad, + /* is_batch= */ false> + <<>>( + detail::backward::gru_resetGrad(), value.gate_value, + grad.gate_grad, value.prev_out_value, grad.prev_out_grad, + grad.reset_output_grad, frame_size, batch_size, active_gate); } else { - detail::KeGruBackwardResetGrad< - detail::backward::gru_resetGrad, - /* 
is_batch= */ true><<>>( - detail::backward::gru_resetGrad(), value.gate_value, - grad.gate_grad, value.prev_out_value, grad.prev_out_grad, - grad.reset_output_grad, frame_size, batch_size, active_gate); + detail::KeGruBackwardResetGrad, + /* is_batch= */ true> + <<>>( + detail::backward::gru_resetGrad(), value.gate_value, + grad.gate_grad, value.prev_out_value, grad.prev_out_grad, + grad.reset_output_grad, frame_size, batch_size, active_gate); } if (grad.prev_out_grad && value.prev_out_value) { diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc index 8fc6c52122a..1f5f575c7c3 100644 --- a/paddle/fluid/operators/math/im2col.cc +++ b/paddle/fluid/operators/math/im2col.cc @@ -111,16 +111,18 @@ class Col2ImFunctor #include + #include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" @@ -220,16 +221,18 @@ class Col2ImFunctor + #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/operators/math/im2col_cfo_cpu.h b/paddle/fluid/operators/math/im2col_cfo_cpu.h index 01f1e220e65..f3755653f28 100644 --- a/paddle/fluid/operators/math/im2col_cfo_cpu.h +++ b/paddle/fluid/operators/math/im2col_cfo_cpu.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/tensor.h" namespace paddle { diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc index 0e4032986cf..ff766cfad2c 100644 --- a/paddle/fluid/operators/math/im2col_test.cc +++ b/paddle/fluid/operators/math/im2col_test.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/im2col.h" + #include + #include "paddle/fluid/operators/math/im2col_cfo_cpu.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/operators/math/inclusive_scan.h b/paddle/fluid/operators/math/inclusive_scan.h index b77e2345036..bd170b67404 100644 --- a/paddle/fluid/operators/math/inclusive_scan.h +++ b/paddle/fluid/operators/math/inclusive_scan.h @@ -24,6 +24,7 @@ namespace cub = hipcub; #include #include + #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/enforce.h" @@ -196,15 +197,15 @@ static void InclusiveScanInnerDim(const T *x, T *y, size_t outer_dim, grid_dim = std::min(grid_dim, dev_ctx.GetCUDAMaxGridDimSize()[0]); dim3 thread_dims(kThreadNumX, kThreadNumY); if (reverse) { - InclusiveScanInnerDimCUDAKernel< - T, BinaryOp, kThreadNumX, kThreadNumY, - /*kReverse=*/true><<>>( - x, y, outer_dim, inner_dim, init, op); + InclusiveScanInnerDimCUDAKernel + <<>>(x, y, outer_dim, + inner_dim, init, op); } else { - InclusiveScanInnerDimCUDAKernel< - T, BinaryOp, kThreadNumX, kThreadNumY, - /*kReverse=*/false><<>>( - x, y, outer_dim, inner_dim, init, op); + InclusiveScanInnerDimCUDAKernel + <<>>(x, y, outer_dim, + inner_dim, init, op); } } diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index 083d6967ff0..a3c1d23e89b 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -25,6 +25,7 @@ limitations under the License. 
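// The gru_compute.cu hunks above carry parameter-name comments on bare
// boolean template arguments, e.g. /* is_batch= */ true; the reflow keeps
// each comment glued to its argument while moving the launch onto its own
// line. Schematic, with stripped template arguments restored as <T>:
//
//   detail::KeGruBackwardStateGrad<detail::backward::gru_stateGrad<T>,
//                                  /* is_batch= */ true>
//       <<<grid, threads, 0, stream>>>(
//           detail::backward::gru_stateGrad<T>(), value.gate_value,
//           grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
//           grad.output_grad, frame_size, batch_size, active_node,
//           origin_mode);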
*/ #include #include #include + #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/operators/math/math_function_impl.h" #include "paddle/fluid/platform/bfloat16.h" diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index 359552a0717..1d6afa50cc9 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/selected_rows_utils.h" diff --git a/paddle/fluid/operators/math/matrix_solve.cc b/paddle/fluid/operators/math/matrix_solve.cc index 7b239b81666..f2b083b8337 100644 --- a/paddle/fluid/operators/math/matrix_solve.cc +++ b/paddle/fluid/operators/math/matrix_solve.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/matrix_solve.h" + #include "Eigen/Core" #include "Eigen/LU" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/fluid/operators/math/matrix_solve.cu.cc b/paddle/fluid/operators/math/matrix_solve.cu.cc index 737196dde1d..59c8c07e6e1 100644 --- a/paddle/fluid/operators/math/matrix_solve.cu.cc +++ b/paddle/fluid/operators/math/matrix_solve.cu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/matrix_solve.h" + #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/solve_op.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/operators/math/matrix_solve.h b/paddle/fluid/operators/math/matrix_solve.h index 415d0c6dd8e..cecc3517934 100644 --- a/paddle/fluid/operators/math/matrix_solve.h +++ b/paddle/fluid/operators/math/matrix_solve.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "Eigen/Core" #include "Eigen/LU" #include "paddle/fluid/framework/tensor.h" diff --git a/paddle/fluid/operators/math/sample_prob.cu b/paddle/fluid/operators/math/sample_prob.cu index f86eb103449..1ae0c709e4d 100644 --- a/paddle/fluid/operators/math/sample_prob.cu +++ b/paddle/fluid/operators/math/sample_prob.cu @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include #include + #include #include diff --git a/paddle/fluid/operators/math/sampler.cc b/paddle/fluid/operators/math/sampler.cc index 5f1cd259416..d645e1994f1 100644 --- a/paddle/fluid/operators/math/sampler.cc +++ b/paddle/fluid/operators/math/sampler.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/sampler.h" #include + #include "paddle/fluid/framework/generator.h" namespace paddle { diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index e4b033b6c58..7689c31838d 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/math/selected_rows_functor.h" + #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/platform/device/device_wrapper.h" diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index db5c66d3197..edcb21cb56a 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -163,10 +163,10 @@ struct SelectedRowsAddTensor { dim3 threads(block_size, 1); dim3 grid(in1_rows.size(), 1); paddle::framework::MixVector mixv_in1_rows(&in1_rows); - SelectedRowsAddTensorKernel< - T, block_size><<>>( - in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), out_data, - in1_row_numel); + SelectedRowsAddTensorKernel + <<>>( + in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), out_data, + in1_row_numel); auto out_eigen = framework::EigenVector::Flatten(*output); auto in2_eigen = framework::EigenVector::Flatten(input2); @@ -223,10 +223,10 @@ struct SelectedRowsAddTensor { dim3 threads(block_size, 1); dim3 grid(in1_rows.size(), 1); paddle::framework::MixVector mixv_in1_rows(&in1_rows); - SelectedRowsAddTensorKernel< - T, block_size><<>>( - in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), out_data, - in1_row_numel); + SelectedRowsAddTensorKernel + <<>>( + in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), out_data, + in1_row_numel); auto out_eigen = framework::EigenVector::Flatten(*output); auto in2_eigen = framework::EigenVector::Flatten(input2); @@ -343,10 +343,10 @@ struct SelectedRowsAddToTensor { dim3 threads(block_size, 1); dim3 grid(in1_rows.size(), 1); paddle::framework::MixVector mixv_in1_rows(&in1_rows); - SelectedRowsAddToTensorKernel< - T, block_size><<>>( - in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), in2_data, - in1_row_numel); + SelectedRowsAddToTensorKernel + <<>>( + in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), in2_data, + in1_row_numel); } }; @@ -380,10 +380,10 @@ struct SelectedRowsAddToTensor { dim3 threads(block_size, 1); dim3 grid(in1_rows.size(), 1); paddle::framework::MixVector mixv_in1_rows(&in1_rows); - SelectedRowsAddToTensorKernel< - T, block_size><<>>( - in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), in2_data, - in1_row_numel); + SelectedRowsAddToTensorKernel + <<>>( + in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), in2_data, + in1_row_numel); } }; @@ -695,9 +695,9 @@ struct UpdateToTensor { dim3 threads(platform::PADDLE_CUDA_NUM_THREADS, 1); dim3 grid(in1_rows.size(), 1); - UpdateToTensorKernel<<< - grid, threads, 0, context.stream()>>>(in1_data, in1_rows.cuda_data(), - op, in2_data, in1_row_numel); + UpdateToTensorKernel + <<>>(in1_data, in1_rows.cuda_data(), + op, in2_data, in1_row_numel); } }; } // namespace scatter diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cc index e0e28f93f36..e6358cda274 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc @@ -457,8 +457,9 @@ TEST(selected_rows_functor, cpu_sum_to) { paddle::operators::math::SelectedRowsSumTo sum_to_functor; - sum_to_functor(ctx, std::vector( - {selected_rows1.get(), selected_rows2.get()}), + sum_to_functor(ctx, + std::vector( + {selected_rows1.get(), selected_rows2.get()}), std::vector({0, in1_value->numel()}), output.get()); auto out_height = output->height(); EXPECT_EQ(out_height, height); diff --git 
a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc index 0912a964792..6e1d0bb3670 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/selected_rows_functor.h" + #include "gtest/gtest.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/math/sequence_padding.cc b/paddle/fluid/operators/math/sequence_padding.cc index 35ba8c1d118..97e276fff02 100644 --- a/paddle/fluid/operators/math/sequence_padding.cc +++ b/paddle/fluid/operators/math/sequence_padding.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/sequence_padding.h" + #include "paddle/phi/backends/cpu/cpu_context.h" namespace phi { diff --git a/paddle/fluid/operators/math/sequence_padding.cu b/paddle/fluid/operators/math/sequence_padding.cu index 9aca6ad0f5a..ef7981858a9 100644 --- a/paddle/fluid/operators/math/sequence_padding.cu +++ b/paddle/fluid/operators/math/sequence_padding.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/operators/math/sequence_padding.h" #include "paddle/phi/backends/gpu/gpu_context.h" diff --git a/paddle/fluid/operators/math/sequence_padding.h b/paddle/fluid/operators/math/sequence_padding.h index 956a4ff6a2d..687c64fc23e 100644 --- a/paddle/fluid/operators/math/sequence_padding.h +++ b/paddle/fluid/operators/math/sequence_padding.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/device_context.h" @@ -64,13 +65,14 @@ inline static void CheckDims(const framework::DDim& seq_tensor_dims, PADDLE_ENFORCE_EQ( seq_tensor_dims.size() + 1 == pad_tensor_dims.size() || seq_tensor_dims.size() == pad_tensor_dims.size(), - true, platform::errors::InvalidArgument( - "pad_tensor's rank should be 1 greater than seq_tensor's " - "rank, or be equal with it. The pad_tensor's rank is %ld, " - "expected the seq_tensor's rank is %ld or %ld, but got %ld. " - "Please check the input value.", - pad_tensor_dims.size(), pad_tensor_dims.size(), - pad_tensor_dims.size() - 1, seq_tensor_dims.size())); + true, + platform::errors::InvalidArgument( + "pad_tensor's rank should be 1 greater than seq_tensor's " + "rank, or be equal with it. The pad_tensor's rank is %ld, " + "expected the seq_tensor's rank is %ld or %ld, but got %ld. " + "Please check the input value.", + pad_tensor_dims.size(), pad_tensor_dims.size(), + pad_tensor_dims.size() - 1, seq_tensor_dims.size())); } /* diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc index 8312d7cd9b7..9abe9e59888 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cc +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
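// Variant of the enforce reflow for compound predicates, as in the
// sequence_padding.h CheckDims hunk above: the whole boolean expression is
// the first macro argument and the literal true the second, so the break now
// falls after "true," instead of inside the error constructor (condition and
// message abbreviated here):
//
//   PADDLE_ENFORCE_EQ(
//       seq_rank + 1 == pad_rank || seq_rank == pad_rank, true,
//       platform::errors::InvalidArgument(
//           "pad_tensor's rank (%ld) should equal seq_tensor's rank (%ld) "
//           "or exceed it by 1.",
//           pad_rank, seq_rank));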
*/ +#include "paddle/fluid/operators/math/sequence_pooling.h" + #include #include "paddle/fluid/operators/jit/kernels.h" -#include "paddle/fluid/operators/math/sequence_pooling.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu index fa7b0431538..217b29e1b6b 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cu +++ b/paddle/fluid/operators/math/sequence_pooling.cu @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/operators/math/sequence_pooling.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/macros.h" @@ -170,41 +171,41 @@ class SequencePoolFunctor { dim3 grid(std::max(static_cast(lod.size()) - 1, 1), 1); paddle::framework::MixVector mix_vector(&lod); if (pooltype == "MAX") { - sequence_pool_kernel< - T, MaxPoolFunctor><<>>( - MaxPoolFunctor(), input.data(), pad_value, - mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, - output->mutable_data(context.GetPlace()), index->data()); + sequence_pool_kernel> + <<>>( + MaxPoolFunctor(), input.data(), pad_value, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, + output->mutable_data(context.GetPlace()), index->data()); } else if (pooltype == "AVERAGE") { - sequence_pool_kernel< - T, AvgPoolFunctor><<>>( - AvgPoolFunctor(), input.data(), pad_value, - mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, - output->mutable_data(context.GetPlace()), nullptr); + sequence_pool_kernel> + <<>>( + AvgPoolFunctor(), input.data(), pad_value, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, + output->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "SUM") { - sequence_pool_kernel< - T, SumPoolFunctor><<>>( - SumPoolFunctor(), input.data(), pad_value, - mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, - output->mutable_data(context.GetPlace()), nullptr); + sequence_pool_kernel> + <<>>( + SumPoolFunctor(), input.data(), pad_value, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, + output->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "SQRT") { - sequence_pool_kernel< - T, SqrtPoolFunctor><<>>( - SqrtPoolFunctor(), input.data(), pad_value, - mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, - output->mutable_data(context.GetPlace()), nullptr); + sequence_pool_kernel> + <<>>( + SqrtPoolFunctor(), input.data(), pad_value, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, + output->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "LAST") { - sequence_pool_kernel< - T, LastPoolFunctor><<>>( - LastPoolFunctor(), input.data(), pad_value, - mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, - output->mutable_data(context.GetPlace()), nullptr); + sequence_pool_kernel> + <<>>( + LastPoolFunctor(), input.data(), pad_value, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, + output->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "FIRST") { - sequence_pool_kernel< - T, FirstPoolFunctor><<>>( - FirstPoolFunctor(), input.data(), pad_value, - mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, - output->mutable_data(context.GetPlace()), nullptr); + sequence_pool_kernel> + <<>>( + FirstPoolFunctor(), input.data(), pad_value, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, + 
output->mutable_data(context.GetPlace()), nullptr); } else { PADDLE_THROW(platform::errors::InvalidArgument( "unsupported pooling pooltype: %s. Only support \"MAX\", " @@ -338,41 +339,41 @@ class SequencePoolGradFunctor { dim3 grid(std::max(static_cast(lod.size()) - 1, 1), 1); paddle::framework::MixVector mix_vector(&lod); if (pooltype == "MAX") { - sequence_pool_grad_kernel< - T, MaxPoolGradFunctor><<>>( - MaxPoolGradFunctor(), out_grad.data(), - mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, - in_grad->mutable_data(context.GetPlace()), index->data()); + sequence_pool_grad_kernel> + <<>>( + MaxPoolGradFunctor(), out_grad.data(), + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, + in_grad->mutable_data(context.GetPlace()), index->data()); } else if (pooltype == "AVERAGE") { - sequence_pool_grad_kernel< - T, AvgPoolGradFunctor><<>>( - AvgPoolGradFunctor(), out_grad.data(), - mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, - in_grad->mutable_data(context.GetPlace()), nullptr); + sequence_pool_grad_kernel> + <<>>( + AvgPoolGradFunctor(), out_grad.data(), + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, + in_grad->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "SUM") { - sequence_pool_grad_kernel< - T, SumPoolGradFunctor><<>>( - SumPoolGradFunctor(), out_grad.data(), - mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, - in_grad->mutable_data(context.GetPlace()), nullptr); + sequence_pool_grad_kernel> + <<>>( + SumPoolGradFunctor(), out_grad.data(), + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, + in_grad->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "SQRT") { - sequence_pool_grad_kernel< - T, SqrtPoolGradFunctor><<>>( - SqrtPoolGradFunctor(), out_grad.data(), - mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, - in_grad->mutable_data(context.GetPlace()), nullptr); + sequence_pool_grad_kernel> + <<>>( + SqrtPoolGradFunctor(), out_grad.data(), + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, + in_grad->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "LAST") { - sequence_pool_grad_kernel< - T, LastPoolGradFunctor><<>>( - LastPoolGradFunctor(), out_grad.data(), - mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, - in_grad->mutable_data(context.GetPlace()), nullptr); + sequence_pool_grad_kernel> + <<>>( + LastPoolGradFunctor(), out_grad.data(), + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, + in_grad->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "FIRST") { - sequence_pool_grad_kernel< - T, FirstPoolGradFunctor><<>>( - FirstPoolGradFunctor(), out_grad.data(), - mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, - in_grad->mutable_data(context.GetPlace()), nullptr); + sequence_pool_grad_kernel> + <<>>( + FirstPoolGradFunctor(), out_grad.data(), + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, + in_grad->mutable_data(context.GetPlace()), nullptr); } else { PADDLE_THROW(platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/math/sequence_pooling.h b/paddle/fluid/operators/math/sequence_pooling.h index 847d0bca951..f5b6701b46e 100644 --- a/paddle/fluid/operators/math/sequence_pooling.h +++ b/paddle/fluid/operators/math/sequence_pooling.h @@ -14,6 +14,7 @@ limitations under the License. 
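// Shape of the pooltype dispatch reformatted in the two large
// sequence_pooling.cu hunks above: one templated kernel instantiation per
// string-matched branch, each now breaking before its launch configuration
// (stripped template arguments restored schematically, arguments elided):
//
//   if (pooltype == "MAX") {
//     sequence_pool_kernel<T, MaxPoolFunctor<T>>
//         <<<grid, threads, 0, context.stream()>>>(
//             MaxPoolFunctor<T>(), input.data<T>(), pad_value,
//             mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim,
//             output->mutable_data<T>(context.GetPlace()), index->data<int>());
//   } else if (pooltype == "AVERAGE") {
//     sequence_pool_kernel<T, AvgPoolFunctor<T>>
//         <<<grid, threads, 0, context.stream()>>>(/* ... */);
//   }  // SUM, SQRT, LAST, FIRST follow the same shape.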
*/ #pragma once #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/operators/math/sequence_pooling_test.cc b/paddle/fluid/operators/math/sequence_pooling_test.cc index 815d221e255..6d9c75f9550 100644 --- a/paddle/fluid/operators/math/sequence_pooling_test.cc +++ b/paddle/fluid/operators/math/sequence_pooling_test.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/sequence_pooling.h" + #include template diff --git a/paddle/fluid/operators/math/sequence_scale.cc b/paddle/fluid/operators/math/sequence_scale.cc index bc8832a1bbc..8f954e068c0 100644 --- a/paddle/fluid/operators/math/sequence_scale.cc +++ b/paddle/fluid/operators/math/sequence_scale.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/sequence_scale.h" + #include "paddle/phi/backends/cpu/cpu_context.h" namespace phi { diff --git a/paddle/fluid/operators/math/sequence_scale.cu b/paddle/fluid/operators/math/sequence_scale.cu index 253a67c2c8c..c0b97497cc7 100644 --- a/paddle/fluid/operators/math/sequence_scale.cu +++ b/paddle/fluid/operators/math/sequence_scale.cu @@ -53,10 +53,10 @@ class ScaleLoDTensorFunctor { seq_data, mix_vector.CUDAMutableData(context.GetPlace()), scales, seq_width); #else - SequenceScaleKernel<<< - num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>( - seq_data, mix_vector.CUDAMutableData(context.GetPlace()), scales, - seq_width); + SequenceScaleKernel + <<>>( + seq_data, mix_vector.CUDAMutableData(context.GetPlace()), scales, + seq_width); #endif mix_vector.CopyToCPU(); } @@ -82,10 +82,10 @@ class ScaleLoDTensorFunctor { seq_data, mix_vector.CUDAMutableData(context.GetPlace()), scales, seq_width); #else - SequenceScaleKernel<<< - num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>( - seq_data, mix_vector.CUDAMutableData(context.GetPlace()), scales, - seq_width); + SequenceScaleKernel + <<>>( + seq_data, mix_vector.CUDAMutableData(context.GetPlace()), scales, + seq_width); #endif mix_vector.CopyToCPU(); } diff --git a/paddle/fluid/operators/math/softmax.cc b/paddle/fluid/operators/math/softmax.cc index c855cb763a9..adea86a6c5a 100644 --- a/paddle/fluid/operators/math/softmax.cc +++ b/paddle/fluid/operators/math/softmax.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/softmax.h" + #include "paddle/fluid/operators/math/softmax_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index 69642c81942..33da631d27b 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/jit/kernels.h" @@ -66,34 +67,32 @@ class SoftmaxEigen { if (num_remain == 1) { // axis == -1, axis and class in same dimension, calculate along // class dimension directly for higher performance - softmax.device(*context.eigen_device()) = (logits - - logits.maximum(along_axis) - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class)) - .unaryExpr(ValueClip()); + softmax.device(*context.eigen_device()) = + (logits - logits.maximum(along_axis) + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)) + .unaryExpr(ValueClip()); } else { // axis != -1, class dimension split into (axis, remain), max and sum // should be calculated along axis dimension softmax.device(*context.eigen_device()) = - (logits.reshape(batch_axis_remain) - - logits.reshape(batch_axis_remain) - .maximum(along_axis) - .eval() - .reshape(batch_one_remain) - .broadcast(one_axis_one) - .reshape(batch_classes)) + (logits.reshape(batch_axis_remain) - logits.reshape(batch_axis_remain) + .maximum(along_axis) + .eval() + .reshape(batch_one_remain) + .broadcast(one_axis_one) + .reshape(batch_classes)) .unaryExpr(ValueClip()); } softmax.device(*context.eigen_device()) = softmax.exp(); softmax.device(*context.eigen_device()) = - (softmax * - softmax.reshape(batch_axis_remain) - .sum(along_axis) - .inverse() - .eval() - .broadcast(one_axis)); + (softmax * softmax.reshape(batch_axis_remain) + .sum(along_axis) + .inverse() + .eval() + .broadcast(one_axis)); } }; @@ -128,31 +127,28 @@ class SoftmaxEigen { // axis == -1, axis and class in same dimension, calculate along // class dimension directly for higher performance softmax.device(*context.eigen_device()) = - (logits - - logits.maximum(along_axis) - .reshape(batch_by_one) - .broadcast(one_by_class)) + (logits - logits.maximum(along_axis) + .reshape(batch_by_one) + .broadcast(one_by_class)) .unaryExpr(ValueClip()); } else { // axis != -1, class dimension split into (axis, remain), max and sum // should be calculated along axis dimension softmax.device(*context.eigen_device()) = - (logits.reshape(batch_axis_remain) - - logits.reshape(batch_axis_remain) - .maximum(along_axis) - .reshape(batch_one_remain) - .broadcast(one_axis_one) - .reshape(batch_classes)) + (logits.reshape(batch_axis_remain) - logits.reshape(batch_axis_remain) + .maximum(along_axis) + .reshape(batch_one_remain) + .broadcast(one_axis_one) + .reshape(batch_classes)) .unaryExpr(ValueClip()); } softmax.device(*context.eigen_device()) = softmax.exp(); softmax.device(*context.eigen_device()) = - (softmax * - softmax.reshape(batch_axis_remain) - .sum(along_axis) - .inverse() - .broadcast(one_axis)); + (softmax * softmax.reshape(batch_axis_remain) + .sum(along_axis) + .inverse() + .broadcast(one_axis)); } }; @@ -187,31 +183,28 @@ class SoftmaxEigen { // axis == -1, axis and class in same dimension, calculate along // class dimension directly for higher performance softmax.device(*context.eigen_device()) = - (logits - - logits.maximum(along_axis) - .reshape(batch_by_one) - .broadcast(one_by_class)) + (logits - logits.maximum(along_axis) + .reshape(batch_by_one) + .broadcast(one_by_class)) .unaryExpr(ValueClip()); } else { // axis != -1, class dimension split into (axis, remain), max and sum // should be calculated along axis dimension softmax.device(*context.eigen_device()) = - (logits.reshape(batch_axis_remain) - - logits.reshape(batch_axis_remain) - .maximum(along_axis) - 
.reshape(batch_one_remain) - .broadcast(one_axis_one) - .reshape(batch_classes)) + (logits.reshape(batch_axis_remain) - logits.reshape(batch_axis_remain) + .maximum(along_axis) + .reshape(batch_one_remain) + .broadcast(one_axis_one) + .reshape(batch_classes)) .unaryExpr(ValueClip()); } softmax.device(*context.eigen_device()) = softmax.exp(); softmax.device(*context.eigen_device()) = - (softmax * - softmax.reshape(batch_axis_remain) - .sum(along_axis) - .inverse() - .broadcast(one_axis)); + (softmax * softmax.reshape(batch_axis_remain) + .sum(along_axis) + .inverse() + .broadcast(one_axis)); } }; diff --git a/paddle/fluid/operators/math/sparse_impl.cu.h b/paddle/fluid/operators/math/sparse_impl.cu.h index dd2d256dd73..03f94ed5736 100644 --- a/paddle/fluid/operators/math/sparse_impl.cu.h +++ b/paddle/fluid/operators/math/sparse_impl.cu.h @@ -14,11 +14,10 @@ #pragma once +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/dynload/cusparse.h" #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" - namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/math/tree2col.cc b/paddle/fluid/operators/math/tree2col.cc index cd1fa13001c..8ad0a17c27e 100644 --- a/paddle/fluid/operators/math/tree2col.cc +++ b/paddle/fluid/operators/math/tree2col.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/math/tree2col.h" + #include #include diff --git a/paddle/fluid/operators/math/tree2col.cu b/paddle/fluid/operators/math/tree2col.cu index bdaab212ab1..c8bba20a423 100644 --- a/paddle/fluid/operators/math/tree2col.cu +++ b/paddle/fluid/operators/math/tree2col.cu @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "paddle/fluid/operators/math/tree2col.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/math/tree2col.h b/paddle/fluid/operators/math/tree2col.h index 88104b858ba..df4b233a763 100644 --- a/paddle/fluid/operators/math/tree2col.h +++ b/paddle/fluid/operators/math/tree2col.h @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/framework/tensor.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/math/vol2col.cu b/paddle/fluid/operators/math/vol2col.cu index fb61a36a8e1..d8581d731e8 100644 --- a/paddle/fluid/operators/math/vol2col.cu +++ b/paddle/fluid/operators/math/vol2col.cu @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/operators/math/vol2col.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" diff --git a/paddle/fluid/operators/math/vol2col.h b/paddle/fluid/operators/math/vol2col.h index 3122828b2ee..cddcb0af467 100644 --- a/paddle/fluid/operators/math/vol2col.h +++ b/paddle/fluid/operators/math/vol2col.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/operators/math/vol2col_test.cc b/paddle/fluid/operators/math/vol2col_test.cc index 210cf10d887..4889817cd9e 100644 --- a/paddle/fluid/operators/math/vol2col_test.cc +++ b/paddle/fluid/operators/math/vol2col_test.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/vol2col.h" #include + #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index 9d381e1f22b..2c16774e324 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -12,6 +12,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/kernels/funcs/blas/blas.h" @@ -258,13 +259,14 @@ class MatMulGradKernel : public framework::OpKernel { MatMul(context, a, trans_a, b, trans_b, out); } else { auto &ctx = context.template device_context(); - MatMul(context, is_fold_init_dims_a - ? FoldInitDims(a) - : FoldHeadAndLastDims(ctx, a), - trans_a, is_fold_init_dims_b - ? FoldInitDims(b) - : FoldHeadAndLastDims(ctx, b), - trans_b, out); + MatMul( + context, + is_fold_init_dims_a ? FoldInitDims(a) + : FoldHeadAndLastDims(ctx, a), + trans_a, + is_fold_init_dims_b ? FoldInitDims(b) + : FoldHeadAndLastDims(ctx, b), + trans_b, out); } } @@ -425,13 +427,14 @@ class MatMulDoubleGradKernel : public framework::OpKernel { MatMul(context, a, trans_a, b, trans_b, flag, out); } else { auto &ctx = context.template device_context(); - MatMul(context, is_fold_init_dims_a - ? FoldInitDims(a) - : FoldHeadAndLastDims(ctx, a), - trans_a, is_fold_init_dims_b - ? FoldInitDims(b) - : FoldHeadAndLastDims(ctx, b), - trans_b, flag, out); + MatMul( + context, + is_fold_init_dims_a ? FoldInitDims(a) + : FoldHeadAndLastDims(ctx, a), + trans_a, + is_fold_init_dims_b ? FoldInitDims(b) + : FoldHeadAndLastDims(ctx, b), + trans_b, flag, out); } } @@ -602,12 +605,13 @@ class MatMulOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( mat_dim_x.batch_size_ == mat_dim_y.batch_size_ || mat_dim_x.batch_size_ == 0 || mat_dim_y.batch_size_ == 0, - true, platform::errors::InvalidArgument( - "The batch size of the two matrices should be equal, or " - "at least one is zero.\n" - "But received X's shape: %s, Y's shape: %s.", - DumpMatrixShape(mat_dim_x).c_str(), - DumpMatrixShape(mat_dim_y).c_str())); + true, + platform::errors::InvalidArgument( + "The batch size of the two matrices should be equal, or " + "at least one is zero.\n" + "But received X's shape: %s, Y's shape: %s.", + DumpMatrixShape(mat_dim_x).c_str(), + DumpMatrixShape(mat_dim_y).c_str())); } int64_t dim_out_y = mat_dim_y.width_; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ @@ -996,13 +1000,12 @@ REGISTER_OP_CUDA_KERNEL( ops::MatMulDoubleGradKernel); #endif -REGISTER_OP_VERSION(matmul) - .AddCheckpoint( - R"ROC(Register matmul for adding the attribute of +REGISTER_OP_VERSION(matmul).AddCheckpoint( + R"ROC(Register matmul for adding the attribute of fused_reshape_Y)ROC", - paddle::framework::compatible::OpVersionDesc().NewAttr( - "fused_reshape_Y", - "In order to support the function of fused the input Y " - " and input X into the input X when " - "using the operator of matmul, and get raw shape of input Y.", - std::vector{})); + paddle::framework::compatible::OpVersionDesc().NewAttr( + "fused_reshape_Y", + "In order to support the function of fused the input Y " + " and input X into the input X when " + "using the operator of matmul, and get raw shape of input Y.", + std::vector{})); diff --git a/paddle/fluid/operators/matmul_op_xpu.cc b/paddle/fluid/operators/matmul_op_xpu.cc index 80d4492e049..3477715d6d3 100644 --- 
a/paddle/fluid/operators/matmul_op_xpu.cc +++ b/paddle/fluid/operators/matmul_op_xpu.cc @@ -315,14 +315,15 @@ class MatMulGradXPUKernel : public framework::OpKernel { MatMul(context, a, trans_a, b, trans_b, out); } else { auto &dev_ctx = context.template device_context(); - MatMul( - context, is_fold_init_dims_a - ? FoldInitDims(a) - : XPUFoldHeadAndLastDims(dev_ctx, a), - trans_a, is_fold_init_dims_b - ? FoldInitDims(b) - : XPUFoldHeadAndLastDims(dev_ctx, b), - trans_b, out); + MatMul(context, + is_fold_init_dims_a + ? FoldInitDims(a) + : XPUFoldHeadAndLastDims(dev_ctx, a), + trans_a, + is_fold_init_dims_b + ? FoldInitDims(b) + : XPUFoldHeadAndLastDims(dev_ctx, b), + trans_b, out); } } diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc index 162ebdafec1..168a3dbfeaa 100644 --- a/paddle/fluid/operators/matmul_v2_op.cc +++ b/paddle/fluid/operators/matmul_v2_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/matmul_v2_op.h" + #include #include diff --git a/paddle/fluid/operators/matmul_v2_op.h b/paddle/fluid/operators/matmul_v2_op.h index 34a8e97af2e..b47cdf6e8cb 100644 --- a/paddle/fluid/operators/matmul_v2_op.h +++ b/paddle/fluid/operators/matmul_v2_op.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/dot_op.h" diff --git a/paddle/fluid/operators/matmul_v2_op_xpu.cc b/paddle/fluid/operators/matmul_v2_op_xpu.cc index 87df75ac465..f85e714ce95 100644 --- a/paddle/fluid/operators/matmul_v2_op_xpu.cc +++ b/paddle/fluid/operators/matmul_v2_op_xpu.cc @@ -14,10 +14,10 @@ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/matmul_v2_op.h" #include #include +#include "paddle/fluid/operators/matmul_v2_op.h" #include "paddle/fluid/operators/xpu_api_wrapper.h" namespace paddle { diff --git a/paddle/fluid/operators/matrix_power_op.cc b/paddle/fluid/operators/matrix_power_op.cc index 56f65340ea9..ffbb8538d94 100644 --- a/paddle/fluid/operators/matrix_power_op.cc +++ b/paddle/fluid/operators/matrix_power_op.cc @@ -14,6 +14,7 @@ #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" diff --git a/paddle/fluid/operators/matrix_rank_op.cc b/paddle/fluid/operators/matrix_rank_op.cc index e7d08b65973..fddfaa3526a 100644 --- a/paddle/fluid/operators/matrix_rank_op.cc +++ b/paddle/fluid/operators/matrix_rank_op.cc @@ -14,6 +14,7 @@ #include #include + #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/svd_helper.h" #include "paddle/phi/kernels/funcs/compare_functors.h" diff --git a/paddle/fluid/operators/mean_iou_op.h b/paddle/fluid/operators/mean_iou_op.h index 9fa00e60e05..1cf9f4433bc 100644 --- a/paddle/fluid/operators/mean_iou_op.h +++ b/paddle/fluid/operators/mean_iou_op.h @@ -14,6 +14,7 @@ limitations under the License. 
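[Editor's note] Most of the remaining hunks are the same mechanical change: a blank line is inserted after the file's own header, and each include block is kept sorted. A layout-only sketch of the resulting convention (the paths are placeholders, not real files):

    // foo_op.cc: the matching header first, then a blank line, then
    // system headers, then project headers, each block sorted
    // alphabetically.
    #include "paddle/fluid/operators/foo_op.h"

    #include <string>
    #include <vector>

    #include "paddle/fluid/framework/op_registry.h"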
*/ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/mean_op_xpu.cc b/paddle/fluid/operators/mean_op_xpu.cc index ef96fe2f03b..811b138c8d1 100644 --- a/paddle/fluid/operators/mean_op_xpu.cc +++ b/paddle/fluid/operators/mean_op_xpu.cc @@ -56,8 +56,9 @@ class MeanGradXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto OG = context.Input(framework::GradVarName("Out")); - PADDLE_ENFORCE_EQ(OG->numel(), 1, platform::errors::InvalidArgument( - "Mean Gradient should be scalar")); + PADDLE_ENFORCE_EQ( + OG->numel(), 1, + platform::errors::InvalidArgument("Mean Gradient should be scalar")); auto IG = context.Output(framework::GradVarName("X")); IG->mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc index e2b86bd0e3b..0d4c2f7b3b4 100644 --- a/paddle/fluid/operators/merge_lod_tensor_op.cc +++ b/paddle/fluid/operators/merge_lod_tensor_op.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" - #include "paddle/phi/core/lod_utils.h" namespace phi { diff --git a/paddle/fluid/operators/merge_selected_rows_op.cc b/paddle/fluid/operators/merge_selected_rows_op.cc index ea223ad1b32..cfb8aa1f8a7 100644 --- a/paddle/fluid/operators/merge_selected_rows_op.cc +++ b/paddle/fluid/operators/merge_selected_rows_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/merge_selected_rows_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/merge_selected_rows_op.h b/paddle/fluid/operators/merge_selected_rows_op.h index 4c87a4a6411..d0f18b22b27 100644 --- a/paddle/fluid/operators/merge_selected_rows_op.h +++ b/paddle/fluid/operators/merge_selected_rows_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" diff --git a/paddle/fluid/operators/meshgrid_op.cc b/paddle/fluid/operators/meshgrid_op.cc index 5a6862f380d..cc57a25a1fb 100644 --- a/paddle/fluid/operators/meshgrid_op.cc +++ b/paddle/fluid/operators/meshgrid_op.cc @@ -16,10 +16,9 @@ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" - -#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/multiary.h" diff --git a/paddle/fluid/operators/miopen_lstm_cache.h b/paddle/fluid/operators/miopen_lstm_cache.h index c307218baa4..045f917de70 100644 --- a/paddle/fluid/operators/miopen_lstm_cache.h +++ b/paddle/fluid/operators/miopen_lstm_cache.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" diff --git a/paddle/fluid/operators/miopen_rnn_cache.h b/paddle/fluid/operators/miopen_rnn_cache.h index 38cea39abd5..438163cd77e 100644 --- a/paddle/fluid/operators/miopen_rnn_cache.h +++ b/paddle/fluid/operators/miopen_rnn_cache.h @@ -15,6 +15,7 @@ limitations under the License. 
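[Editor's note] The PADDLE_ENFORCE_* rewrites, as in mean_op_xpu.cc above, all follow one rule: break right after the opening parenthesis so the condition arguments and the error factory each get whole lines. The macro below is a simplified local stand-in, not Paddle's, used only to show the shape:

    #include <cstdio>
    #include <stdexcept>

    // Simplified stand-in for PADDLE_ENFORCE_EQ, layout demonstration only.
    #define ENFORCE_EQ(a, b, msg)                         \
      do {                                                \
        if ((a) != (b)) throw std::invalid_argument(msg); \
      } while (0)

    int main() {
      int numel = 1;
      // New layout: break after '(' so the message stays on its own line.
      ENFORCE_EQ(
          numel, 1,
          "Mean Gradient should be scalar");
      std::puts("ok");
    }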
*/ #pragma once #include + #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index 393247644c2..db74b24b405 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -30,11 +30,11 @@ class MKLDNNDeviceContext; namespace paddle { namespace operators { -using framework::DataLayout; -using framework::Tensor; using dnnl::memory; using dnnl::primitive; using dnnl::stream; +using framework::DataLayout; +using framework::Tensor; using platform::GetMKLDNNFormat; using platform::MKLDNNDeviceContext; using platform::to_void_cast; diff --git a/paddle/fluid/operators/mkldnn/axpy_handler.cc b/paddle/fluid/operators/mkldnn/axpy_handler.cc index ee630fe186a..80f74195d8e 100644 --- a/paddle/fluid/operators/mkldnn/axpy_handler.cc +++ b/paddle/fluid/operators/mkldnn/axpy_handler.cc @@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/mkldnn/axpy_handler.h" + #include #include #include #include #include "dnnl.hpp" -#include "paddle/fluid/operators/mkldnn/axpy_handler.h" #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/mkldnn_helper.h" @@ -82,7 +83,7 @@ static void naive_axpy(int n, T alpha, const T *x, T *y) { } } -} // anonnymouse namespace +} // namespace template class OneDNNAXPYHandler::Impl { diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc index 5095fa06719..0881baa6f8e 100644 --- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/operators/concat_op.h" #include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/mkldnn_helper.h" @@ -21,13 +22,13 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using framework::DataLayout; -using framework::Tensor; -using framework::LoDTensor; +using dnnl::concat; using dnnl::memory; using dnnl::primitive; -using dnnl::concat; using dnnl::stream; +using framework::DataLayout; +using framework::LoDTensor; +using framework::Tensor; using platform::to_void_cast; template diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index fba17d303f2..65092e059f4 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -203,8 +203,9 @@ class ConvMKLDNNHandlerT dnnl::memory::desc src_md, weights_md; if (platform::is_int8()) { src_md = platform::MKLDNNMemDesc( - src_tz, framework::ToMKLDNNDataType( - framework::TransToProtoVarType(input->dtype())), + src_tz, + framework::ToMKLDNNDataType( + framework::TransToProtoVarType(input->dtype())), chosen_memory_format); weights_md = platform::MKLDNNMemDesc( weights_tz, dnnl::memory::data_type::s8, chosen_memory_format); @@ -459,13 +460,12 @@ class ConvMKLDNNHandlerT auto scale_weights_data = ctx.Attr>("Scale_weights"); bool is_multi_channel = scale_weights_data.size() > 1; bool has_activation = !ctx.Attr("fuse_activation").empty(); - float activation_scale = - force_fp32_output ? 1.0f : has_activation ? ctx.Attr("Scale_out") - : 1.0f; - auto scale_out_data = - force_fp32_output ? 1.0f : has_activation - ? 1.0f - : ctx.Attr("Scale_out"); + float activation_scale = force_fp32_output ? 1.0f + : has_activation ? ctx.Attr("Scale_out") + : 1.0f; + auto scale_out_data = force_fp32_output ? 1.0f + : has_activation ? 1.0f + : ctx.Attr("Scale_out"); float sum_scale = fuse_residual_conn ? scale_out_data / scale_in_eltwise_data : 1.0f; int count = diff --git a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc index 747e4603d7f..e507b2429b7 100644 --- a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc @@ -28,8 +28,8 @@ using dnnl::primitive; using dnnl::reorder; using platform::to_void_cast; using Tensor = framework::Tensor; -using framework::DataLayout; using dnnl::stream; +using framework::DataLayout; using platform::GetMKLDNNFormat; template diff --git a/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc index 91dccbee0ae..035add5fd83 100644 --- a/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc @@ -18,11 +18,11 @@ limitations under the License. 
*/ namespace { -using paddle::framework::Tensor; -using phi::vectorize; -using paddle::framework::GradVarName; using paddle::framework::ExecutionContext; +using paddle::framework::GradVarName; +using paddle::framework::Tensor; using paddle::platform::MKLDNNDeviceContext; +using phi::vectorize; template class ExpandMKLDNNKernel : public paddle::framework::OpKernel { diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index 4078d012fce..5cbcad5d965 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -31,19 +31,19 @@ class MKLDNNDeviceContext; namespace paddle { namespace operators { +using dnnl::inner_product_forward; +using dnnl::memory; +using dnnl::primitive; +using dnnl::prop_kind; +using dnnl::stream; using framework::DataLayout; -using framework::Tensor; -using framework::LoDTensor; using framework::DDim; using framework::ExecutionContext; +using framework::LoDTensor; +using framework::Tensor; +using platform::GetMKLDNNFormat; using platform::MKLDNNDeviceContext; using platform::to_void_cast; -using platform::GetMKLDNNFormat; -using dnnl::memory; -using dnnl::inner_product_forward; -using dnnl::primitive; -using dnnl::stream; -using dnnl::prop_kind; template class FCPrimitiveFactory { diff --git a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc index 37d6c072903..a53a30b737d 100644 --- a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc @@ -19,12 +19,12 @@ namespace paddle { namespace operators { -using framework::DataLayout; using dnnl::memory; using dnnl::primitive; using dnnl::reorder; -using dnnl::stream; using dnnl::resampling_forward; +using dnnl::stream; +using framework::DataLayout; using platform::GetMKLDNNFormat; using platform::to_void_cast; @@ -114,9 +114,10 @@ class InterpolateMKLDNNKernel : public framework::OpKernel { PADDLE_ENFORCE_GT(std::all_of(out_dims.begin(), out_dims.end(), [](int i) { return i > 0; }), - 0, platform::errors::InvalidArgument( - "out_d, out_h, out_w of Op(interpolate) " - "should be greater than 0.")); + 0, + platform::errors::InvalidArgument( + "out_d, out_h, out_w of Op(interpolate) " + "should be greater than 0.")); const std::vector nc_dims = {in_dims[0], in_dims[1]}; out_dims.insert(out_dims.begin(), nc_dims.begin(), nc_dims.end()); diff --git a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc index e9abe84e679..8921db6cbce 100644 --- a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc @@ -13,19 +13,21 @@ See the License for the specific language governing permissions and limitations under the License. 
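[Editor's note] The reordered using-declarations in these oneDNN kernels come from clang-format's SortUsingDeclarations option: aliases are sorted by fully qualified name, which is why the dnnl:: entries now precede the framework:: ones. A compilable miniature (the namespaces here are mocks, not the real libraries):

    namespace dnnl {
    struct memory {};
    struct stream {};
    }  // namespace dnnl
    namespace framework {
    struct DataLayout {};
    struct Tensor {};
    }  // namespace framework

    // Sorted by qualified name, matching the hunks above.
    using dnnl::memory;
    using dnnl::stream;
    using framework::DataLayout;
    using framework::Tensor;

    int main() {
      memory m;
      Tensor t;
      (void)m;
      (void)t;
    }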
*/ #include "paddle/fluid/operators/mkldnn/matmul_mkldnn_op.h" + #include + #include "paddle/fluid/framework/convert_utils.h" using dnnl::memory; using dnnl::primitive; using paddle::framework::DataLayout; using paddle::framework::ExecutionContext; -using phi::vectorize; using paddle::platform::GetMKLDNNFormat; -using paddle::platform::MKLDNNFormatForSize; using paddle::platform::MKLDNNDeviceContext; +using paddle::platform::MKLDNNFormatForSize; using paddle::platform::MKLDNNGetDataType; using paddle::platform::to_void_cast; +using phi::vectorize; using Tensor = paddle::framework::Tensor; namespace { diff --git a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.h b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.h index 583dcd04018..07cb2173a7e 100644 --- a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.h +++ b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.h @@ -22,8 +22,8 @@ limitations under the License. */ namespace paddle { namespace operators { -using platform::MKLDNNDeviceContext; using framework::ExecutionContext; +using platform::MKLDNNDeviceContext; using Tensor = framework::Tensor; template diff --git a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc index 6e7ba59cf1a..424faf30d3a 100644 --- a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc @@ -20,8 +20,8 @@ using dnnl::memory; using dnnl::primitive; using paddle::framework::DataLayout; using paddle::framework::ExecutionContext; -using paddle::platform::MatMulV2MKLDNNHandler; using paddle::platform::GetMKLDNNFormat; +using paddle::platform::MatMulV2MKLDNNHandler; using paddle::platform::MKLDNNDeviceContext; using paddle::platform::MKLDNNGetDataType; using paddle::platform::to_void_cast; @@ -206,11 +206,12 @@ class MatMulV2MKLDNNKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE_EQ( x_bd_dims[i] == y_bd_dims[i] || x_bd_dims[i] == 1 || y_bd_dims[i] == 1, - true, paddle::platform::errors::InvalidArgument( - "Tensor dimensions are incorrect for broadcasting." - "Dimensions in X and Y must be same or equal to 1, but " - "received x_dim[%d]=%d and y_dims[%d]= %d", - i, x_bd_dims[i], i, y_bd_dims[i])); + true, + paddle::platform::errors::InvalidArgument( + "Tensor dimensions are incorrect for broadcasting." + "Dimensions in X and Y must be same or equal to 1, but " + "received x_dim[%d]=%d and y_dims[%d]= %d", + i, x_bd_dims[i], i, y_bd_dims[i])); out_dims[i] = std::max(x_bd_dims[i], y_bd_dims[i]); } out->Resize(phi::make_ddim(out_dims)); diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc index 77763531c82..dbf3adcdad0 100644 --- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc @@ -20,14 +20,14 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using framework::DataLayout; -using framework::Tensor; using dnnl::memory; using dnnl::pooling_backward; using dnnl::pooling_forward; using dnnl::primitive; using dnnl::reorder; using dnnl::stream; +using framework::DataLayout; +using framework::Tensor; using platform::to_void_cast; template diff --git a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc index 8cbe46bee48..8f3a3e8ba65 100644 --- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc @@ -27,8 +27,8 @@ using dnnl::primitive; using dnnl::reorder; using platform::to_void_cast; using Tensor = framework::Tensor; -using framework::DataLayout; using dnnl::stream; +using framework::DataLayout; using platform::GetMKLDNNFormat; template diff --git a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc index 9a7ac6d5055..778a33f27af 100644 --- a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc @@ -46,10 +46,12 @@ class ReQuantOpKernel : public framework::OpKernel { bool with_shift = shift_in != 0.0f || shift_out != 0.0f; auto* output = ctx.Output("Output"); - PADDLE_ENFORCE_NE(scale_in, 0.0f, platform::errors::InvalidArgument( - "Scale of input cannot be 0.0")); - PADDLE_ENFORCE_NE(scale_out, 0.0f, platform::errors::InvalidArgument( - "Scale of output cannot be 0.0")); + PADDLE_ENFORCE_NE( + scale_in, 0.0f, + platform::errors::InvalidArgument("Scale of input cannot be 0.0")); + PADDLE_ENFORCE_NE( + scale_out, 0.0f, + platform::errors::InvalidArgument("Scale of output cannot be 0.0")); if (shift_in != 0.0f) { PADDLE_ENFORCE_EQ( framework::TransToProtoVarType(input->dtype()), diff --git a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc index a21034d48ba..f1c5153240e 100644 --- a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc @@ -31,8 +31,8 @@ namespace paddle { namespace operators { using paddle::framework::LoDTensor; -using platform::to_void_cast; using platform::GetMKLDNNFormat; +using platform::to_void_cast; static std::vector extract_shape( const std::vector& list_new_shape_tensor) { diff --git a/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc index 28a00be5fa4..798fe51901d 100644 --- a/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc @@ -17,13 +17,13 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using framework::DataLayout; -using framework::Tensor; -using framework::LoDTensor; +using dnnl::concat; using dnnl::memory; using dnnl::primitive; -using dnnl::concat; using dnnl::stream; +using framework::DataLayout; +using framework::LoDTensor; +using framework::Tensor; using platform::to_void_cast; template diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc index de21c2687bd..b564602fdaa 100644 --- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc @@ -116,8 +116,9 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { const auto& mkldnn_engine = dev_ctx.GetEngine(); auto in_vars = ctx.MultiInputVar("X"); - PADDLE_ENFORCE_NE(in_vars.empty(), true, platform::errors::InvalidArgument( - "Input variable is empty.")); + PADDLE_ENFORCE_NE( + in_vars.empty(), true, + platform::errors::InvalidArgument("Input variable is empty.")); auto& input0 = in_vars[0]->Get(); LoDTensor* output = ctx.Output("Out"); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc index b5fb0c54c78..1e04cc8a8a5 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc @@ -16,6 +16,7 @@ #include #include #include + #include "gtest/gtest.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" @@ -121,8 +122,9 @@ void RunOperator(const platform::Place &place, const std::string &op_type, auto op = num_inputs[op_type] > 1 ? framework::OpRegistry::CreateOp( - op_type, {{first_input_var_name, {first_input}}, - {second_input_var_name, {"x1"}}}, + op_type, + {{first_input_var_name, {first_input}}, + {second_input_var_name, {"x1"}}}, {{output_var_name, {output_name}}}, {{"use_mkldnn", {true}}}) : framework::OpRegistry::CreateOp( op_type, {{first_input_var_name, {first_input}}}, diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc index 4090d5ffca8..a1acf3706c5 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc @@ -16,6 +16,7 @@ #include #include #include + #include "gtest/gtest.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc index b9866ba8c36..f4b79a02163 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc @@ -16,6 +16,7 @@ #include #include #include + #include "gtest/gtest.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc index ee992277314..13f9dba9eeb 100644 --- a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc @@ -66,7 +66,7 @@ class TransposeMKLDNNHandler { protected: dnnl::memory::desc Axis2MemoryDesc(std::vector& nchw_tz, // NOLINT std::vector& axis // NOLINT - ) { + ) { size_t ndims = axis.size(); std::vector strides(ndims); diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index 9d3b8e2407f..1ff27454013 100644 --- 
a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/mlu/mlu_baseop.h" + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/framework.pb.h" diff --git a/paddle/fluid/operators/mode_op.cc b/paddle/fluid/operators/mode_op.cc index 9c16ccb138f..d946f177545 100644 --- a/paddle/fluid/operators/mode_op.cc +++ b/paddle/fluid/operators/mode_op.cc @@ -13,10 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" - -#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/unary.h" diff --git a/paddle/fluid/operators/modified_huber_loss_op.cc b/paddle/fluid/operators/modified_huber_loss_op.cc index 9a53c7162ff..4216ee097be 100644 --- a/paddle/fluid/operators/modified_huber_loss_op.cc +++ b/paddle/fluid/operators/modified_huber_loss_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/modified_huber_loss_op.h" + #include namespace paddle { @@ -29,10 +30,11 @@ class ModifiedHuberLossOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("X"); auto y_dims = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_EQ(x_dims.size(), 2, platform::errors::InvalidArgument( - "Input(input) rank should be 2, " - "but received input rank(%d) != 2", - x_dims.size())); + PADDLE_ENFORCE_EQ( + x_dims.size(), 2, + platform::errors::InvalidArgument("Input(input) rank should be 2, " + "but received input rank(%d) != 2", + x_dims.size())); if (ctx->IsRuntime() || (phi::product(x_dims) > 0 && phi::product(y_dims) > 0)) { diff --git a/paddle/fluid/operators/modified_huber_loss_op.cu b/paddle/fluid/operators/modified_huber_loss_op.cu index 8f1894b5af0..ad34a54a9bf 100644 --- a/paddle/fluid/operators/modified_huber_loss_op.cu +++ b/paddle/fluid/operators/modified_huber_loss_op.cu @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/modified_huber_loss_op.h" #include "paddle/phi/core/hostdevice.h" diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc index ef04d5582d3..b31935cefc2 100644 --- a/paddle/fluid/operators/mul_op.cc +++ b/paddle/fluid/operators/mul_op.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" diff --git a/paddle/fluid/operators/mul_op_xpu.cc b/paddle/fluid/operators/mul_op_xpu.cc index 7410b3b607c..9f52dc8559d 100644 --- a/paddle/fluid/operators/mul_op_xpu.cc +++ b/paddle/fluid/operators/mul_op_xpu.cc @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/xpu_api_wrapper.h" #include "paddle/fluid/platform/device/device_wrapper.h" diff --git a/paddle/fluid/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc index 4e6ad35e612..72243b408f4 100644 --- a/paddle/fluid/operators/multiplex_op.cc +++ b/paddle/fluid/operators/multiplex_op.cc @@ -17,7 +17,6 @@ limitations under the License. */ #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" - #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/multiary.h" diff --git a/paddle/fluid/operators/nanmedian_op.cc b/paddle/fluid/operators/nanmedian_op.cc index 23a497bdb1d..63bfea650ac 100644 --- a/paddle/fluid/operators/nanmedian_op.cc +++ b/paddle/fluid/operators/nanmedian_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/operators/nccl/nccl_gpu_common.cc b/paddle/fluid/operators/nccl/nccl_gpu_common.cc index bcbc96ea1b6..8a0112fa11d 100644 --- a/paddle/fluid/operators/nccl/nccl_gpu_common.cc +++ b/paddle/fluid/operators/nccl/nccl_gpu_common.cc @@ -25,7 +25,7 @@ size_t last_num_gpus = -1; // TODO(panyx0718): Need to decide whether Paddle supports parallel // runs with different number GPUs. If true, current solution is not enough. std::mutex comm_mu; -} +} // namespace int Communicator::GetCommId(int device_id) const { std::lock_guard guard(comm_mu); diff --git a/paddle/fluid/operators/nccl/nccl_op.cu.cc b/paddle/fluid/operators/nccl/nccl_op.cu.cc index 65c3447ff23..b99800ecd64 100644 --- a/paddle/fluid/operators/nccl/nccl_op.cu.cc +++ b/paddle/fluid/operators/nccl/nccl_op.cu.cc @@ -19,9 +19,9 @@ limitations under the License. */ namespace paddle { namespace operators { +using framework::LoDTensor; using framework::Tensor; using platform::Communicator; -using framework::LoDTensor; template class NCCLTypeWrapper; diff --git a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc index 80144c6f258..21649bfcd37 100644 --- a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc +++ b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include #include // NOLINT #include // NOLINT diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index c8af2415594..38c9b809eb6 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -15,11 +15,13 @@ limitations under the License. */ #pragma once #include + #include #include #include #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/selected_rows_utils.h" diff --git a/paddle/fluid/operators/nll_loss_op.cc b/paddle/fluid/operators/nll_loss_op.cc index 8f14bc10d50..d3cbec495fd 100644 --- a/paddle/fluid/operators/nll_loss_op.cc +++ b/paddle/fluid/operators/nll_loss_op.cc @@ -14,6 +14,7 @@ limitations under the License. 
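[Editor's note] Two small fixes above (the misspelled "anonnymouse namespace" comment in axpy_handler.cc and the bare closing brace in nccl_gpu_common.cc) converge on the same convention: every namespace, named or anonymous, closes with a "// namespace" comment. Minimal form:

    namespace {
    constexpr int kAnswer = 42;
    }  // namespace

    int main() { return kAnswer == 42 ? 0 : 1; }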
*/ #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/infermeta/backward.h" diff --git a/paddle/fluid/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc index 51daccce0e8..0a1f647627a 100644 --- a/paddle/fluid/operators/norm_op.cc +++ b/paddle/fluid/operators/norm_op.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/infermeta/unary.h" diff --git a/paddle/fluid/operators/norm_utils.cu.h b/paddle/fluid/operators/norm_utils.cu.h index 0ed1f2719de..18ae152a689 100644 --- a/paddle/fluid/operators/norm_utils.cu.h +++ b/paddle/fluid/operators/norm_utils.cu.h @@ -450,27 +450,27 @@ void NormDoubleGradFunctor(const DeviceContext &ctx, set_constant(ctx, dX, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { - DoubleGradComputeDXWithGlobal< - T, DataLayout::kNHWC><<>>( - dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num, - dx_data); + DoubleGradComputeDXWithGlobal + <<>>(dy_data, ddscale_data, + variance_data, epsilon, C, + sample_size, num, dx_data); } else { - DoubleGradComputeDXWithGlobal< - T, DataLayout::kNCHW><<>>( - dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num, - dx_data); + DoubleGradComputeDXWithGlobal + <<>>(dy_data, ddscale_data, + variance_data, epsilon, C, + sample_size, num, dx_data); } } else { if (data_layout == DataLayout::kNHWC) { - DoubleGradComputeDX< - T, block, DataLayout::kNHWC><<>>( - x_data, mean_data, variance_data, ddx_data, dy_data, scale_data, - ddscale_data, N, C, sample_size, epsilon, dx_data); + DoubleGradComputeDX + <<>>( + x_data, mean_data, variance_data, ddx_data, dy_data, scale_data, + ddscale_data, N, C, sample_size, epsilon, dx_data); } else { - DoubleGradComputeDX< - T, block, DataLayout::kNCHW><<>>( - x_data, mean_data, variance_data, ddx_data, dy_data, scale_data, - ddscale_data, N, C, sample_size, epsilon, dx_data); + DoubleGradComputeDX + <<>>( + x_data, mean_data, variance_data, ddx_data, dy_data, scale_data, + ddscale_data, N, C, sample_size, epsilon, dx_data); } } } @@ -479,27 +479,27 @@ void NormDoubleGradFunctor(const DeviceContext &ctx, set_constant(ctx, dScale, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { - DoubleGradComputeDScaleWithGlobal< - T, block, DataLayout::kNHWC><<>>( - ddx_data, variance_data, dy_data, epsilon, N, C, sample_size, - dscale_data); + DoubleGradComputeDScaleWithGlobal + <<>>(ddx_data, variance_data, dy_data, + epsilon, N, C, sample_size, + dscale_data); } else { - DoubleGradComputeDScaleWithGlobal< - T, block, DataLayout::kNCHW><<>>( - ddx_data, variance_data, dy_data, epsilon, N, C, sample_size, - dscale_data); + DoubleGradComputeDScaleWithGlobal + <<>>(ddx_data, variance_data, dy_data, + epsilon, N, C, sample_size, + dscale_data); } } else { if (data_layout == DataLayout::kNHWC) { - DoubleGradComputeDScale< - T, block, DataLayout::kNHWC><<>>( - x_data, mean_data, variance_data, ddx_data, dy_data, N, C, - sample_size, epsilon, dscale_data); + DoubleGradComputeDScale + <<>>( + x_data, mean_data, variance_data, ddx_data, dy_data, N, C, + sample_size, epsilon, dscale_data); } else { - DoubleGradComputeDScale< - T, block, DataLayout::kNCHW><<>>( - x_data, mean_data, variance_data, ddx_data, dy_data, N, C, - sample_size, epsilon, dscale_data); + DoubleGradComputeDScale + <<>>( + 
x_data, mean_data, variance_data, ddx_data, dy_data, N, C, + sample_size, epsilon, dscale_data); } } } @@ -508,27 +508,29 @@ void NormDoubleGradFunctor(const DeviceContext &ctx, set_constant(ctx, ddY, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { - DoubleGradComputeDDYWithGlobal< - T, DataLayout::kNHWC><<>>( - ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data, - ddscale_data, epsilon, C, sample_size, num, ddy_data); + DoubleGradComputeDDYWithGlobal + <<>>( + ddx_data, scale_data, mean_data, variance_data, x_data, + ddbias_data, ddscale_data, epsilon, C, sample_size, num, + ddy_data); } else { - DoubleGradComputeDDYWithGlobal< - T, DataLayout::kNCHW><<>>( - ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data, - ddscale_data, epsilon, C, sample_size, num, ddy_data); + DoubleGradComputeDDYWithGlobal + <<>>( + ddx_data, scale_data, mean_data, variance_data, x_data, + ddbias_data, ddscale_data, epsilon, C, sample_size, num, + ddy_data); } } else { if (data_layout == DataLayout::kNHWC) { - DoubleGradComputeDDY< - T, block, DataLayout::kNHWC><<>>( - x_data, mean_data, variance_data, ddscale_data, ddbias_data, - ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data); + DoubleGradComputeDDY + <<>>( + x_data, mean_data, variance_data, ddscale_data, ddbias_data, + ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data); } else { - DoubleGradComputeDDY< - T, block, DataLayout::kNCHW><<>>( - x_data, mean_data, variance_data, ddscale_data, ddbias_data, - ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data); + DoubleGradComputeDDY + <<>>( + x_data, mean_data, variance_data, ddscale_data, ddbias_data, + ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data); } } } diff --git a/paddle/fluid/operators/norm_utils.h b/paddle/fluid/operators/norm_utils.h index fee06fe5dd4..36370245922 100644 --- a/paddle/fluid/operators/norm_utils.h +++ b/paddle/fluid/operators/norm_utils.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/number_count_op.cu b/paddle/fluid/operators/number_count_op.cu index 923d89c2485..2fc180fe678 100644 --- a/paddle/fluid/operators/number_count_op.cu +++ b/paddle/fluid/operators/number_count_op.cu @@ -97,13 +97,13 @@ class NumberCountOpCUDAKernel : public framework::OpKernel { auto out_data = number_count->mutable_data(out_dims, place); const T* gate_data = numbers->data(); - initialize_zero_kernel< - T><<>>( - out_data, upper_range); + initialize_zero_kernel + <<>>( + out_data, upper_range); - NumberCount< - T><<>>( - gate_data, out_data, batch_size, upper_range); + NumberCount + <<>>( + gate_data, out_data, batch_size, upper_range); } }; diff --git a/paddle/fluid/operators/one_hot_op.cc b/paddle/fluid/operators/one_hot_op.cc index 64323e588c6..e6b6320898f 100644 --- a/paddle/fluid/operators/one_hot_op.cc +++ b/paddle/fluid/operators/one_hot_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/one_hot_op.h" + #include #include diff --git a/paddle/fluid/operators/one_hot_op_npu.cc b/paddle/fluid/operators/one_hot_op_npu.cc index 24b506ebf8a..4e11cbb3888 100644 --- a/paddle/fluid/operators/one_hot_op_npu.cc +++ b/paddle/fluid/operators/one_hot_op_npu.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
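[Editor's note] The largest visual change in the .cu files (norm_utils.cu.h and number_count_op.cu above) is the updated kernel-launch rule: the <<<...>>> configuration moves to its own line after the template-id instead of being wedged inside it. A minimal CUDA sketch of the new shape (kernel and sizes are made up):

    #include <cstdio>
    #include <cuda_runtime.h>

    template <typename T>
    __global__ void FillKernel(T* out, T value, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) out[i] = value;
    }

    int main() {
      const int n = 256;
      float* d = nullptr;
      cudaMalloc(&d, n * sizeof(float));
      // New style: template arguments stay with the kernel name, the
      // launch configuration starts the next line.
      FillKernel<float>
          <<<1, n, 0, nullptr>>>(d, 1.0f, n);
      cudaDeviceSynchronize();
      cudaFree(d);
      std::puts("done");
      return 0;
    }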
*/ #include "paddle/fluid/operators/one_hot_op.h" - #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/one_hot_v2_op.cc b/paddle/fluid/operators/one_hot_v2_op.cc index 122b6a8a80a..cb7b9963bbd 100644 --- a/paddle/fluid/operators/one_hot_v2_op.cc +++ b/paddle/fluid/operators/one_hot_v2_op.cc @@ -14,6 +14,7 @@ #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/one_hot_v2_op_npu.cc b/paddle/fluid/operators/one_hot_v2_op_npu.cc index e5702a37bb2..dcf098f105c 100644 --- a/paddle/fluid/operators/one_hot_v2_op_npu.cc +++ b/paddle/fluid/operators/one_hot_v2_op_npu.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" - #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/optimizers/adagrad_op.cc b/paddle/fluid/operators/optimizers/adagrad_op.cc index 91bad143061..64f22cced3b 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/adagrad_op.cc @@ -15,13 +15,12 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/multiary.h" +#include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/optimizers/adam_op_npu.cc b/paddle/fluid/operators/optimizers/adam_op_npu.cc index 1ea91f6ebfa..e13805f694b 100644 --- a/paddle/fluid/operators/optimizers/adam_op_npu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_npu.cc @@ -183,16 +183,25 @@ class AdamNPUKernel : public framework::OpKernel { auto stream = ctx.template device_context() .stream(); - const auto& runner = - NpuOpRunner("ApplyAdamD", - { - *param, *mom1, *mom2, *beta1_pow, *beta2_pow, *lr, - *beta1_tensor, *beta2_tensor, *epsilon_tensor, *grad, - }, - { - *param_out, *mom1_out, *mom2_out, - }, - {}); + const auto& runner = NpuOpRunner("ApplyAdamD", + { + *param, + *mom1, + *mom2, + *beta1_pow, + *beta2_pow, + *lr, + *beta1_tensor, + *beta2_tensor, + *epsilon_tensor, + *grad, + }, + { + *param_out, + *mom1_out, + *mom2_out, + }, + {}); runner.Run(stream); // NOTE(zhiqiu): ApplyAdamD updates params inplace, so diff --git a/paddle/fluid/operators/optimizers/adam_op_xpu.cc b/paddle/fluid/operators/optimizers/adam_op_xpu.cc index 6ea0b2054cd..37467c7ba96 100644 --- a/paddle/fluid/operators/optimizers/adam_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_xpu.cc @@ -306,8 +306,9 @@ class AdamOpXPUKernel : public framework::OpKernel { } xpu_wait(dev_ctx.x_context()->xpu_stream); } else { - PADDLE_ENFORCE_EQ(1, 2, platform::errors::InvalidArgument( - "Variable type not supported by adam_op")); + PADDLE_ENFORCE_EQ(1, 2, + platform::errors::InvalidArgument( + "Variable type not supported by adam_op")); } } }; diff --git a/paddle/fluid/operators/optimizers/adamw_op.cc b/paddle/fluid/operators/optimizers/adamw_op.cc index e2670625d4e..43e9dc0cae8 100644 --- a/paddle/fluid/operators/optimizers/adamw_op.cc +++ 
b/paddle/fluid/operators/optimizers/adamw_op.cc @@ -12,9 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/optimizers/adam_op.h" - #include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/operators/optimizers/adam_op.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/multiary.h" diff --git a/paddle/fluid/operators/optimizers/adamw_op_xpu.cc b/paddle/fluid/operators/optimizers/adamw_op_xpu.cc index d86d2bd2ffb..57a6b744fd6 100644 --- a/paddle/fluid/operators/optimizers/adamw_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/adamw_op_xpu.cc @@ -205,8 +205,9 @@ class AdamwOpXPUKernel : public framework::OpKernel { } } } else { - PADDLE_ENFORCE_EQ(1, 2, platform::errors::InvalidArgument( - "Variable type not supported by adamw_op")); + PADDLE_ENFORCE_EQ(1, 2, + platform::errors::InvalidArgument( + "Variable type not supported by adamw_op")); } } }; diff --git a/paddle/fluid/operators/optimizers/cast_with_ptr.h b/paddle/fluid/operators/optimizers/cast_with_ptr.h index a3fbb0e59e2..eb031ae0c93 100644 --- a/paddle/fluid/operators/optimizers/cast_with_ptr.h +++ b/paddle/fluid/operators/optimizers/cast_with_ptr.h @@ -43,9 +43,9 @@ static void VecCastKernel(const platform::CUDADeviceContext &ctx, const InT *x, in_arr[0] = reinterpret_cast(x); phi::Array<_ptr_ OutT *, 1> out_arr; out_arr[0] = y; - phi::funcs::VectorizedElementwiseKernel< - OutT, FunctorT, 1, 1, VecSize><<>>( - in_arr, out_arr, n, main_offset, FunctorT()); + phi::funcs::VectorizedElementwiseKernel + <<>>(in_arr, out_arr, n, main_offset, + FunctorT()); } } // namespace details diff --git a/paddle/fluid/operators/optimizers/dgc_momentum_op.cc b/paddle/fluid/operators/optimizers/dgc_momentum_op.cc index 7f0b2b7d064..40ac044e647 100644 --- a/paddle/fluid/operators/optimizers/dgc_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/dgc_momentum_op.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include - #include "paddle/fluid/operators/optimizers/dgc_momentum_op.h" +#include + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu index 3688b8067c2..7cbc52f4235 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu @@ -83,10 +83,12 @@ static void GetParamGradShardInfo(const std::vector &infos, VLOG(10) << "start_size = " << start_size << " , end_size = " << end_size; if (infos.empty()) { - PADDLE_ENFORCE_EQ(start_size, 0, platform::errors::InvalidArgument( - "start_size should be 0.")); - PADDLE_ENFORCE_EQ(end_size, 0, platform::errors::InvalidArgument( - "end_size should be 0.")); + PADDLE_ENFORCE_EQ( + start_size, 0, + platform::errors::InvalidArgument("start_size should be 0.")); + PADDLE_ENFORCE_EQ( + end_size, 0, + platform::errors::InvalidArgument("end_size should be 0.")); *start_idx = 0; *end_idx = 0; *start_numel_offset = 0; @@ -104,15 +106,17 @@ static void GetParamGradShardInfo(const std::vector &infos, infos.begin()); if (i == n || infos[i].numel_offset != start_size) { PADDLE_ENFORCE_GT( - i, 0, platform::errors::InvalidArgument( - "Cannot find suitable sharding which is between [%d, %d)", - start_size, end_size)); + i, 0, + platform::errors::InvalidArgument( + "Cannot find suitable sharding which is between [%d, %d)", + start_size, end_size)); --i; } PADDLE_ENFORCE_LT( - i, n, platform::errors::InvalidArgument( - "Cannot find suitable sharding which is between [%d, %d)", - start_size, end_size)); + i, n, + platform::errors::InvalidArgument( + "Cannot find suitable sharding which is between [%d, %d)", start_size, + end_size)); *start_idx = i; *start_numel_offset = start_size - infos[i].numel_offset; auto j = static_cast( @@ -450,8 +454,9 @@ class DistributedFusedLambInitOpKernel platform::errors::InvalidArgument( "The attr(alignment) should be the power of 2.")); PADDLE_ENFORCE_GE( - rank, 0, platform::errors::InvalidArgument( - "The attr(rank) should be equal to or larger than 0.")); + rank, 0, + platform::errors::InvalidArgument( + "The attr(rank) should be equal to or larger than 0.")); PADDLE_ENFORCE_LT( rank, nranks, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index c857c6de4d0..eb354ef6d75 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -13,6 +13,7 @@ // limitations under the License. 
#include + #include "paddle/fluid/memory/buffer.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/optimizers/cast_with_ptr.h" @@ -32,6 +33,7 @@ #ifdef __HIPCC__ #include + #include "math.h" // NOLINT namespace cub = hipcub; #endif @@ -190,9 +192,8 @@ static void MultiTensorL2Norm(const platform::CUDAPlace &place, PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAUNCH_MULTI_TENSOR_APPLY_L2_NORM_KERNEL); #undef PD_LAUNCH_MULTI_TENSOR_APPLY_L2_NORM_KERNEL - MultiTensorL2NormReduceAgainCUDAKernel< - MT, OutT, kBlockDim><<>>(tmp_out_ptr, y, - max_chunk_num); + MultiTensorL2NormReduceAgainCUDAKernel + <<>>(tmp_out_ptr, y, max_chunk_num); } template @@ -508,14 +509,14 @@ static void MultiTensorUpdateLambMomentAndTrustRatioDiv( "Output(Step) cannot be nullptr.")); } -#define PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL \ - do { \ - UpdateLambMomentAndTrustRatioDivCUDAKernel<<< \ - config.block_per_grid, config.thread_per_block, 0, stream>>>( \ - param_p, grad_p, square_grad_norm_p, global_scale, beta1pow_p, \ - beta2pow_p, mom1_p, mom2_p, trust_ratio_div_p, found_inf_p, step, \ - weight_decay, weight_decay_end_numel, beta1, beta2, epsilon, \ - max_global_grad_norm, numel, rescale_grad); \ +#define PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL \ + do { \ + UpdateLambMomentAndTrustRatioDivCUDAKernel \ + <<>>( \ + param_p, grad_p, square_grad_norm_p, global_scale, beta1pow_p, \ + beta2pow_p, mom1_p, mom2_p, trust_ratio_div_p, found_inf_p, step, \ + weight_decay, weight_decay_end_numel, beta1, beta2, epsilon, \ + max_global_grad_norm, numel, rescale_grad); \ } while (0) PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL); @@ -705,8 +706,9 @@ static void MultiTensorUpdateLambParamAndBetaPows( PADDLE_ENFORCE_NOT_NULL(beta2pow, platform::errors::InvalidArgument( "Beta2Pow should not be nullptr.")); } else { - PADDLE_ENFORCE_EQ(beta2pow, nullptr, platform::errors::InvalidArgument( - "Beta2Pow should be nullptr.")); + PADDLE_ENFORCE_EQ( + beta2pow, nullptr, + platform::errors::InvalidArgument("Beta2Pow should be nullptr.")); } const int block_dim = 512; @@ -744,21 +746,21 @@ static void MultiTensorUpdateLambParamAndBetaPows( betapow_helper); \ } while (0) -#define PD_LAUNCH_VEC_MULTI_TENSOR_UPDATE_PARAM_BETAPOW_CASE \ - do { \ - auto callback = [&]( \ - const MultiTensorLauncher &launcher, \ - int launch_n) { \ - if (has_beta_pow && launch_n == 0) { \ - PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(true); \ - beta1pow = nullptr; \ - beta2pow = nullptr; \ - } else { \ - PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(false); \ - } \ - }; \ - MultiTensorApplyWithCallback( \ - stream, offsets, n, chunk_size, block_dim, callback); \ +#define PD_LAUNCH_VEC_MULTI_TENSOR_UPDATE_PARAM_BETAPOW_CASE \ + do { \ + auto callback = \ + [&](const MultiTensorLauncher &launcher, \ + int launch_n) { \ + if (has_beta_pow && launch_n == 0) { \ + PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(true); \ + beta1pow = nullptr; \ + beta2pow = nullptr; \ + } else { \ + PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(false); \ + } \ + }; \ + MultiTensorApplyWithCallback( \ + stream, offsets, n, chunk_size, block_dim, callback); \ } while (0) PD_VEC_LAUNCH_KERNEL(vec_size, @@ -793,11 +795,11 @@ static void LaunchScaleKernel(const platform::CUDADeviceContext &dev_ctx, int vec_size = std::min(GetChunkedVecSize(x, 0), GetChunkedVecSize(y, 0)); auto config = platform::GetGpuLaunchConfig1D(dev_ctx, n, vec_size); -#define PD_LAMB_VEC_SCALE_KERNEL_CASE \ - do { \ - ScaleCUDAKernel<<>>( \ - x, scale, y, n); \ 
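[Editor's note] The macro-heavy hunks in distributed_fused_lamb_op.cu, including the one continuing just below, change no tokens: clang-format re-wraps each macro body and pins every continuation backslash to a single column. Reduced example:

    #include <cstdio>

    // Only layout differs from the pre-format version; the trailing
    // backslashes are aligned to one column.
    #define SQUARE_AND_PRINT(x)     \
      do {                          \
        const int sq = (x) * (x);   \
        std::printf("%d\n", sq);    \
      } while (0)

    int main() { SQUARE_AND_PRINT(7); }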
+#define PD_LAMB_VEC_SCALE_KERNEL_CASE \ + do { \ + ScaleCUDAKernel \ + <<>>( \ + x, scale, y, n); \ } while (0) PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAMB_VEC_SCALE_KERNEL_CASE); @@ -1015,7 +1017,7 @@ static void CheckHasNanInfGrad(const float *fp32_grad, int fp32_numel, if (fp32_numel > 0) { fp32_has_nan_inf = reinterpret_cast(nan_inf_flag + 1); cub::TransformInputIterator, const float *> - iter(fp32_grad, IsNanInfFunctor()); + iter(fp32_grad, IsNanInfFunctor()); CubDeviceReduce(iter, fp32_has_nan_inf, fp32_numel, OrFunctor(), false, stream, cub_tmp_buffer); } @@ -1082,11 +1084,11 @@ static void LaunchElementwiseAddWithCastKernel( GetChunkedVecSize(z, 0)); auto config = platform::GetGpuLaunchConfig1D(dev_ctx, n, vec_size); -#define PD_LAUNCH_ELEMENTWISE_ADD_WITH_CAST_KERNEL \ - do { \ - ElementwiseAddWithCastCUDAKernel<<< \ - config.block_per_grid, config.thread_per_block, 0, stream>>>(x, y, z, \ - n); \ +#define PD_LAUNCH_ELEMENTWISE_ADD_WITH_CAST_KERNEL \ + do { \ + ElementwiseAddWithCastCUDAKernel \ + <<>>(x, y, \ + z, n); \ } while (0) PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAUNCH_ELEMENTWISE_ADD_WITH_CAST_KERNEL); @@ -1445,10 +1447,10 @@ class DistributedFusedLambOpKernel if (is_grad_scaled_by_nranks) { clip_scale *= num_devices; } - CalcGradNormClipBeforeAllReduceScale< - float, platform::float16><<<1, 1, 0, stream>>>( - global_scale, max_global_grad_norm, fp32_square_grad_norm, - fp32_scale, fp16_scale, clip_scale); + CalcGradNormClipBeforeAllReduceScale + <<<1, 1, 0, stream>>>(global_scale, max_global_grad_norm, + fp32_square_grad_norm, fp32_scale, fp16_scale, + clip_scale); if (fp32_scale) { VLOG(1) << "Grad scale: " << FlattenToString(fp32_scale, 1, place); } else { @@ -1567,11 +1569,12 @@ class DistributedFusedLambOpKernel fp16_partial_fused_offsets, fp16_local_param_num, param_square_norm + fp16_local_start_idx); } else { - MultiTensorL2Norm( - place, stream, fp16_param + fused_offsets[fp16_local_start_idx] - - fused_offsets[fp32_global_param_num], - fused_offsets + fp16_local_start_idx, fp16_local_param_num, - param_square_norm + fp16_local_start_idx); + MultiTensorL2Norm(place, stream, + fp16_param + fused_offsets[fp16_local_start_idx] - + fused_offsets[fp32_global_param_num], + fused_offsets + fp16_local_start_idx, + fp16_local_param_num, + param_square_norm + fp16_local_start_idx); } MultiTensorL2Norm(place, stream, trust_ratio_div, diff --git a/paddle/fluid/operators/optimizers/dpsgd_op.h b/paddle/fluid/operators/optimizers/dpsgd_op.h index 688a7f1ad84..69a853c5d18 100644 --- a/paddle/fluid/operators/optimizers/dpsgd_op.h +++ b/paddle/fluid/operators/optimizers/dpsgd_op.h @@ -15,7 +15,9 @@ limitations under the License. */ #pragma once #include #include + #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -108,9 +110,8 @@ class DpsgdOpKernel : public framework::OpKernel { // update parameters for (int64_t i = 0; i < grad->numel(); ++i) { - out_data[i] = - param_data[i] - - lr[0] * (grad_data[i] / scale + gaussian_noise / batch_size); + out_data[i] = param_data[i] - lr[0] * (grad_data[i] / scale + + gaussian_noise / batch_size); } // CCS16 - Deep Learning with Differential Privacy. 
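[Editor's note] The reflowed DPSGD update above keeps the same arithmetic, just rebalanced across lines: parameter minus learning rate times (scaled gradient plus batch-averaged Gaussian noise). A plain sketch of that step (names illustrative, noise generation omitted):

    #include <cstddef>
    #include <vector>

    // One DPSGD step per element:
    //   param -= lr * (grad / scale + gaussian_noise / batch_size)
    void DpsgdStep(std::vector<float>* param, const std::vector<float>& grad,
                   float lr, float scale, float gaussian_noise,
                   float batch_size) {
      for (std::size_t i = 0; i < param->size(); ++i) {
        (*param)[i] -= lr * (grad[i] / scale + gaussian_noise / batch_size);
      }
    }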
// [https://arxiv.org/abs/1607.00133] diff --git a/paddle/fluid/operators/optimizers/ftrl_op.h b/paddle/fluid/operators/optimizers/ftrl_op.h index 596ed05df3f..73fd7ceb67b 100644 --- a/paddle/fluid/operators/optimizers/ftrl_op.h +++ b/paddle/fluid/operators/optimizers/ftrl_op.h @@ -74,9 +74,8 @@ class SparseFTRLFunctor { l_acc_out_[j] += g - (std::sqrt(new_acc) - std::sqrt(s_acc)) / lr * p; } else { l_acc_out_[j] += - g - - (std::pow(new_acc, -lr_power_) - std::pow(s_acc, -lr_power_)) / lr * - p; + g - (std::pow(new_acc, -lr_power_) - std::pow(s_acc, -lr_power_)) / + lr * p; } auto l_acc = l_acc_out_[j]; diff --git a/paddle/fluid/operators/optimizers/lamb_op.cc b/paddle/fluid/operators/optimizers/lamb_op.cc index 48ceba3695f..fb2a78d28ed 100644 --- a/paddle/fluid/operators/optimizers/lamb_op.cc +++ b/paddle/fluid/operators/optimizers/lamb_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/optimizers/lamb_op.h" + #include + #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { @@ -227,13 +229,12 @@ REGISTER_OP_CPU_KERNEL( ops::LambOpKernel); /* ========================== register checkpoint ===========================*/ -REGISTER_OP_VERSION(lamb) - .AddCheckpoint( - R"ROC(Upgrade lamb, add two new outputs [Beta1PowOut] and [Beta2PowOut].)ROC", - paddle::framework::compatible::OpVersionDesc() - .NewInput("Beta1PowOut", - "The Output beta1 power accumulator. 'Beta1PowOut' is " - "dispensable.") - .NewInput("Beta2PowOut", - "The Output beta2 power accumulator. 'Beta2PowOut' is " - "dispensable.")); +REGISTER_OP_VERSION(lamb).AddCheckpoint( + R"ROC(Upgrade lamb, add two new outputs [Beta1PowOut] and [Beta2PowOut].)ROC", + paddle::framework::compatible::OpVersionDesc() + .NewInput("Beta1PowOut", + "The Output beta1 power accumulator. 'Beta1PowOut' is " + "dispensable.") + .NewInput("Beta2PowOut", + "The Output beta2 power accumulator. 'Beta2PowOut' is " + "dispensable.")); diff --git a/paddle/fluid/operators/optimizers/lamb_op.cu b/paddle/fluid/operators/optimizers/lamb_op.cu index b46fa19ea13..a9f880fdbb6 100644 --- a/paddle/fluid/operators/optimizers/lamb_op.cu +++ b/paddle/fluid/operators/optimizers/lamb_op.cu @@ -16,7 +16,8 @@ limitations under the License. */ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( - lamb, ops::LambOpKernel, + lamb, + ops::LambOpKernel, ops::LambOpKernel, ops::LambOpKernel); diff --git a/paddle/fluid/operators/optimizers/lamb_op.h b/paddle/fluid/operators/optimizers/lamb_op.h index 45acf2b3e48..2956ff20467 100644 --- a/paddle/fluid/operators/optimizers/lamb_op.h +++ b/paddle/fluid/operators/optimizers/lamb_op.h @@ -14,8 +14,10 @@ limitations under the License. */ #pragma once #include // for sqrt in CPU and CUDA + #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/buffer.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" @@ -655,10 +657,10 @@ class LambOpKernel : public framework::OpKernel { // TODO(zengjinle): remove the following Eigen operations when // *skip_update == true. memory::Buffer buffer(dev_ctx.GetPlace()); - math::SquaredL2Norm( - dev_ctx, reinterpret_cast(IsMultiPrecision ? master_param_ptr - : param_ptr), - p_norm_ptr, numel, &buffer); + math::SquaredL2Norm(dev_ctx, + reinterpret_cast( + IsMultiPrecision ? 
master_param_ptr : param_ptr), + p_norm_ptr, numel, &buffer); math::SquaredL2Norm(dev_ctx, trust_ratio_div_ptr, trust_ratio_div_norm_ptr, numel, &buffer); @@ -675,12 +677,12 @@ class LambOpKernel : public framework::OpKernel { #define CALL_PADDLE_UPDATE_LAMB_PARAM_FUNC(__should_update_beta_pow) \ do { \ LambParamUpateFunctor \ - param_update_functor( \ - lr.template data(), static_cast(param_ptr), \ - static_cast(master_param_ptr), p_norm_ptr, \ - trust_ratio_div_ptr, trust_ratio_div_norm_ptr, \ - static_cast(param_out_ptr), \ - static_cast(master_param_out_ptr), skip_update_flag); \ + param_update_functor( \ + lr.template data(), static_cast(param_ptr), \ + static_cast(master_param_ptr), p_norm_ptr, \ + trust_ratio_div_ptr, trust_ratio_div_norm_ptr, \ + static_cast(param_out_ptr), \ + static_cast(master_param_out_ptr), skip_update_flag); \ if (__should_update_beta_pow) { \ param_update_functor.SetBetaPows(beta1_pow_ptr, beta2_pow_ptr, \ beta1_pow_out_ptr, beta2_pow_out_ptr, \ diff --git a/paddle/fluid/operators/optimizers/lamb_op_xpu.cc b/paddle/fluid/operators/optimizers/lamb_op_xpu.cc index 7aa5783a01b..ef224382cd0 100644 --- a/paddle/fluid/operators/optimizers/lamb_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/lamb_op_xpu.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/optimizers/lamb_op.h" #include "gflags/gflags.h" +#include "paddle/fluid/operators/optimizers/lamb_op.h" #include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu index 5b883a11e57..553ac69edca 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu @@ -129,8 +129,9 @@ __device__ inline void VectorizeLarsUpdate( for (int i = tid + tail_offset; i < numel; i += grid_stride) { MT grad_val = static_cast(grad[i]) * rescale_grad; MT param_val = param[i]; - MT velocity_tmp = Fma(velocity[i], mu, local_lr * Fma(lars_weight_decay, - param_val, grad_val)); + MT velocity_tmp = + Fma(velocity[i], mu, + local_lr * Fma(lars_weight_decay, param_val, grad_val)); MT param_tmp = param_val - velocity_tmp; param_out[i] = static_cast(param_tmp); velocity_out[i] = velocity_tmp; @@ -314,10 +315,10 @@ inline void SeparatedLarsMomentumOpCUDAKernel( const MT rescale_grad, const int64_t numel, const MT* master_param_data, MT* master_out_data, const bool is_amp) { LarsThreadConfig lars_thread_config(numel); - L2NormKernel<<>>( - param_data, grad_data, p_buffer, g_buffer, numel, - lars_thread_config.repeat_times, rescale_grad); + L2NormKernel + <<>>(param_data, grad_data, p_buffer, g_buffer, numel, + lars_thread_config.repeat_times, rescale_grad); MomentumLarsKernel<<>>( diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc b/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc index 54ead6d3df7..280c0930e91 100644 --- a/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc +++ b/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
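Note: the distributed_fused_lamb and lars_momentum hunks above all apply the same new clang-format rule: the kernel name (with its template arguments) stays on one line and the <<<grid, block, shared_mem, stream>>> launch configuration breaks onto the next. A minimal compilable sketch of the resulting style; the kernel, its launch math, and every name below are illustrative, not taken from the patch:

__global__ void MyScaleKernel(const float* x, float scale, float* y, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[i] = scale * x[i];  // elementwise scale, one thread per element
}

void LaunchMyScale(const float* x, float scale, float* y, int n,
                   cudaStream_t stream) {
  int threads = 512;
  int blocks = (n + threads - 1) / threads;
  // Post-patch layout: kernel name on one line, launch config on the next.
  MyScaleKernel
      <<<blocks, threads, 0, stream>>>(x, scale, y, n);
}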
-#include "paddle/fluid/operators/optimizers/merged_momentum_op.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/operators/optimizers/merged_momentum_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc b/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc index 5fad5eca9af..d405500d607 100644 --- a/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc +++ b/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "paddle/fluid/operators/optimizers/merged_momentum_op.h" - #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/phi/kernels/impl/momentum_kernel_impl.h" @@ -151,10 +150,11 @@ class NPUMergedMomentumOpKernel : public framework::OpKernel { framework::TensorCopy(*param, ctx.GetPlace(), dev_ctx, param_out); framework::TensorCopy(*velocity, ctx.GetPlace(), dev_ctx, velocity_out); // NOTE: ApplyMomentum will change the input - const auto& runner = NpuOpRunner( - "ApplyMomentum", {*param_out, *velocity_out, *learning_rate, - regularized_grad, mu_tensor}, - {*param_out}, {{"use_nesterov", use_nesterov}}); + const auto& runner = + NpuOpRunner("ApplyMomentum", + {*param_out, *velocity_out, *learning_rate, + regularized_grad, mu_tensor}, + {*param_out}, {{"use_nesterov", use_nesterov}}); runner.Run(dev_ctx.stream()); } } diff --git a/paddle/fluid/operators/optimizers/momentum_op.cc b/paddle/fluid/operators/optimizers/momentum_op.cc index 50d2c946f3a..94fb4c156ef 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.cc +++ b/paddle/fluid/operators/optimizers/momentum_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/optimizers/momentum_op.h" + #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { @@ -109,28 +110,26 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, ops::MomentumOpInferVarType); -REGISTER_OP_VERSION(momentum) - .AddCheckpoint( - R"ROC( +REGISTER_OP_VERSION(momentum).AddCheckpoint( + R"ROC( Upgrade momentum add 4 attributes [regularization_method, regularization_coeff, multi_precision, rescale_grad]. )ROC", - paddle::framework::compatible::OpVersionDesc() - .NewInput("MasterParam", "FP32 master weight for AMP.") - .NewOutput("MasterParamOut", - "The updated FP32 master weight for AMP. " - "It shared memory with Input(MasterParam).") - .NewAttr("regularization_method", - "(string) regularization_method, right now only support " - "l2decay or none", - std::string("")) - .NewAttr("regularization_coeff", "(float) regularization_coeff", - 0.0f) - .NewAttr( - "multi_precision", - "(bool) Whether to use multi-precision during weight updating.", - false) - .NewAttr("rescale_grad", - "(float) Multiply the gradient with `rescale_grad`" - "before updating. Often choose to be `1.0/batch_size`.", - 1.0f)); + paddle::framework::compatible::OpVersionDesc() + .NewInput("MasterParam", "FP32 master weight for AMP.") + .NewOutput("MasterParamOut", + "The updated FP32 master weight for AMP. 
" + "It shared memory with Input(MasterParam).") + .NewAttr("regularization_method", + "(string) regularization_method, right now only support " + "l2decay or none", + std::string("")) + .NewAttr("regularization_coeff", "(float) regularization_coeff", 0.0f) + .NewAttr( + "multi_precision", + "(bool) Whether to use multi-precision during weight updating.", + false) + .NewAttr("rescale_grad", + "(float) Multiply the gradient with `rescale_grad`" + "before updating. Often choose to be `1.0/batch_size`.", + 1.0f)); diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h index 017f33d7458..2f6a9758a2c 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.h +++ b/paddle/fluid/operators/optimizers/momentum_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" diff --git a/paddle/fluid/operators/optimizers/momentum_op_mlu.cc b/paddle/fluid/operators/optimizers/momentum_op_mlu.cc index b8fa81b2e71..417f89410cf 100644 --- a/paddle/fluid/operators/optimizers/momentum_op_mlu.cc +++ b/paddle/fluid/operators/optimizers/momentum_op_mlu.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/optimizers/momentum_op.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/operators/optimizers/momentum_op.h" #include "paddle/phi/kernels/impl/momentum_kernel_impl.h" namespace paddle { @@ -77,8 +77,9 @@ class MLUMomentumOpKernel : public framework::OpKernel { GetBasePtr(learning_rate), GetBasePtr(&mu_tensor), GetBasePtr(param_out), GetBasePtr(velocity_out)); } else if (grad_var->IsType()) { - PADDLE_ENFORCE_EQ(false, true, platform::errors::PermissionDenied( - "Unsupport SparseMomentum")); + PADDLE_ENFORCE_EQ( + false, true, + platform::errors::PermissionDenied("Unsupport SparseMomentum")); } else { PADDLE_ENFORCE_EQ(false, true, platform::errors::PermissionDenied( diff --git a/paddle/fluid/operators/optimizers/momentum_op_npu.cc b/paddle/fluid/operators/optimizers/momentum_op_npu.cc index 2d73766b973..d3ffeb18be7 100644 --- a/paddle/fluid/operators/optimizers/momentum_op_npu.cc +++ b/paddle/fluid/operators/optimizers/momentum_op_npu.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/optimizers/momentum_op.h" - #include "paddle/fluid/operators/optimizers/sgd_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/phi/kernels/impl/momentum_kernel_impl.h" @@ -70,14 +69,16 @@ class NPUMomentumOpKernel : public framework::OpKernel { framework::TensorCopy(*param, ctx.GetPlace(), dev_ctx, param_out); framework::TensorCopy(*velocity, ctx.GetPlace(), dev_ctx, velocity_out); // NOTE: ApplyMomentum will change the input - const auto& runner = NpuOpRunner( - "ApplyMomentum", {*param_out, *velocity_out, *learning_rate, - regularized_grad, mu_tensor}, - {*param_out}, {{"use_nesterov", use_nesterov}}); + const auto& runner = + NpuOpRunner("ApplyMomentum", + {*param_out, *velocity_out, *learning_rate, + regularized_grad, mu_tensor}, + {*param_out}, {{"use_nesterov", use_nesterov}}); runner.Run(dev_ctx.stream()); } else if (grad_var->IsType()) { - PADDLE_ENFORCE_EQ(false, true, platform::errors::PermissionDenied( - "Unsupport SparseMomentum")); + PADDLE_ENFORCE_EQ( + false, true, + platform::errors::PermissionDenied("Unsupport SparseMomentum")); } else { PADDLE_ENFORCE_EQ(false, true, platform::errors::PermissionDenied( diff --git a/paddle/fluid/operators/optimizers/momentum_op_xpu.cc b/paddle/fluid/operators/optimizers/momentum_op_xpu.cc index 6897213c91a..749d38f315e 100644 --- a/paddle/fluid/operators/optimizers/momentum_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/momentum_op_xpu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU #include + #include "paddle/fluid/operators/optimizers/sgd_op.h" #include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { diff --git a/paddle/fluid/operators/optimizers/multi_tensor_apply.h b/paddle/fluid/operators/optimizers/multi_tensor_apply.h index 179e8f45254..98850aa816b 100644 --- a/paddle/fluid/operators/optimizers/multi_tensor_apply.h +++ b/paddle/fluid/operators/optimizers/multi_tensor_apply.h @@ -15,6 +15,7 @@ #pragma once #include + #include "math.h" // NOLINT namespace paddle { @@ -108,11 +109,11 @@ class MultiTensorLauncher { stream_(stream) {} template - void Launch(Functor &&functor, Args &&... args) const { - MultiTensorApplyCUDAKernel< - Functor, MaxTensorNumPerLaunch, - MaxChunkNumPerLaunch><<>>( - functor, meta_, chunk_size_, args...); + void Launch(Functor &&functor, Args &&...args) const { + MultiTensorApplyCUDAKernel + <<>>(functor, meta_, chunk_size_, + args...); } private: @@ -189,7 +190,7 @@ template static void MultiTensorApply(Functor functor, gpuStream_t stream, const int *offsets, int n, int chunk_size, - int block_dim, Args &&... args) { + int block_dim, Args &&...args) { auto callback = [&](const MultiTensorLauncher &launcher, int i) { launcher.Launch(functor, args...); }; diff --git a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc index 6893e5d6b9b..5eeeb735307 100644 --- a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc +++ b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h" + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/float16.h" diff --git a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h index 74cf7627450..353d8777a84 100644 --- a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h +++ b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h @@ -47,9 +47,8 @@ struct Pow2DecayWithLinearWarmupFunctor { auto new_lr = static_cast(step) / warmup_steps_ * base_lr_; *lr_ = static_cast(new_lr); } else if (step < total_steps_) { - auto factor = 1 - - static_cast(step - warmup_steps_) / - (total_steps_ - warmup_steps_); + auto factor = 1 - static_cast(step - warmup_steps_) / + (total_steps_ - warmup_steps_); auto new_lr = static_cast(base_lr_ - end_lr_) * (factor * factor) + end_lr_; *lr_ = static_cast(new_lr); @@ -76,9 +75,10 @@ class Pow2DecayWithLinearWarmupOpKernel : public framework::OpKernel { auto *lr_out = ctx.Output("LearningRateOut"); auto *step_out = ctx.Output("StepOut"); PADDLE_ENFORCE_EQ( - lr, lr_out, platform::errors::InvalidArgument("Input(LearningRate) and " - "Output(LearningRateOut) " - "must be the same.")); + lr, lr_out, + platform::errors::InvalidArgument("Input(LearningRate) and " + "Output(LearningRateOut) " + "must be the same.")); PADDLE_ENFORCE_NOT_NULL(lr, platform::errors::InvalidArgument( "Input(LearingRate) should not be nullptr.")); diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.cc b/paddle/fluid/operators/optimizers/rmsprop_op.cc index b3458724482..874e21cc6cc 100644 --- a/paddle/fluid/operators/optimizers/rmsprop_op.cc +++ b/paddle/fluid/operators/optimizers/rmsprop_op.cc @@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/op_registry.h" - #include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/multiary.h" diff --git a/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc b/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc index b53d51686cf..7f4810ea420 100644 --- a/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc @@ -15,7 +15,9 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include + #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/device_wrapper.h" diff --git a/paddle/fluid/operators/optimizers/sgd_op.cc b/paddle/fluid/operators/optimizers/sgd_op.cc index a2af131cb50..b5822fd5c44 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cc +++ b/paddle/fluid/operators/optimizers/sgd_op.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include - #include "paddle/fluid/operators/optimizers/sgd_op.h" + +#include #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -76,10 +76,11 @@ class SGDOpInferVarType : public framework::VarTypeInference { auto in_var_type = ctx->GetInputType("Param"); PADDLE_ENFORCE_EQ(in_var_type == framework::proto::VarType::SELECTED_ROWS || in_var_type == framework::proto::VarType::LOD_TENSOR, - true, platform::errors::InvalidArgument( - "The input Var's type should be LoDtensor or " - "SelectedRows, but the received type is %s", - in_var_type)); + true, + platform::errors::InvalidArgument( + "The input Var's type should be LoDtensor or " + "SelectedRows, but the received type is %s", + in_var_type)); ctx->SetOutputType("ParamOut", in_var_type, framework::ALL_ELEMENTS); } diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu index 222244a2fd1..ba2e84a6a78 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cu +++ b/paddle/fluid/operators/optimizers/sgd_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/optimizers/sgd_op.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" @@ -106,11 +107,11 @@ class SGDOpKernel int block = 512; int grid = (param->numel() + block - 1) / block; - SGDKernelMT< - T, MPDType><<>>( - param->data(), grad->data(), learning_rate->data(), - param->numel(), param_out->mutable_data(ctx.GetPlace()), - master_in_data, master_out_data); + SGDKernelMT + <<>>( + param->data(), grad->data(), learning_rate->data(), + param->numel(), param_out->mutable_data(ctx.GetPlace()), + master_in_data, master_out_data); } else if (grad_var->IsType()) { // TODO(qijun): In Sparse SGD operator, in-place update is enforced. diff --git a/paddle/fluid/operators/optimizers/sgd_op_xpu.cc b/paddle/fluid/operators/optimizers/sgd_op_xpu.cc index e7c03be95ca..7203357db10 100644 --- a/paddle/fluid/operators/optimizers/sgd_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/sgd_op_xpu.cc @@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/optimizers/sgd_op.h" #include + +#include "paddle/fluid/operators/optimizers/sgd_op.h" #include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { diff --git a/paddle/fluid/operators/optimizers/sparse_momentum_op.cc b/paddle/fluid/operators/optimizers/sparse_momentum_op.cc index c38545df173..0c4fa916f43 100644 --- a/paddle/fluid/operators/optimizers/sparse_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/sparse_momentum_op.cc @@ -13,6 +13,7 @@ // limitations under the License. 
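Note: the sgd, lamb, momentum, and range hunks keep swapping include order the same way: the file's own header first, then a blank line, then system headers, then the remaining project headers. A sketch of the grouping the patch converges on; the paths are quoted from the hunks, the system headers are illustrative:

#include "paddle/fluid/operators/optimizers/sgd_op.h"  // primary header first

#include <string>  // system headers next, in their own block
#include <vector>

#include "paddle/fluid/framework/op_registry.h"  // then other project headers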
#include "paddle/fluid/operators/optimizers/sparse_momentum_op.h" + #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/optimizers/sparse_momentum_op.h b/paddle/fluid/operators/optimizers/sparse_momentum_op.h index 08b2d3764fe..296a3d5b889 100644 --- a/paddle/fluid/operators/optimizers/sparse_momentum_op.h +++ b/paddle/fluid/operators/optimizers/sparse_momentum_op.h @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/p_norm_op.cc b/paddle/fluid/operators/p_norm_op.cc index c7c8ebf562b..21254521fa9 100644 --- a/paddle/fluid/operators/p_norm_op.cc +++ b/paddle/fluid/operators/p_norm_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -124,11 +125,10 @@ REGISTER_OPERATOR(p_norm, ops::PnormOp, ops::PnormOpMaker, PNormInferShapeFunctor); REGISTER_OPERATOR(p_norm_grad, ops::PnormOpGrad, PNormGradInferShapeFunctor); -REGISTER_OP_VERSION(p_norm) - .AddCheckpoint( - R"ROC( +REGISTER_OP_VERSION(p_norm).AddCheckpoint( + R"ROC( Upgrade p_norm, add 1 attribute [asvector]. )ROC", - paddle::framework::compatible::OpVersionDesc().NewAttr( - "asvector", - "Compute as vector when axis is None and input is matrix", false)); + paddle::framework::compatible::OpVersionDesc().NewAttr( + "asvector", "Compute as vector when axis is None and input is matrix", + false)); diff --git a/paddle/fluid/operators/pad2d_op.cc b/paddle/fluid/operators/pad2d_op.cc index 38fa3316a6e..6d27433512e 100644 --- a/paddle/fluid/operators/pad2d_op.cc +++ b/paddle/fluid/operators/pad2d_op.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/pad2d_op.cu b/paddle/fluid/operators/pad2d_op.cu index 80931fea90f..b7f9977f3ed 100644 --- a/paddle/fluid/operators/pad2d_op.cu +++ b/paddle/fluid/operators/pad2d_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" diff --git a/paddle/fluid/operators/pad3d_op.cc b/paddle/fluid/operators/pad3d_op.cc index e4952a24326..b7a638d7ce9 100644 --- a/paddle/fluid/operators/pad3d_op.cc +++ b/paddle/fluid/operators/pad3d_op.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/infermeta/unary.h" diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc index 087b8ecba6e..61a2120e1e4 100644 --- a/paddle/fluid/operators/pad_constant_like_op.cc +++ b/paddle/fluid/operators/pad_constant_like_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/pad_constant_like_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/pad_constant_like_op.h b/paddle/fluid/operators/pad_constant_like_op.h index 0aedd800e1a..cc7c39d12cd 100644 --- a/paddle/fluid/operators/pad_constant_like_op.h +++ b/paddle/fluid/operators/pad_constant_like_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" diff --git a/paddle/fluid/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc index dc162ae5782..eaf343dde0f 100644 --- a/paddle/fluid/operators/pad_op.cc +++ b/paddle/fluid/operators/pad_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/complex.h" diff --git a/paddle/fluid/operators/partial_concat_op.cc b/paddle/fluid/operators/partial_concat_op.cc index fedadc7581e..e0e6ec31e41 100644 --- a/paddle/fluid/operators/partial_concat_op.cc +++ b/paddle/fluid/operators/partial_concat_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/partial_concat_op.h" + #include #include #include @@ -93,8 +94,9 @@ class PartialConcatOp : public framework::OperatorWithKernel { break; } } - PADDLE_ENFORCE_EQ(flag, 1, platform::errors::InvalidArgument( - "All Inputs of PartialSum OP are Empty!")); + PADDLE_ENFORCE_EQ(flag, 1, + platform::errors::InvalidArgument( + "All Inputs of PartialSum OP are Empty!")); return framework::OpKernelType(input_data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/partial_concat_op.cu b/paddle/fluid/operators/partial_concat_op.cu index 322e84ae8b9..d36a7303715 100644 --- a/paddle/fluid/operators/partial_concat_op.cu +++ b/paddle/fluid/operators/partial_concat_op.cu @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/partial_concat_op.h" diff --git a/paddle/fluid/operators/partial_concat_op.h b/paddle/fluid/operators/partial_concat_op.h index 20a6639e233..b12cb0a0293 100644 --- a/paddle/fluid/operators/partial_concat_op.h +++ b/paddle/fluid/operators/partial_concat_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" diff --git a/paddle/fluid/operators/partial_sum_op.cc b/paddle/fluid/operators/partial_sum_op.cc index 72630998d43..a3ce78054ac 100644 --- a/paddle/fluid/operators/partial_sum_op.cc +++ b/paddle/fluid/operators/partial_sum_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/partial_sum_op.h" + #include #include #include @@ -96,8 +97,9 @@ class PartialSumOp : public framework::OperatorWithKernel { } } - PADDLE_ENFORCE_EQ(flag, 1, platform::errors::InvalidArgument( - "All Inputs of PartialSum OP are Empty!")); + PADDLE_ENFORCE_EQ(flag, 1, + platform::errors::InvalidArgument( + "All Inputs of PartialSum OP are Empty!")); return framework::OpKernelType(input_data_type, platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/partial_sum_op.cu b/paddle/fluid/operators/partial_sum_op.cu index 63d140d6769..b363483fe69 100644 --- a/paddle/fluid/operators/partial_sum_op.cu +++ b/paddle/fluid/operators/partial_sum_op.cu @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/partial_sum_op.h" diff --git a/paddle/fluid/operators/partial_sum_op.h b/paddle/fluid/operators/partial_sum_op.h index d9c6fd758f4..21c16ed2f62 100644 --- a/paddle/fluid/operators/partial_sum_op.h +++ b/paddle/fluid/operators/partial_sum_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/pixel_shuffle_op.cc b/paddle/fluid/operators/pixel_shuffle_op.cc index 1724aedbe9b..026a1749c39 100644 --- a/paddle/fluid/operators/pixel_shuffle_op.cc +++ b/paddle/fluid/operators/pixel_shuffle_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/operators/poisson_op.cc b/paddle/fluid/operators/poisson_op.cc index d5896c41059..b964d8fe116 100644 --- a/paddle/fluid/operators/poisson_op.cc +++ b/paddle/fluid/operators/poisson_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 44f3d8090e5..30ead84d1a9 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -15,13 +15,13 @@ limitations under the License. */ #include "paddle/fluid/operators/pool_op.h" #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/backward.h" #include "paddle/phi/infermeta/unary.h" - -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif diff --git a/paddle/fluid/operators/pool_op_xpu.cc b/paddle/fluid/operators/pool_op_xpu.cc index f178a966e1e..d2ec4089f9d 100644 --- a/paddle/fluid/operators/pool_op_xpu.cc +++ b/paddle/fluid/operators/pool_op_xpu.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" @@ -112,11 +113,12 @@ class PoolGradXPUKernel : public framework::OpKernel { bool exclusive = context.Attr("exclusive"); bool adaptive = context.Attr("adaptive"); const int* index_data = nullptr; - PADDLE_ENFORCE_EQ(ksize.size(), 2, platform::errors::InvalidArgument( - "The Pool2d XPU OP only support 2 " - "dimension pooling!, but received " - "%d-dimension pool kernel size", - ksize.size())); + PADDLE_ENFORCE_EQ( + ksize.size(), 2, + platform::errors::InvalidArgument("The Pool2d XPU OP only support 2 " + "dimension pooling!, but received " + "%d-dimension pool kernel size", + ksize.size())); PADDLE_ENFORCE_EQ(!adaptive || (ksize[0] * ksize[1] == 1), true, platform::errors::InvalidArgument( "The Pool2d XPU OP does not support (adaptive == " diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc index e0341f4a4b4..8619cc28d50 100644 --- a/paddle/fluid/operators/pool_with_index_op.cc +++ b/paddle/fluid/operators/pool_with_index_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/positive_negative_pair_op.cc b/paddle/fluid/operators/positive_negative_pair_op.cc index cbe58644f53..02273b7943a 100644 --- a/paddle/fluid/operators/positive_negative_pair_op.cc +++ b/paddle/fluid/operators/positive_negative_pair_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/positive_negative_pair_op.h" + #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -41,11 +42,12 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel { ctx->HasInput("AccumulatePositivePair") && ctx->HasInput("AccumulateNegativePair") && ctx->HasInput("AccumulateNeutralPair"), - true, platform::errors::InvalidArgument( - "All optional inputs(AccumulatePositivePair, " - "AccumulateNegativePair, AccumulateNeutralPair) of " - "PositiveNegativePairOp are required if one of them " - "is specified.")); + true, + platform::errors::InvalidArgument( + "All optional inputs(AccumulatePositivePair, " + "AccumulateNegativePair, AccumulateNeutralPair) of " + "PositiveNegativePairOp are required if one of them " + "is specified.")); PADDLE_ENFORCE_EQ( ctx->GetInputDim("AccumulatePositivePair"), scalar_dim, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/positive_negative_pair_op.h b/paddle/fluid/operators/positive_negative_pair_op.h index a47deb18b6f..972258350bf 100644 --- a/paddle/fluid/operators/positive_negative_pair_op.h +++ b/paddle/fluid/operators/positive_negative_pair_op.h @@ -12,6 +12,7 @@ limitations under the License. */ #pragma once #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index de35f674058..50dc9d6429a 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -11,6 +11,7 @@ limitations under the License. 
*/ #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/operators/prim_ops/prim_op_test.cc b/paddle/fluid/operators/prim_ops/prim_op_test.cc index 2d65149d130..e5b84d00f1f 100644 --- a/paddle/fluid/operators/prim_ops/prim_op_test.cc +++ b/paddle/fluid/operators/prim_ops/prim_op_test.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "gtest/gtest.h" - #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/program_desc.h" diff --git a/paddle/fluid/operators/print_op.cc b/paddle/fluid/operators/print_op.cc index 4dd4114d378..16d6185e87e 100644 --- a/paddle/fluid/operators/print_op.cc +++ b/paddle/fluid/operators/print_op.cc @@ -178,10 +178,8 @@ REGISTER_OPERATOR(print, ops::PrintOp, ops::PrintOpProtoAndCheckMaker, ops::PrintOpGradientMaker, ops::PrintOpInferShape, ops::PrintOpVarTypeInference); -REGISTER_OP_VERSION(print) - .AddCheckpoint( - R"ROC(Upgrade print add a new attribute [print_tensor_layout] to " +REGISTER_OP_VERSION(print).AddCheckpoint( + R"ROC(Upgrade print add a new attribute [print_tensor_layout] to " "contorl whether to print tensor's layout.)ROC", - paddle::framework::compatible::OpVersionDesc().NewAttr( - "print_tensor_layout", "Whether to print the tensor's layout.", - true)); + paddle::framework::compatible::OpVersionDesc().NewAttr( + "print_tensor_layout", "Whether to print the tensor's layout.", true)); diff --git a/paddle/fluid/operators/prroi_pool_op.cc b/paddle/fluid/operators/prroi_pool_op.cc index f03a392bfc7..51bd079849a 100644 --- a/paddle/fluid/operators/prroi_pool_op.cc +++ b/paddle/fluid/operators/prroi_pool_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/prroi_pool_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/prroi_pool_op.h b/paddle/fluid/operators/prroi_pool_op.h index 0fdccc729ad..8431d945749 100644 --- a/paddle/fluid/operators/prroi_pool_op.h +++ b/paddle/fluid/operators/prroi_pool_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" #if defined(__NVCC__) || defined(__HIPCC__) diff --git a/paddle/fluid/operators/prune_gate_by_capacity_op.cu b/paddle/fluid/operators/prune_gate_by_capacity_op.cu index 6a2ed6592e7..2e729f94dc8 100644 --- a/paddle/fluid/operators/prune_gate_by_capacity_op.cu +++ b/paddle/fluid/operators/prune_gate_by_capacity_op.cu @@ -79,9 +79,10 @@ class PruneGateByCapacityFunctor { int blocks = NumBlocks(batch_size); int threads = kNumCUDAThreads; - prune_gate_by_capacity_kernel<<>>( - gate_idx_data, new_gate_idx_data_, expert_count_out_data, batch_size); + prune_gate_by_capacity_kernel + <<>>( + gate_idx_data, new_gate_idx_data_, expert_count_out_data, + batch_size); } private: diff --git a/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc b/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc index f101e509d93..d09b1c7aa06 100644 --- a/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc +++ b/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc @@ -9,11 +9,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/operators/pscore/distributed_lookup_table_op.h" + #include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/pscore/distributed_lookup_table_op.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { diff --git a/paddle/fluid/operators/pscore/distributed_lookup_table_op.h b/paddle/fluid/operators/pscore/distributed_lookup_table_op.h index c2717c19b2d..c9390aa42a6 100644 --- a/paddle/fluid/operators/pscore/distributed_lookup_table_op.h +++ b/paddle/fluid/operators/pscore/distributed_lookup_table_op.h @@ -13,6 +13,7 @@ #include #include #include + #include "paddle/fluid/distributed/ps/wrapper/fleet.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc b/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc index 9868a625792..701b6250445 100644 --- a/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc +++ b/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc @@ -9,11 +9,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/pscore/distributed_push_sparse_op.h" + #include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/pscore/distributed_push_sparse_op.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { diff --git a/paddle/fluid/operators/pscore/distributed_push_sparse_op.h b/paddle/fluid/operators/pscore/distributed_push_sparse_op.h index 6d3faae6a2d..7c361dfd1a7 100644 --- a/paddle/fluid/operators/pscore/distributed_push_sparse_op.h +++ b/paddle/fluid/operators/pscore/distributed_push_sparse_op.h @@ -13,6 +13,7 @@ #include #include #include + #include "paddle/fluid/distributed/ps/wrapper/fleet.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc index 457e37744d3..5d77851b72a 100644 --- a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc +++ b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/pscore/heter_listen_and_serv_op.h" + #include "paddle/fluid/framework/op_registry.h" PADDLE_DEFINE_EXPORTED_int32(rpc_send_thread_num, 12, @@ -92,8 +93,9 @@ void HeterListenAndServOp::RunAsyncLoop(framework::ProgramDesc *program) const { auto blkid = block_list[i]; auto it = message_to_block_id.find_value(blkid); heter_server_->RegisterServiceHandler( - it->first, [&](const MultiVarMsg *request, MultiVarMsg *response, - brpc::Controller *cntl) -> int { + it->first, + [&](const MultiVarMsg *request, MultiVarMsg *response, + brpc::Controller *cntl) -> int { return send_and_recv_variable_handler_->Handle(request, response, cntl); }); diff --git a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.h b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.h old mode 100755 new mode 100644 index 3ecff083b00..29cc041d682 --- a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.h +++ b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include #include #include diff --git a/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc b/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc index ab2fcba5106..da57660a74d 100644 --- a/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc +++ b/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc @@ -14,11 +14,11 @@ limitations under the License. */ #include #include -#include -#include // NOLINT #include #include +#include +#include // NOLINT #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps/service/heter_client.h" diff --git a/paddle/fluid/operators/pscore/heter_server_test.cc b/paddle/fluid/operators/pscore/heter_server_test.cc index d4ee00d10a5..db647dfaf23 100644 --- a/paddle/fluid/operators/pscore/heter_server_test.cc +++ b/paddle/fluid/operators/pscore/heter_server_test.cc @@ -12,17 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/distributed/ps/service/heter_server.h" + #include -#include -#include -#include // NOLINT +#include #include #include +#include +#include // NOLINT #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps/service/heter_client.h" -#include "paddle/fluid/distributed/ps/service/heter_server.h" #include "paddle/fluid/framework/op_registry.h" namespace framework = paddle::framework; @@ -181,13 +182,15 @@ void StartSendAndRecvServer(std::string endpoint) { heter_server_ptr_->SetEndPoint(endpoint); LOG(INFO) << "before HeterServer::RegisterServiceHandler"; heter_server_ptr_->RegisterServiceHandler( - in_var_name, [&](const MultiVarMsg* request, MultiVarMsg* response, - brpc::Controller* cntl) -> int { + in_var_name, + [&](const MultiVarMsg* request, MultiVarMsg* response, + brpc::Controller* cntl) -> int { return b_req_handler->Handle(request, response, cntl); }); heter_server_ptr_->RegisterServiceHandler( - in_var_name2, [&](const MultiVarMsg* request, MultiVarMsg* response, - brpc::Controller* cntl) -> int { + in_var_name2, + [&](const MultiVarMsg* request, MultiVarMsg* response, + brpc::Controller* cntl) -> int { return b_req_handler->Handle(request, response, cntl); }); diff --git a/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc b/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc old mode 100755 new mode 100644 index 7c25d38d1eb..a21d11ee1b1 --- a/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc @@ -14,12 +14,13 @@ limitations under the License. */ #if defined PADDLE_WITH_PSCORE #include + #include +#include +#include #include #include // NOLINT -#include -#include #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps/service/heter_client.h" #include "paddle/fluid/distributed/ps/service/heter_server.h" @@ -158,8 +159,9 @@ void StartSendAndRecvServer(std::string endpoint) { b_rpc_service->SetEndPoint(endpoint); LOG(INFO) << "before HeterServer::RegisterServiceHandler"; b_rpc_service->RegisterServiceHandler( - in_var_name, [&](const MultiVarMsg* request, MultiVarMsg* response, - brpc::Controller* cntl) -> int { + in_var_name, + [&](const MultiVarMsg* request, MultiVarMsg* response, + brpc::Controller* cntl) -> int { return b_req_handler->Handle(request, response, cntl); }); diff --git a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc index 4054846460b..c8e24c77734 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc @@ -15,12 +15,12 @@ limitations under the License. 
*/ #if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSCORE) #include -#include -#include -#include // NOLINT +#include #include #include +#include +#include // NOLINT #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps/service/heter_client.h" @@ -178,8 +178,9 @@ void StartSendAndRecvServer(std::string endpoint) { b_rpc_service2->SetEndPoint(endpoint); LOG(INFO) << "before HeterServer::RegisterServiceHandler"; b_rpc_service2->RegisterServiceHandler( - in_var_name, [&](const MultiVarMsg* request, MultiVarMsg* response, - brpc::Controller* cntl) -> int { + in_var_name, + [&](const MultiVarMsg* request, MultiVarMsg* response, + brpc::Controller* cntl) -> int { return b_req_handler->Handle(request, response, cntl); }); diff --git a/paddle/fluid/operators/pull_box_extended_sparse_op.h b/paddle/fluid/operators/pull_box_extended_sparse_op.h index 559c7eed84e..f803b57b187 100644 --- a/paddle/fluid/operators/pull_box_extended_sparse_op.h +++ b/paddle/fluid/operators/pull_box_extended_sparse_op.h @@ -15,6 +15,7 @@ #pragma once #include #include + #include "paddle/fluid/framework/fleet/box_wrapper.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" diff --git a/paddle/fluid/operators/pull_gpups_sparse_op.h b/paddle/fluid/operators/pull_gpups_sparse_op.h index abfdb62ec34..58e11725521 100644 --- a/paddle/fluid/operators/pull_gpups_sparse_op.h +++ b/paddle/fluid/operators/pull_gpups_sparse_op.h @@ -15,6 +15,7 @@ #pragma once #include #include + #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" diff --git a/paddle/fluid/operators/pull_sparse_op.cc b/paddle/fluid/operators/pull_sparse_op.cc index fb83746de19..57d361b7a77 100644 --- a/paddle/fluid/operators/pull_sparse_op.cc +++ b/paddle/fluid/operators/pull_sparse_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/pull_sparse_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/pull_sparse_op.h b/paddle/fluid/operators/pull_sparse_op.h index 2498adc141c..e3f0f88ce55 100644 --- a/paddle/fluid/operators/pull_sparse_op.h +++ b/paddle/fluid/operators/pull_sparse_op.h @@ -16,6 +16,7 @@ #include #include #include + #include "paddle/fluid/framework/fleet/fleet_wrapper.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" diff --git a/paddle/fluid/operators/pull_sparse_v2_op.cc b/paddle/fluid/operators/pull_sparse_v2_op.cc index f5f2e728e38..a8fc84b9c2b 100644 --- a/paddle/fluid/operators/pull_sparse_v2_op.cc +++ b/paddle/fluid/operators/pull_sparse_v2_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/pull_sparse_v2_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/pull_sparse_v2_op.h b/paddle/fluid/operators/pull_sparse_v2_op.h index 29337cc2d94..c24d0a4f338 100644 --- a/paddle/fluid/operators/pull_sparse_v2_op.h +++ b/paddle/fluid/operators/pull_sparse_v2_op.h @@ -16,6 +16,7 @@ #include #include #include + #include "paddle/fluid/framework/fleet/fleet_wrapper.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" diff --git a/paddle/fluid/operators/push_dense_op.cc b/paddle/fluid/operators/push_dense_op.cc index 5b9f05bd126..5284a1a61e5 100644 --- a/paddle/fluid/operators/push_dense_op.cc +++ b/paddle/fluid/operators/push_dense_op.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/operators/push_dense_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/push_dense_op.h b/paddle/fluid/operators/push_dense_op.h index 592ef5ff72a..c8f98a1ea9e 100644 --- a/paddle/fluid/operators/push_dense_op.h +++ b/paddle/fluid/operators/push_dense_op.h @@ -16,6 +16,7 @@ #include #include #include + #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/fleet/fleet_wrapper.h" #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" diff --git a/paddle/fluid/operators/py_func_op.cc b/paddle/fluid/operators/py_func_op.cc index f676348bc0a..de46357e497 100644 --- a/paddle/fluid/operators/py_func_op.cc +++ b/paddle/fluid/operators/py_func_op.cc @@ -20,6 +20,7 @@ #include #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/py_layer_op.cc b/paddle/fluid/operators/py_layer_op.cc index 14c9e8b0c26..db8f315366a 100644 --- a/paddle/fluid/operators/py_layer_op.cc +++ b/paddle/fluid/operators/py_layer_op.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include - #include "paddle/fluid/operators/py_layer_op.h" +#include + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/py_layer_op.h b/paddle/fluid/operators/py_layer_op.h index 6625a4a1a75..ea048ee9e59 100644 --- a/paddle/fluid/operators/py_layer_op.h +++ b/paddle/fluid/operators/py_layer_op.h @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/python_headers.h" diff --git a/paddle/fluid/operators/pyramid_hash_op.cc b/paddle/fluid/operators/pyramid_hash_op.cc index 4b0ade99154..6650037e4d2 100644 --- a/paddle/fluid/operators/pyramid_hash_op.cc +++ b/paddle/fluid/operators/pyramid_hash_op.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include #include + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/search_compute.h" @@ -216,9 +218,8 @@ class CPUPyramidHashOPKernel : public framework::OpKernel { bool should_use_term(math::bloomfilter* _filter, math::bloomfilter* _black_filter, const float* word_repr, int len) const { - return (!_filter || - 1 == math::bloomfilter_get(_filter, word_repr, - len * sizeof(float))) && + return (!_filter || 1 == math::bloomfilter_get(_filter, word_repr, + len * sizeof(float))) && (!_black_filter || 0 == math::bloomfilter_get(_black_filter, word_repr, len * sizeof(float))); diff --git a/paddle/fluid/operators/qr_op.cc b/paddle/fluid/operators/qr_op.cc index 02d5e5f03f0..55cab539c4d 100644 --- a/paddle/fluid/operators/qr_op.cc +++ b/paddle/fluid/operators/qr_op.cc @@ -13,10 +13,12 @@ // limitations under the License. #include "paddle/fluid/operators/qr_op.h" + #include #include #include #include + #include "paddle/phi/core/ddim.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" diff --git a/paddle/fluid/operators/qr_op.cu b/paddle/fluid/operators/qr_op.cu index a57a8d5cf8b..695b90e9319 100644 --- a/paddle/fluid/operators/qr_op.cu +++ b/paddle/fluid/operators/qr_op.cu @@ -16,8 +16,10 @@ limitations under the License. 
*/ // HIP not support cusolver #include + #include #include + #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/qr_op.h" #include "paddle/fluid/platform/dynload/cusolver.h" @@ -43,8 +45,9 @@ class QrGPUKernel : public framework::OpKernel { std::tie(compute_q, reduced_mode) = _parse_qr_mode(mode); auto numel = x.numel(); - PADDLE_ENFORCE_GT(numel, 0, platform::errors::PreconditionNotMet( - "The input of QR is empty.")); + PADDLE_ENFORCE_GT( + numel, 0, + platform::errors::PreconditionNotMet("The input of QR is empty.")); auto x_dims = x.dims(); int x_rank = x_dims.size(); int m = x_dims[x_rank - 2]; diff --git a/paddle/fluid/operators/qr_op.h b/paddle/fluid/operators/qr_op.h index 5ef02d89427..760b2efd21f 100644 --- a/paddle/fluid/operators/qr_op.h +++ b/paddle/fluid/operators/qr_op.h @@ -16,6 +16,7 @@ #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/svd_helper.h" @@ -89,11 +90,11 @@ class QrGradKernel : public framework::OpKernel { } // m >= n case - auto m_gt_n_case = []( - const framework::ExecutionContext& ctx, - math::DeviceIndependenceTensorOperations& dito, - const Tensor& dQ, const Tensor& dR, const Tensor& A, const Tensor& Q, - const Tensor& R) -> framework::Tensor { + auto m_gt_n_case = + [](const framework::ExecutionContext& ctx, + math::DeviceIndependenceTensorOperations& dito, + const Tensor& dQ, const Tensor& dR, const Tensor& A, const Tensor& Q, + const Tensor& R) -> framework::Tensor { // Hai-Jun Liao, Jin-Guo Liu, Lei Wang, Tao Xiang (2019). Differentiable // Programming Tensor Networks. // https://arxiv.org/abs/1903.09650 Section 3. QR factorization diff --git a/paddle/fluid/operators/quantize_linear_op.cc b/paddle/fluid/operators/quantize_linear_op.cc index 4039f0e9d07..edd2a06a500 100644 --- a/paddle/fluid/operators/quantize_linear_op.cc +++ b/paddle/fluid/operators/quantize_linear_op.cc @@ -10,9 +10,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/quantize_linear_op.h" + #include #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/transform.h" diff --git a/paddle/fluid/operators/quantize_linear_op.cu b/paddle/fluid/operators/quantize_linear_op.cu index 6c7e430f511..6e3e39562c7 100644 --- a/paddle/fluid/operators/quantize_linear_op.cu +++ b/paddle/fluid/operators/quantize_linear_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/fake_dequantize_op.cu.h" #include "paddle/fluid/operators/fake_quantize_op.cu.h" @@ -46,10 +47,10 @@ struct ChannelDequantizeFunctorV2 { quant_stride *= in_dims[i]; } - DequantizeOneScaleQuantAxisN< - T><<>>( - in_data, scale_factor, max_range, num, in_dims[quant_axis], - quant_stride, out_data); + DequantizeOneScaleQuantAxisN + <<>>( + in_data, scale_factor, max_range, num, in_dims[quant_axis], + quant_stride, out_data); } }; diff --git a/paddle/fluid/operators/quantize_linear_op.h b/paddle/fluid/operators/quantize_linear_op.h index e20b99e85f0..df1a93ba638 100644 --- a/paddle/fluid/operators/quantize_linear_op.h +++ b/paddle/fluid/operators/quantize_linear_op.h @@ -13,6 +13,7 @@ limitations under the License. 
*/ #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/memory/malloc.h" diff --git a/paddle/fluid/operators/quantize_op.cc b/paddle/fluid/operators/quantize_op.cc index 951951253c4..62ec77bc224 100644 --- a/paddle/fluid/operators/quantize_op.cc +++ b/paddle/fluid/operators/quantize_op.cc @@ -13,6 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/operators/quantize_op.h" + #include "paddle/fluid/framework/op_version_registry.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -57,13 +58,13 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(quantize, ops::QuantOp, ops::QuantOpMaker); REGISTER_OP_VERSION(quantize) - .AddCheckpoint( - R"ROC( Add a new attribute [bfloat16])ROC", - paddle::framework::compatible::OpVersionDesc().NewAttr( - "bfloat16", "If true, float32 input is converted to bfloat16", - false)) - .AddCheckpoint( - R"ROC( Add a new attribute [Shift])ROC", - paddle::framework::compatible::OpVersionDesc().NewAttr( - "Shift", "Quantize data to uint8 if provided non-zero value.", - 0.0f)); + .AddCheckpoint(R"ROC( Add a new attribute [bfloat16])ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "bfloat16", + "If true, float32 input is converted to bfloat16", + false)) + .AddCheckpoint(R"ROC( Add a new attribute [Shift])ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "Shift", + "Quantize data to uint8 if provided non-zero value.", + 0.0f)); diff --git a/paddle/fluid/operators/quantize_op.h b/paddle/fluid/operators/quantize_op.h index 091306e4637..dd1b3c42fb5 100644 --- a/paddle/fluid/operators/quantize_op.h +++ b/paddle/fluid/operators/quantize_op.h @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/queue_generator_op.cc b/paddle/fluid/operators/queue_generator_op.cc index e2174b9346e..3683fbd075d 100644 --- a/paddle/fluid/operators/queue_generator_op.cc +++ b/paddle/fluid/operators/queue_generator_op.cc @@ -43,9 +43,10 @@ class QueueGeneratorOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override { std::vector names = Attr>("names"); - PADDLE_ENFORCE_GT(names.size(), 0, platform::errors::InvalidArgument( - "The attribute 'names' for " - "Op(queue_generator) must be set.")); + PADDLE_ENFORCE_GT( + names.size(), 0, + platform::errors::InvalidArgument("The attribute 'names' for " + "Op(queue_generator) must be set.")); int capacity = Attr("capacity"); PADDLE_ENFORCE_GT(capacity, 0, diff --git a/paddle/fluid/operators/random_crop_op.h b/paddle/fluid/operators/random_crop_op.h index 2928c3b5027..cfda710bd77 100644 --- a/paddle/fluid/operators/random_crop_op.h +++ b/paddle/fluid/operators/random_crop_op.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/for_range.h" diff --git a/paddle/fluid/operators/random_routing_op.cu b/paddle/fluid/operators/random_routing_op.cu index fec65518a9d..471cfb40e61 100644 --- a/paddle/fluid/operators/random_routing_op.cu +++ b/paddle/fluid/operators/random_routing_op.cu @@ -71,9 +71,9 @@ class RandomRoutingOpCUDAKernel : public framework::OpKernel { auto topk_idx_data = topk_idx->data(); auto out_data = out->data(); - random_routing_kernel< - T><<>>( - out_data, num_idx, N, D, prob_data, topk_idx_data, topk_value_data); + random_routing_kernel + <<>>( + out_data, num_idx, N, D, prob_data, topk_idx_data, topk_value_data); } }; diff --git a/paddle/fluid/operators/randperm_op.cc b/paddle/fluid/operators/randperm_op.cc index 1b28ab3c133..aed1f2b0ed1 100644 --- a/paddle/fluid/operators/randperm_op.cc +++ b/paddle/fluid/operators/randperm_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -29,10 +30,11 @@ class RandpermOp : public framework::OperatorWithKernel { "The output(Out) of randperm op must not be null.")); int n = ctx->Attrs().Get("n"); PADDLE_ENFORCE_GT( - n, 0, platform::errors::InvalidArgument( - "The input 'n' of randperm op should be greater than 0. " - "But received %d.", - n)); + n, 0, + platform::errors::InvalidArgument( + "The input 'n' of randperm op should be greater than 0. " + "But received %d.", + n)); ctx->SetOutputDim("Out", phi::make_ddim({n})); } diff --git a/paddle/fluid/operators/randperm_op_npu.cc b/paddle/fluid/operators/randperm_op_npu.cc index a16c0d905a5..c9f61211016 100644 --- a/paddle/fluid/operators/randperm_op_npu.cc +++ b/paddle/fluid/operators/randperm_op_npu.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/randperm_op.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/randperm_op.h" template using kernel = diff --git a/paddle/fluid/operators/range_op.cc b/paddle/fluid/operators/range_op.cc index 80fdb2ce6c3..215f8369818 100644 --- a/paddle/fluid/operators/range_op.cc +++ b/paddle/fluid/operators/range_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/range_op.h" + #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/range_op.h b/paddle/fluid/operators/range_op.h index 8924b23ce5c..e2fd16dd629 100644 --- a/paddle/fluid/operators/range_op.h +++ b/paddle/fluid/operators/range_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -22,13 +23,15 @@ namespace operators { template void GetSize(T start, T end, T step, int64_t* size) { - PADDLE_ENFORCE_NE(step, 0, platform::errors::InvalidArgument( - "The step of range op should not be 0.")); + PADDLE_ENFORCE_NE(step, 0, + platform::errors::InvalidArgument( + "The step of range op should not be 0.")); if (start < end) { PADDLE_ENFORCE_GT( - step, 0, platform::errors::InvalidArgument( - "The step should be greater than 0 while start < end.")); + step, 0, + platform::errors::InvalidArgument( + "The step should be greater than 0 while start < end.")); } if (start > end) { diff --git a/paddle/fluid/operators/range_op_xpu.cc b/paddle/fluid/operators/range_op_xpu.cc index 6672968de3a..bfc0d27f7ca 100644 --- a/paddle/fluid/operators/range_op_xpu.cc +++ b/paddle/fluid/operators/range_op_xpu.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/range_op.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/range_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/rank_attention_op.cc b/paddle/fluid/operators/rank_attention_op.cc index e5332da6475..89bdeb57b5f 100644 --- a/paddle/fluid/operators/rank_attention_op.cc +++ b/paddle/fluid/operators/rank_attention_op.cc @@ -10,9 +10,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/rank_attention_op.h" + #include #include #include + #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/rank_attention_op.cu b/paddle/fluid/operators/rank_attention_op.cu index 9b3a1e56371..61d723c27f7 100644 --- a/paddle/fluid/operators/rank_attention_op.cu +++ b/paddle/fluid/operators/rank_attention_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
 */
 #include
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/operators/rank_attention.cu.h"
 #include "paddle/fluid/operators/rank_attention_op.h"
diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h
index f126070a7eb..38c45ca2803 100644
--- a/paddle/fluid/operators/reader/blocking_queue.h
+++ b/paddle/fluid/operators/reader/blocking_queue.h
@@ -161,9 +161,10 @@ class BlockingQueue {

 private:
  inline void EnforceNotKilled() {
-    PADDLE_ENFORCE_NE(killed_, true, platform::errors::Fatal(
-                                         "Blocking queue is killed because the "
-                                         "data reader raises an exception."));
+    PADDLE_ENFORCE_NE(
+        killed_, true,
+        platform::errors::Fatal("Blocking queue is killed because the "
+                                "data reader raises an exception."));
  }

 private:
diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc
index db0f5758d2f..193f6c29724 100644
--- a/paddle/fluid/operators/reader/buffered_reader.cc
+++ b/paddle/fluid/operators/reader/buffered_reader.cc
@@ -13,6 +13,7 @@
 // limitations under the License.

 #include "paddle/fluid/operators/reader/buffered_reader.h"
+
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
 #include "paddle/fluid/platform/profiler.h"
diff --git a/paddle/fluid/operators/reader/create_ctr_reader_op.cc b/paddle/fluid/operators/reader/create_ctr_reader_op.cc
index 86fbddc0ec2..b83d0852841 100644
--- a/paddle/fluid/operators/reader/create_ctr_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_ctr_reader_op.cc
@@ -13,7 +13,6 @@
 // limitations under the License.

 #include "paddle/fluid/operators/reader/ctr_reader.h"
-
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
 #include "paddle/fluid/operators/reader/reader_op_registry.h"
diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc
index 8557ef950b3..8b2809b286c 100644
--- a/paddle/fluid/operators/recurrent_op.cc
+++ b/paddle/fluid/operators/recurrent_op.cc
@@ -322,9 +322,10 @@ void RecurrentGradOp::RunImpl(const framework::Scope &scope,
   framework::Executor executor(place);
   auto *block = Attr<framework::BlockDesc *>(kStepBlock);
   auto *program = block->Program();
-  auto ctx = executor.Prepare(
-      *program, block->ID(), Attr<std::vector<std::string>>(
-                                 kSkipEagerDeletionVars) /*skip_ref_cnt_vars*/);
+  auto ctx =
+      executor.Prepare(*program, block->ID(),
+                       Attr<std::vector<std::string>>(
+                           kSkipEagerDeletionVars) /*skip_ref_cnt_vars*/);

   for (size_t step_id = 0; step_id < seq_len; ++step_id) {
     size_t seq_offset = reverse ? step_id : seq_len - step_id - 1;
@@ -387,19 +388,19 @@ void RecurrentGradOp::RunImpl(const framework::Scope &scope,
       // outside::output[seq_offset: seq_offset + 1] = inside::output
       executor.CreateVariables(ctx->prog_, &cur_scope, ctx->block_id_);
       if (step_id > 0) {
-        LinkTensorWithCallback(scope, Outputs(kInputGrads), cur_scope,
-                               GradVarLists(Inputs(kInputs)),
-                               [&](const framework::LoDTensor &src_tensor,
-                                   framework::LoDTensor *dst_tensor) {
-                                 if (src_tensor.memory_size() ==
-                                     0) {  // Inside Gradient is not created.
-                                   return;
-                                 }
-                                 framework::Tensor src_slice =
-                                     src_tensor.Slice(seq_offset, seq_offset + 1);
-                                 dst_tensor->ShareDataWith(src_slice);
-                               },
-                               true /*is_backward*/);
+        LinkTensorWithCallback(
+            scope, Outputs(kInputGrads), cur_scope, GradVarLists(Inputs(kInputs)),
+            [&](const framework::LoDTensor &src_tensor,
+                framework::LoDTensor *dst_tensor) {
+              if (src_tensor.memory_size() ==
+                  0) {  // Inside Gradient is not created.
+ return; + } + framework::Tensor src_slice = + src_tensor.Slice(seq_offset, seq_offset + 1); + dst_tensor->ShareDataWith(src_slice); + }, + true /*is_backward*/); } VLOG(5) << "Recurrent memory linking finished "; @@ -604,7 +605,8 @@ if reverse is True | | | | v v v v o o o o -)DOC").SetDefault(false); +)DOC") + .SetDefault(false); AddAttr(RecurrentBase::kIsTrain, "").SetDefault(true); AddAttr>(RecurrentBase::kSkipEagerDeletionVars, "Vars that would skip eager deletion." @@ -663,14 +665,16 @@ class RecurrentGradOpShapeInference : public framework::InferShapeBase { ctx->Attrs() .Get>(RecurrentBase::kExStates) .size(), - 0, platform::errors::InvalidArgument("The Attr(%s) should be empty.", - RecurrentBase::kExStates)); + 0, + platform::errors::InvalidArgument("The Attr(%s) should be empty.", + RecurrentBase::kExStates)); PADDLE_ENFORCE_EQ( ctx->Attrs() .Get>(RecurrentBase::kStates) .size(), - 0, platform::errors::InvalidArgument("The Attr(%s) should be empty.", - RecurrentBase::kStates)); + 0, + platform::errors::InvalidArgument("The Attr(%s) should be empty.", + RecurrentBase::kStates)); } PADDLE_ENFORCE_EQ( @@ -702,9 +706,10 @@ class RecurrentGradOpShapeInference : public framework::InferShapeBase { if (ctx->HasInputs(RecurrentBase::kParameters)) { PADDLE_ENFORCE_EQ( ctx->HasOutputs(framework::GradVarName(RecurrentBase::kParameters)), - true, platform::errors::InvalidArgument( - "The output of(%s) should not be empty.", - framework::GradVarName(RecurrentBase::kParameters))); + true, + platform::errors::InvalidArgument( + "The output of(%s) should not be empty.", + framework::GradVarName(RecurrentBase::kParameters))); ctx->SetOutputsDim(framework::GradVarName(RecurrentBase::kParameters), ctx->GetInputsDim(RecurrentBase::kParameters)); } diff --git a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc index 83a21a919dc..063f7ca041a 100644 --- a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc +++ b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc @@ -13,6 +13,7 @@ // limitations under the License. 
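A large share of this patch is mechanical re-wrapping of PADDLE_ENFORCE_* call sites, as in the BlockingQueue and recurrent-op shape-inference hunks above: the compared operands now stay together on one line and the error factory drops below them. A minimal sketch of the call shape, assuming paddle/fluid/platform/enforce.h; the checked value and message are hypothetical:

    #include "paddle/fluid/platform/enforce.h"

    void CheckBatchSize(int batch_size) {
      // Operands first, error factory second; clang-format now breaks right
      // after the macro's opening parenthesis when the call is too long.
      PADDLE_ENFORCE_GT(
          batch_size, 0,
          paddle::platform::errors::InvalidArgument(
              "Expected batch_size > 0, but received %d.", batch_size));
    }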
#include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.h" diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.cc b/paddle/fluid/operators/reduce_ops/logsumexp_op.cc index 0602c73db6b..4128d51559c 100644 --- a/paddle/fluid/operators/reduce_ops/logsumexp_op.cc +++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.cc @@ -15,6 +15,7 @@ #include #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/operators/reduce_ops/reduce_op_function.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/reduce_ops/reduce_amax_op.cc b/paddle/fluid/operators/reduce_ops/reduce_amax_op.cc index c5bc66e23ce..29587faa480 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_amax_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_amax_op.cc @@ -16,16 +16,18 @@ REGISTER_REDUCE_OP(reduce_amax); REGISTER_OP_CPU_KERNEL( - reduce_amax, ops::ReduceKernel, + reduce_amax, + ops::ReduceKernel, ops::ReduceKernel, ops::ReduceKernel, ops::ReduceKernel); REGISTER_OP_CPU_KERNEL( - reduce_amax_grad, ops::ReduceGradKernel, + reduce_amax_grad, + ops::ReduceGradKernel, ops::ReduceGradKernel, ops::ReduceGradKernel, + reduce_amax_grad, + ops::ReduceGradKernel, ops::ReduceGradKernel, ops::ReduceGradKernel, + reduce_amin, + ops::ReduceKernel, ops::ReduceKernel, ops::ReduceKernel, ops::ReduceKernel); REGISTER_OP_CPU_KERNEL( - reduce_amin_grad, ops::ReduceGradKernel, + reduce_amin_grad, + ops::ReduceGradKernel, ops::ReduceGradKernel, ops::ReduceGradKernel, + reduce_amin_grad, + ops::ReduceGradKernel, ops::ReduceGradKernel, ops::ReduceGradKernel #include + #include "paddle/fluid/operators/reduce_ops/reduce_op_xpu.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc index dc41979defb..8ce115ce669 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h" + #include #include #include diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc index 715dcb25c20..111537f6455 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc @@ -11,8 +11,8 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h" #include "paddle/fluid/operators/elementwise/elementwise_npu.h" +#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cc b/paddle/fluid/operators/reduce_ops/reduce_min_op.cc index 5e5b04d57b0..f6d8aa13182 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cc @@ -12,9 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" - #include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/unary.h" diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index b21e41c5b85..a2048004615 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -21,7 +21,6 @@ #include #include "paddle/fluid/framework/tensor.h" - #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/funcs/reduce_function.h" namespace paddle { diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index 76641698ead..322ef1fdff6 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/cast_op.h" @@ -484,8 +485,9 @@ class ReduceOp : public framework::OperatorWithKernel { platform::is_gpu_place(ctx.GetPlace()) || platform::is_npu_place(ctx.GetPlace()) || platform::is_mlu_place(ctx.GetPlace()), - true, platform::errors::InvalidArgument( - "float16 can only be used on GPU or NPU or MLU place")); + true, + platform::errors::InvalidArgument( + "float16 can only be used on GPU or NPU or MLU place")); } return framework::OpKernelType(input_data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_op_function.h b/paddle/fluid/operators/reduce_ops/reduce_op_function.h index c144e65cbf6..a9d5863558c 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op_function.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op_function.h @@ -14,6 +14,7 @@ #pragma once #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/reduce_ops/reduce_op_mlu.h b/paddle/fluid/operators/reduce_ops/reduce_op_mlu.h index 95dda354cae..96e496217d0 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op_mlu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op_mlu.h @@ -17,6 +17,7 @@ #ifdef PADDLE_WITH_MLU #include #include + #include "paddle/fluid/operators/mlu/mlu_baseop.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.h" diff --git a/paddle/fluid/operators/reduce_ops/reduce_op_xpu.h b/paddle/fluid/operators/reduce_ops/reduce_op_xpu.h index 324fd369e82..f9ae575e801 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op_xpu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op_xpu.h @@ -20,6 +20,7 @@ #include #include #include + #include "paddle/fluid/operators/reduce_ops/reduce_op.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op_xpu.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op_xpu.cc index 7a5c86c35c6..f50cfd0417a 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op_xpu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op_xpu.cc @@ -15,6 +15,7 @@ #ifdef PADDLE_WITH_XPU #include #include + #include "paddle/fluid/operators/reduce_ops/reduce_op_xpu.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" diff --git a/paddle/fluid/operators/renorm_op.cu b/paddle/fluid/operators/renorm_op.cu index e40bd147b99..028f5a7f515 100644 --- 
a/paddle/fluid/operators/renorm_op.cu +++ b/paddle/fluid/operators/renorm_op.cu @@ -12,14 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/renorm_op.h" - #include #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" +#include "paddle/fluid/operators/renorm_op.h" #include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" @@ -107,10 +106,10 @@ __global__ void RenormGradKernelFunc2(const T* x_data, const T* dout_data, __syncthreads(); if (i < size) { dx_data[i] = dim_value[dim_index] * dout_data[i]; - dx_data[i] = dx_data[i] + - weight_derivative[dim_index] * dim_power_sum[dim_index] * - pow(abs(x_data[i]), T(p - 1.0)) * - (x_data[i] >= 0 ? 1 : -1); + dx_data[i] = dx_data[i] + weight_derivative[dim_index] * + dim_power_sum[dim_index] * + pow(abs(x_data[i]), T(p - 1.0)) * + (x_data[i] >= 0 ? 1 : -1); } } diff --git a/paddle/fluid/operators/repeat_interleave_op.cc b/paddle/fluid/operators/repeat_interleave_op.cc index d6f9df5d79e..daa45bf78f2 100644 --- a/paddle/fluid/operators/repeat_interleave_op.cc +++ b/paddle/fluid/operators/repeat_interleave_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/repeat_interleave_op.h" + #include namespace paddle { @@ -51,11 +52,12 @@ class RepeatInterleaveOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( repeats_dim.size() == 1 || (repeats_dim.size() == 2 && repeats_dim[1] == 1), - true, platform::errors::InvalidArgument( - "The 'shape' of Input(RepeatsTensor) must be 1-D tensor. " - "But received: the 'shape' of Input(Index) is [%s], " - "the dimension of Input(Index) is [%d].", - repeats_dim, repeats_dim.size())); + true, + platform::errors::InvalidArgument( + "The 'shape' of Input(RepeatsTensor) must be 1-D tensor. 
" + "But received: the 'shape' of Input(Index) is [%s], " + "the dimension of Input(Index) is [%d].", + repeats_dim, repeats_dim.size())); PADDLE_ENFORCE_EQ(repeats_dim[0] != 0, true, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/repeat_interleave_op.cu b/paddle/fluid/operators/repeat_interleave_op.cu index 5f48a4a94ac..2b8464d5bf6 100644 --- a/paddle/fluid/operators/repeat_interleave_op.cu +++ b/paddle/fluid/operators/repeat_interleave_op.cu @@ -127,10 +127,10 @@ class RepeatInterleaveCUDAKernel : public framework::OpKernel { int64_t size = output_dim[dim]; int64_t delta = input_dim[dim] - size; - index_select_cuda_kernel<<< - (numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, index_data, - numel, stride, size, delta); + index_select_cuda_kernel + <<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>( + in_data, out_data, index_data, numel, stride, size, delta); } else { RepeatsTensor2IndexTensor(*repeats_tensor, &index); @@ -143,10 +143,10 @@ class RepeatInterleaveCUDAKernel : public framework::OpKernel { int64_t size = output_dim[dim]; int64_t delta = input_dim[dim] - size; - index_select_cuda_kernel<<< - (numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, index_data, - numel, stride, size, delta); + index_select_cuda_kernel + <<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>( + in_data, out_data, index_data, numel, stride, size, delta); } } else if (repeats > 0) { int64_t index_size = in->dims()[dim] * repeats; @@ -169,10 +169,10 @@ class RepeatInterleaveCUDAKernel : public framework::OpKernel { int64_t delta = input_dim[dim] - size; const int* index_data = index.data(); - index_select_cuda_kernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - in_data, out_data, index_data, numel, stride, size, delta); + index_select_cuda_kernel + <<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>( + in_data, out_data, index_data, numel, stride, size, delta); platform::GpuStreamSync(stream); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -206,9 +206,9 @@ class RepeatInterleaveGradCUDAKernel : public framework::OpKernel { auto stream = context.template device_context().stream(); - index_select_grad_init< - T><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_grad_data, numel); + index_select_grad_init + <<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_grad_data, numel); int repeats = context.Attr("Repeats"); framework::LoDTensor index; @@ -237,22 +237,24 @@ class RepeatInterleaveGradCUDAKernel : public framework::OpKernel { int64_t index_nums = index.numel(); const int64_t* index_data = index.data(); - index_select_grad_cuda_kernel<<< - (out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - output_grad_data, in_grad_data, index_data, index_nums, out_nums, - stride, size, delta); + index_select_grad_cuda_kernel + <<<(out_nums + PADDLE_CUDA_NUM_THREADS - 1) / + PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>( + output_grad_data, in_grad_data, index_data, index_nums, + out_nums, stride, size, delta); 
platform::GpuStreamSync(stream); } else { RepeatsTensor2IndexTensor(*repeats_tensor, &index); int64_t index_nums = index.numel(); const int* index_data = index.data(); - index_select_grad_cuda_kernel<<< - (out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - output_grad_data, in_grad_data, index_data, index_nums, out_nums, - stride, size, delta); + index_select_grad_cuda_kernel + <<<(out_nums + PADDLE_CUDA_NUM_THREADS - 1) / + PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>( + output_grad_data, in_grad_data, index_data, index_nums, + out_nums, stride, size, delta); platform::GpuStreamSync(stream); } } else if (repeats > 0) { @@ -268,11 +270,11 @@ class RepeatInterleaveGradCUDAKernel : public framework::OpKernel { const int* index_data = index.data(); int64_t index_nums = index.numel(); - index_select_grad_cuda_kernel<<< - (out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(output_grad_data, in_grad_data, - index_data, index_nums, - out_nums, stride, size, delta); + index_select_grad_cuda_kernel + <<<(out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>( + output_grad_data, in_grad_data, index_data, index_nums, out_nums, + stride, size, delta); platform::GpuStreamSync(stream); } else { PADDLE_THROW(platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/repeat_interleave_op.h b/paddle/fluid/operators/repeat_interleave_op.h index 68b66bd534c..f8e39fdc907 100644 --- a/paddle/fluid/operators/repeat_interleave_op.h +++ b/paddle/fluid/operators/repeat_interleave_op.h @@ -14,11 +14,11 @@ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/index_select_op.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" - -#include "paddle/fluid/operators/index_select_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/requantize_op.cc b/paddle/fluid/operators/requantize_op.cc index 2d87ae91fbe..d9345c1145b 100644 --- a/paddle/fluid/operators/requantize_op.cc +++ b/paddle/fluid/operators/requantize_op.cc @@ -13,6 +13,7 @@ * limitations under the License. */ #include "paddle/fluid/operators/requantize_op.h" + #include "paddle/fluid/framework/op_version_registry.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" diff --git a/paddle/fluid/operators/requantize_op.h b/paddle/fluid/operators/requantize_op.h index c2b154db11d..8166aa98f07 100644 --- a/paddle/fluid/operators/requantize_op.h +++ b/paddle/fluid/operators/requantize_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/rnn_op.cc b/paddle/fluid/operators/rnn_op.cc index caf90219935..d3c6ee7c1e1 100644 --- a/paddle/fluid/operators/rnn_op.cc +++ b/paddle/fluid/operators/rnn_op.cc @@ -14,6 +14,7 @@ limitations under the License. 
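The reduce_op.h hunk a few files back re-wraps a kernel-dispatch guard worth spelling out: float16 reductions are only available on device places, so GetExpectedKernelType rejects unsupported places up front. A condensed sketch of that idiom, assuming the fluid framework types visible in the hunk; IndicateVarDataType is my assumption for how the input type is read:

    // Sketch only: body of a hypothetical OperatorWithKernel subclass.
    framework::OpKernelType GetExpectedKernelType(
        const framework::ExecutionContext& ctx) const override {
      auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
      if (input_data_type == framework::proto::VarType::FP16) {
        // Mirrors the guard in the reduce_op.h hunk: fail fast on CPU.
        PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
                          platform::errors::InvalidArgument(
                              "float16 can only be used on GPU place"));
      }
      return framework::OpKernelType(input_data_type, ctx.GetPlace());
    }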
*/ #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc index bf78b6a6965..db84387e6cf 100644 --- a/paddle/fluid/operators/roi_align_op.cc +++ b/paddle/fluid/operators/roi_align_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/operators/roi_align_op_xpu.cc b/paddle/fluid/operators/roi_align_op_xpu.cc index 7be1c190120..18938d71832 100644 --- a/paddle/fluid/operators/roi_align_op_xpu.cc +++ b/paddle/fluid/operators/roi_align_op_xpu.cc @@ -15,6 +15,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc index 12e33d56c00..e47145535a3 100644 --- a/paddle/fluid/operators/roi_pool_op.cc +++ b/paddle/fluid/operators/roi_pool_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/operators/row_conv_op.cc b/paddle/fluid/operators/row_conv_op.cc index 07a6117d711..9c66566fdfd 100644 --- a/paddle/fluid/operators/row_conv_op.cc +++ b/paddle/fluid/operators/row_conv_op.cc @@ -12,9 +12,11 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/row_conv_op.h" + #include #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/operators/row_conv_op.cu b/paddle/fluid/operators/row_conv_op.cu index c5794948aae..b1cabb018b9 100644 --- a/paddle/fluid/operators/row_conv_op.cu +++ b/paddle/fluid/operators/row_conv_op.cu @@ -344,9 +344,9 @@ class RowConvKernel dim3 block_dim = dim3(32, 32); dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); int mem_per_block = (future_context * block_dim.x) * sizeof(T); - RowConvForwardSharedMemory< - T><<>>( - in, weight, num_sequence, input_dim, future_context, idx, out); + RowConvForwardSharedMemory + <<>>( + in, weight, num_sequence, input_dim, future_context, idx, out); } else { dim3 block_dim = dim3(32, 32); dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); @@ -413,10 +413,10 @@ class RowConvGradKernel (block_y * block_x + block_y * (block_x + future_context - 1) + future_context * block_y) * sizeof(T); - RowConvGradFilterImproved< - T><<>>( - in, dout, num_sequence, input_dim, future_context, block_x, block_y, - idx, dfilter); + RowConvGradFilterImproved + <<>>( + in, dout, num_sequence, input_dim, future_context, block_x, + block_y, idx, dfilter); } else { dim3 block_dim = dim3(32, 32); dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); @@ -424,10 +424,10 @@ class RowConvGradKernel int block_y = block_dim.y; int mem_per_block = (block_x * block_y * 2) * sizeof(T); // For 2 arrays of size 32x32 - RowConvGradFilter< - T><<>>( - in, dout, num_sequence, input_dim, future_context, block_x, block_y, - idx, dfilter); + RowConvGradFilter + <<>>( + in, dout, num_sequence, input_dim, future_context, block_x, + block_y, idx, dfilter); } } @@ -437,9 +437,10 @@ class RowConvGradKernel dim3 block_dim = dim3(32, 32); dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); int mem_per_block = (future_context * block_dim.x) * sizeof(T); - RowConvGradInputSharedMemory< - T><<>>( - dout, weights, num_sequence, input_dim, future_context, idx, din); + RowConvGradInputSharedMemory + <<>>( + dout, weights, num_sequence, input_dim, future_context, idx, + din); } else { dim3 block_dim = dim3(32, 32); dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); diff --git a/paddle/fluid/operators/rrelu_op.cc b/paddle/fluid/operators/rrelu_op.cc index c543a088e9d..558c77b5b92 100644 --- a/paddle/fluid/operators/rrelu_op.cc +++ b/paddle/fluid/operators/rrelu_op.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/infermeta/unary.h" diff --git a/paddle/fluid/operators/run_program_op.h b/paddle/fluid/operators/run_program_op.h index 8007f0bc37b..bfd33efe833 100644 --- a/paddle/fluid/operators/run_program_op.h +++ b/paddle/fluid/operators/run_program_op.h @@ -99,11 +99,12 @@ static void CheckOutputVarStatus(const Variable &src_var, var_name, platform::demangle(framework::ToTypeName(src_var.Type())))); PADDLE_ENFORCE_EQ(src_var.Get().value().IsInitialized(), - true, platform::errors::InvalidArgument( - "The tensor in output variable %s get from " - "RunProgram(Grad)Op's " - "internal scope is not initialized.", - var_name)); + true, + platform::errors::InvalidArgument( + "The tensor in output variable %s get from " + "RunProgram(Grad)Op's " + "internal scope is not initialized.", + var_name)); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -224,7 +225,7 @@ class RunProgramOpKernel : public framework::OpKernel { framework::PEAndGraphPair pe_and_graph; auto callable = [this, is_test, &pe_and_graph]( - const framework::ExecutionContext &exe_ctx) { + const framework::ExecutionContext &exe_ctx) { pe_and_graph = ComputeImpl(exe_ctx, is_test, true); }; inner_graphs[graph_idx] = CaptureCUDAGraph( diff --git a/paddle/fluid/operators/sample_logits_op.cc b/paddle/fluid/operators/sample_logits_op.cc index e02c7ade9a1..a80d527fd5c 100644 --- a/paddle/fluid/operators/sample_logits_op.cc +++ b/paddle/fluid/operators/sample_logits_op.cc @@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sample_logits_op.h" + #include + #include "paddle/fluid/operators/math/sample_prob.h" namespace paddle { diff --git a/paddle/fluid/operators/sample_logits_op.cu b/paddle/fluid/operators/sample_logits_op.cu index 273010e5443..7eff9429244 100644 --- a/paddle/fluid/operators/sample_logits_op.cu +++ b/paddle/fluid/operators/sample_logits_op.cu @@ -16,6 +16,7 @@ limitations under the License. 
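The run_program_op.h hunk above only re-indents a capturing lambda that is handed off as a callback, but the shape is easy to misread in diff form. A standalone sketch of that capture-and-invoke pattern; all names here are hypothetical:

    #include <functional>
    #include <iostream>

    // Invokes a caller-supplied callback, like the callable passed to
    // CaptureCUDAGraph in the hunk above.
    void RunWithCallback(const std::function<void(int)>& callable) {
      callable(42);
    }

    int main() {
      int captured_result = 0;
      auto callable = [&captured_result](int value) {
        captured_result = value;  // writes through the by-reference capture
      };
      RunWithCallback(callable);
      std::cout << captured_result << "\n";  // prints 42
      return 0;
    }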
*/ #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" @@ -146,9 +147,9 @@ class SampleLogitsCUDAKernel : public framework::OpKernel { int threads = 512; size_t size = batch_size * num_true; int grid = (size + threads - 1) / threads; - GPUSetLabel< - T><<>>( - size, num_true, sampled_labels_data); + GPUSetLabel + <<>>( + size, num_true, sampled_labels_data); if (use_customized_samples) { const Tensor* customized_samples = @@ -190,17 +191,17 @@ class SampleLogitsCUDAKernel : public framework::OpKernel { size = batch_size * num_take; grid = (size + threads - 1) / threads; - GPUTakeAlongD1< - T><<>>( - size, batch_size, array_slice_size, idx_slice_size, p_array, p_index, - p_value); + GPUTakeAlongD1 + <<>>( + size, batch_size, array_slice_size, idx_slice_size, p_array, + p_index, p_value); if (remove_accidental_hits) { const size_t size = batch_size * (num_true + num_samples); int grid = (size + threads - 1) / threads; - gpu_compute_remove_accidental_hits< - T><<>>( - size, num_true, idx_slice_size, p_index, p_value); + gpu_compute_remove_accidental_hits + <<>>( + size, num_true, idx_slice_size, p_index, p_value); } // subtracted sampled logits with logQ(y|x) @@ -246,10 +247,10 @@ class SampleLogitsGradCUDAKernel : public framework::OpKernel { const size_t size = batch_size; int grid = (size + threads - 1) / threads; - GPUPutAlongD1< - T><<>>( - size, batch_size, array_slice_size, idx_slice_size, p_array, p_index, - p_value); + GPUPutAlongD1 + <<>>( + size, batch_size, array_slice_size, idx_slice_size, p_array, + p_index, p_value); } }; diff --git a/paddle/fluid/operators/sample_logits_op.h b/paddle/fluid/operators/sample_logits_op.h index ae741ae3212..815a2897d5d 100644 --- a/paddle/fluid/operators/sample_logits_op.h +++ b/paddle/fluid/operators/sample_logits_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc index 7fe6623dcca..23aa88459ce 100644 --- a/paddle/fluid/operators/save_combine_op.cc +++ b/paddle/fluid/operators/save_combine_op.cc @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include - #include "paddle/fluid/operators/save_combine_op.h" +#include + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/save_combine_op.h b/paddle/fluid/operators/save_combine_op.h index 8b8e27b79b9..a419e862501 100644 --- a/paddle/fluid/operators/save_combine_op.h +++ b/paddle/fluid/operators/save_combine_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include #include #include diff --git a/paddle/fluid/operators/save_load_combine_op_test.cc b/paddle/fluid/operators/save_load_combine_op_test.cc index 493f5081ee4..797321efd6c 100644 --- a/paddle/fluid/operators/save_load_combine_op_test.cc +++ b/paddle/fluid/operators/save_load_combine_op_test.cc @@ -15,6 +15,7 @@ limitations under the License. 
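The sample_logits and repeat_interleave launches above all size their grids with the same ceiling division, (size + threads - 1) / threads, so the final partial block is launched and masked by an in-kernel bound check. A standalone CUDA sketch with a hypothetical fill kernel:

    #include <cuda_runtime.h>

    constexpr int kNumThreads = 512;  // stand-in for PADDLE_CUDA_NUM_THREADS

    __global__ void fill_kernel(float* out, float value, long long numel) {
      long long i = blockIdx.x * (long long)blockDim.x + threadIdx.x;
      if (i < numel) out[i] = value;  // tail threads past numel do nothing
    }

    int main() {
      const long long numel = 10000;
      float* d_out;
      cudaMalloc(&d_out, numel * sizeof(float));
      // Ceiling division: enough blocks to cover every element even when
      // numel is not a multiple of the block size.
      const int blocks =
          static_cast<int>((numel + kNumThreads - 1) / kNumThreads);
      fill_kernel<<<blocks, kNumThreads>>>(d_out, 1.0f, numel);
      cudaDeviceSynchronize();
      cudaFree(d_out);
      return 0;
    }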
*/ #include #include #include + #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/bfloat16.h" diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index d819c172e4a..02774c6b72a 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -12,14 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/save_op.h" + #include + #include #include #include #include -#include "paddle/fluid/operators/save_op.h" - namespace paddle { namespace operators { class SaveOp : public framework::OperatorWithKernel { diff --git a/paddle/fluid/operators/save_op.h b/paddle/fluid/operators/save_op.h index e4ca1423afa..64aca1ab6b7 100644 --- a/paddle/fluid/operators/save_op.h +++ b/paddle/fluid/operators/save_op.h @@ -12,6 +12,7 @@ limitations under the License. */ #pragma once #include + #include #include #include diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index cbf2b915207..ebc4c644148 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/float16.h" diff --git a/paddle/fluid/operators/scale_op_xpu.cc b/paddle/fluid/operators/scale_op_xpu.cc index 40f5699a29b..fdc98d084ed 100644 --- a/paddle/fluid/operators/scale_op_xpu.cc +++ b/paddle/fluid/operators/scale_op_xpu.cc @@ -15,6 +15,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/scale_kernel.h" diff --git a/paddle/fluid/operators/scatter_nd_add_op.cc b/paddle/fluid/operators/scatter_nd_add_op.cc index 0ae0e1500c1..0cfc3a77aad 100644 --- a/paddle/fluid/operators/scatter_nd_add_op.cc +++ b/paddle/fluid/operators/scatter_nd_add_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/ddim.h" diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc index 5f6b04cf59e..a2e8071e013 100644 --- a/paddle/fluid/operators/scatter_op.cc +++ b/paddle/fluid/operators/scatter_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/ddim.h" diff --git a/paddle/fluid/operators/scatter_op_xpu.cc b/paddle/fluid/operators/scatter_op_xpu.cc index 07dd2f2d85f..3ab084b660a 100644 --- a/paddle/fluid/operators/scatter_op_xpu.cc +++ b/paddle/fluid/operators/scatter_op_xpu.cc @@ -56,11 +56,12 @@ class ScatterOpXPUKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( index->dims().size() == 1 || (index->dims().size() == 2 && index->dims()[1] == 1), - true, platform::errors::InvalidArgument( - "index's shape is error, " - "expect index'dims shape is 1 or 2 and index.dims[1] is 1" - "but got index'dims shape is %d", - index->dims().size())); + true, + platform::errors::InvalidArgument( + "index's shape is error, " + "expect index'dims shape is 1 or 2 and index.dims[1] is 1" + "but got index'dims shape is %d", + index->dims().size())); int index_size = static_cast(index->dims()[0]); auto x_dims = x->dims(); diff --git a/paddle/fluid/operators/seed_op.cc b/paddle/fluid/operators/seed_op.cc index 837ccae0284..7cad6dcab7c 100644 --- a/paddle/fluid/operators/seed_op.cc +++ b/paddle/fluid/operators/seed_op.cc @@ -74,13 +74,12 @@ REGISTER_OP_CPU_KERNEL( seed, ops::CPUSeedKernel); /* ========================== register checkpoint ===========================*/ -REGISTER_OP_VERSION(seed) - .AddCheckpoint( - R"ROC( +REGISTER_OP_VERSION(seed).AddCheckpoint( + R"ROC( Upgrade seed add a new attribute [force_cpu])ROC", - paddle::framework::compatible::OpVersionDesc().NewAttr( - "force_cpu", - "If true, Force fill output variable to cpu." - "memory. Otherwise, fill output variable to the running " - "device", - false)); + paddle::framework::compatible::OpVersionDesc().NewAttr( + "force_cpu", + "If true, Force fill output variable to cpu." + "memory. Otherwise, fill output variable to the running " + "device", + false)); diff --git a/paddle/fluid/operators/segment_pool_op.cc b/paddle/fluid/operators/segment_pool_op.cc index 9d4c8532a82..92010e8afc0 100644 --- a/paddle/fluid/operators/segment_pool_op.cc +++ b/paddle/fluid/operators/segment_pool_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc index f6523255e24..0f17ff1e1b7 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/sequence_ops/sequence_concat_op.h" + #include #include diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc index d58a2da29c9..4856e38011b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/operators/sequence_ops/sequence_concat_op.h" + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.h b/paddle/fluid/operators/sequence_ops/sequence_concat_op.h index 1b8525febe2..f27e6535d31 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.h @@ -16,6 +16,7 @@ #include #include + #include "boost/optional.hpp" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/concat_and_split.h" @@ -49,7 +50,7 @@ inline framework::LoD ConcatLoD(const Container &xs, template inline std::vector> GetDataVectorSafely( - const std::vector &vec, ARGS &&... args) { + const std::vector &vec, ARGS &&...args) { std::vector> result; result.reserve(vec.size()); for (auto *ptr : vec) { diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op.h b/paddle/fluid/operators/sequence_ops/sequence_conv_op.h index 62fa5bc26ac..1935a62621d 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_conv_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/context_project.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc b/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc index 23c6a0133e1..ef440a580f9 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc @@ -54,10 +54,12 @@ class SequenceConvXPUKernel : public framework::OpKernel { int up_pad = std::max(0, -context_start); int down_pad = std::max(0, context_start + context_length - 1); - PADDLE_ENFORCE_EQ(up_pad, 2, platform::errors::InvalidArgument( - "Only support up_pad equal 2.")); - PADDLE_ENFORCE_EQ(down_pad, 2, platform::errors::InvalidArgument( - "Only support down_pad equal 2.")); + PADDLE_ENFORCE_EQ( + up_pad, 2, + platform::errors::InvalidArgument("Only support up_pad equal 2.")); + PADDLE_ENFORCE_EQ( + down_pad, 2, + platform::errors::InvalidArgument("Only support down_pad equal 2.")); auto xpu_context = context.template device_context().x_context(); @@ -75,8 +77,9 @@ class SequenceConvXPUKernel : public framework::OpKernel { // If batch size set to 256, the lod is {0, batch[0] - 0, // batch[1] - batch [0], ..., batch[255] - batch[254]}, // so the lod_size will be 257. 
- PADDLE_ENFORCE_LE(lod_size, 257, platform::errors::InvalidArgument( - "Only support batch size <= 256.")); + PADDLE_ENFORCE_LE( + lod_size, 257, + platform::errors::InvalidArgument("Only support batch size <= 256.")); std::vector cpu_lodx(lod_size); for (int i = 0; i < lod_size; i++) { @@ -155,15 +158,18 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { int up_pad = std::max(0, -context_start); int down_pad = std::max(0, context_start + context_length - 1); - PADDLE_ENFORCE_EQ(up_pad, 2, platform::errors::InvalidArgument( - "Only support up_pad equal 2.")); - PADDLE_ENFORCE_EQ(down_pad, 2, platform::errors::InvalidArgument( - "Only support down_pad equal 2.")); + PADDLE_ENFORCE_EQ( + up_pad, 2, + platform::errors::InvalidArgument("Only support up_pad equal 2.")); + PADDLE_ENFORCE_EQ( + down_pad, 2, + platform::errors::InvalidArgument("Only support down_pad equal 2.")); auto lod_level_0 = in->lod()[0]; int lod_size = lod_level_0.size(); - PADDLE_ENFORCE_LE(lod_size, 257, platform::errors::InvalidArgument( - "Only support batch size <= 256.")); + PADDLE_ENFORCE_LE( + lod_size, 257, + platform::errors::InvalidArgument("Only support batch size <= 256.")); std::vector cpu_lodx(lod_size); for (int i = 0; i < lod_size; i++) { diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu index 9591f3e8b5b..0f47e8a9c2a 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu @@ -14,6 +14,7 @@ #include #include + #include "paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" diff --git a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc index 79503d9714f..552a8283b36 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_erase_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu index 12d3eee65da..a87c3279224 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/operators/sequence_ops/sequence_erase_op.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" diff --git a/paddle/fluid/operators/sequence_ops/sequence_erase_op.h b/paddle/fluid/operators/sequence_ops/sequence_erase_op.h index ed98b694b27..8d10ee508a2 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_erase_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_erase_op.h @@ -15,6 +15,7 @@ limitations under the License. 
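One substantive spelling change in this patch is the variadic pack in sequence_concat_op.h above: ARGS &&... args becomes ARGS &&...args, clang-format's newer canonical form for a forwarding-reference pack. A standalone sketch of what such a pack does; the make_vector helper is hypothetical:

    #include <iostream>
    #include <utility>
    #include <vector>

    // The pack binds to both lvalues and rvalues and is forwarded
    // unchanged to emplace_back, as in GetDataVectorSafely above.
    template <typename T, typename... Args>
    std::vector<T> make_vector(Args&&...args) {
      std::vector<T> v;
      v.reserve(sizeof...(args));
      (v.emplace_back(std::forward<Args>(args)), ...);  // C++17 fold
      return v;
    }

    int main() {
      auto v = make_vector<int>(1, 2, 3);
      std::cout << v.size() << "\n";  // prints 3
      return 0;
    }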
*/ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc index 494c8e3ab74..01e9835270c 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h" + #include #include diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu index 7e1a06b9eca..5cc4ecdd12a 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h index 117fa504ff3..5abe6df09e5 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #include // std::iota #include #include + #include "glog/logging.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc index e4f2c1b2b8f..4817b003a28 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h" + #include namespace paddle { @@ -64,10 +65,11 @@ class SequenceExpandOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( ref_level == -1 || (ref_level >= 0 && ref_level < static_cast(y_lod.size())), - true, platform::errors::InvalidArgument( - "Invlid `ref_level`, which should be either equal to -1 " - "or in [0, %d), but received `ref_level` = %u.", - y_lod.size(), ref_level)); + true, + platform::errors::InvalidArgument( + "Invlid `ref_level`, which should be either equal to -1 " + "or in [0, %d), but received `ref_level` = %u.", + y_lod.size(), ref_level)); if (ref_level == -1) ref_level = y_lod.size() - 1; diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu index 7b7bc5183bf..90f911c438b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include + #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc index f22b424b307..060a3e7cab3 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/sequence_ops/sequence_mask_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc index 5d0e1d0194e..7d018e764bd 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_pad_op.h" + #include #include diff --git a/paddle/fluid/operators/sequence_ops/sequence_pad_op.h b/paddle/fluid/operators/sequence_ops/sequence_pad_op.h index 3aaa2828d5b..d4022e80d80 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pad_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_pad_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/math/sequence_padding.h" diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc index 01990ebb732..af42285158b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_pool_op.h" + #include #include @@ -30,11 +31,12 @@ class SequencePoolOp : public framework::OperatorWithKernel { if (!ctx->IsRuntime()) { // Check the lod_level for compile-time. auto in_lod_level = ctx->GetLoDLevel("X"); - PADDLE_ENFORCE_GT(in_lod_level, 0, platform::errors::InvalidArgument( - "The LoD level of Input(X) should " - "be larger than 0, but received: " - "lod level %u.", - in_lod_level)); + PADDLE_ENFORCE_GT( + in_lod_level, 0, + platform::errors::InvalidArgument("The LoD level of Input(X) should " + "be larger than 0, but received: " + "lod level %u.", + in_lod_level)); ctx->SetLoDLevel("Out", in_lod_level - 1); } diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.h b/paddle/fluid/operators/sequence_ops/sequence_pool_op.h index 4d981e0187a..96d02e6d2e5 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/sequence_pooling.h" @@ -38,9 +39,10 @@ class SequencePoolKernel : public framework::OpKernel { auto lod = in->lod(); auto lod_level = lod.size(); // InferShape by lod - PADDLE_ENFORCE_GT(lod_level, 0, platform::errors::InvalidArgument( - "Input(X) Tensor of SequencePoolOp " - "does not contain LoD information.")); + PADDLE_ENFORCE_GT( + lod_level, 0, + platform::errors::InvalidArgument("Input(X) Tensor of SequencePoolOp " + "does not contain LoD information.")); PADDLE_ENFORCE_LE(lod_level, 2UL, platform::errors::InvalidArgument( "The lod level of input shall be no more than 2." diff --git a/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc b/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc index 980879db4d0..3a62bc554df 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/operators/sequence_ops/sequence_reshape_op.h" + #include + #include "paddle/phi/core/ddim.h" namespace paddle { diff --git a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h index 90a17d713cf..85282bf23b4 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/funcs/algorithm.h" diff --git a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc index 25c12ab565a..6fa151af4e1 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_scatter_op.h" + #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc index 06fb444740f..fdb24892e09 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_slice_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc index e7585f7ab04..e3f8d16a7ad 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/sequence_ops/sequence_softmax_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu index c91c59dbfee..0d91832948d 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu @@ -134,10 +134,10 @@ struct SequenceSoftmaxFunctor { dim3 block_size(thread_x); dim3 grid_size(max_blocks); paddle::framework::MixVector mixv_ref_lod(&ref_lod); - sequence_softmax_kernel< - T, kThreadsPerBlock><<>>( - x.data(), mixv_ref_lod.CUDAData(context.GetPlace()), height, - out->mutable_data(context.GetPlace())); + sequence_softmax_kernel + <<>>( + x.data(), mixv_ref_lod.CUDAData(context.GetPlace()), height, + out->mutable_data(context.GetPlace())); } }; @@ -158,11 +158,11 @@ struct SequenceSoftmaxGradFunctor { dim3 grid_size(max_blocks); paddle::framework::MixVector mixv_ref_lod(&ref_lod); - sequence_softmax_grad_kernel< - T, kThreadsPerBlock><<>>( - dout.data(), out.data(), - mixv_ref_lod.CUDAData(context.GetPlace()), height, - dx->mutable_data(context.GetPlace())); + sequence_softmax_grad_kernel + <<>>( + dout.data(), out.data(), + mixv_ref_lod.CUDAData(context.GetPlace()), height, + dx->mutable_data(context.GetPlace())); } }; diff --git a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc index bacdd7e4ccb..b1d5ec8e9c6 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h" + #include #include @@ -44,8 +45,9 @@ class SequenceTopkAvgPoolingOp : public framework::OperatorWithKernel { auto topks = attr.Get>("topks"); auto num_k = topks.size(); PADDLE_ENFORCE_GT( - num_k, 0, platform::errors::InvalidArgument( - "Expected topks.size() > 0, but received %zu.", num_k)); + num_k, 0, + platform::errors::InvalidArgument( + "Expected topks.size() > 0, but received %zu.", num_k)); auto row_dim = ctx->GetInputDim("ROW"); auto row_shape_0 = row_dim[0]; diff --git a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h index 47180f123fa..b5ee43387b3 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc index 180d14cfada..636be3b2f6c 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/sequence_ops/sequence_unpad_op.h" + #include #include diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h index d96dc91f3bc..d643ef860c3 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/math/sequence_padding.h" diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index 73655bcb185..4adedf09aa3 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -18,7 +18,6 @@ #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_version_registry.h" - #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/unary.h" diff --git a/paddle/fluid/operators/set_value_op_npu.cc b/paddle/fluid/operators/set_value_op_npu.cc index daa033f9dc6..2231eb212a2 100644 --- a/paddle/fluid/operators/set_value_op_npu.cc +++ b/paddle/fluid/operators/set_value_op_npu.cc @@ -14,7 +14,6 @@ limitations under the License. */ #include "paddle/fluid/operators/set_value_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" - #include "paddle/phi/kernels/funcs/slice_utils.h" namespace paddle { diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc index 9001ce5d51d..38482f7b55e 100644 --- a/paddle/fluid/operators/shape_op.cc +++ b/paddle/fluid/operators/shape_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/shape_op_xpu.cc b/paddle/fluid/operators/shape_op_xpu.cc index a62d1b434e7..d4c7d937d4b 100644 --- a/paddle/fluid/operators/shape_op_xpu.cc +++ b/paddle/fluid/operators/shape_op_xpu.cc @@ -11,6 +11,7 @@ #ifdef PADDLE_WITH_XPU #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/share_buffer_op.h b/paddle/fluid/operators/share_buffer_op.h index 1d0abf14f57..1b564c3bef0 100644 --- a/paddle/fluid/operators/share_buffer_op.h +++ b/paddle/fluid/operators/share_buffer_op.h @@ -27,8 +27,9 @@ class ShareBufferOpKernel : public framework::OpKernel { const auto inputs = ctx.MultiInput("X"); auto outputs = ctx.MultiOutput("Out"); size_t n = inputs.size(); - PADDLE_ENFORCE_EQ(n, outputs.size(), platform::errors::PermissionDenied( - "Variable number not match.")); + PADDLE_ENFORCE_EQ( + n, outputs.size(), + platform::errors::PermissionDenied("Variable number not match.")); const auto &share_dims_and_dtype = ctx.Attr>("share_dims_and_dtype"); if (!share_dims_and_dtype.empty()) { diff --git a/paddle/fluid/operators/share_data_op.cc b/paddle/fluid/operators/share_data_op.cc index 6fcc29e9002..63e8cb648e8 100644 --- a/paddle/fluid/operators/share_data_op.cc +++ b/paddle/fluid/operators/share_data_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/share_data_op.h" + #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -31,8 +32,9 @@ class ShareDataOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( in_type == framework::proto::VarType::LOD_TENSOR || in_type == framework::proto::VarType::SELECTED_ROWS, - true, platform::errors::InvalidArgument( - "Type of Variable[X] must be LoDTensor or SelectedRows!")); + true, + platform::errors::InvalidArgument( + "Type of Variable[X] must be LoDTensor or SelectedRows!")); PADDLE_ENFORCE_EQ( in_type, out_type, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/shrink_rnn_memory_op.cc b/paddle/fluid/operators/shrink_rnn_memory_op.cc index 1a3666ad823..7388144dda3 100644 --- a/paddle/fluid/operators/shrink_rnn_memory_op.cc +++ b/paddle/fluid/operators/shrink_rnn_memory_op.cc @@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/array_operator.h" -#include "paddle/phi/kernels/funcs/math_function.h" - #include "paddle/phi/core/lod_utils.h" +#include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/operators/shuffle_batch_op.cc b/paddle/fluid/operators/shuffle_batch_op.cc index 45f7ab278a3..e338b48a4cc 100644 --- a/paddle/fluid/operators/shuffle_batch_op.cc +++ b/paddle/fluid/operators/shuffle_batch_op.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/operators/shuffle_batch_op.h" + #include + #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" #include "paddle/fluid/framework/var_type_inference.h" diff --git a/paddle/fluid/operators/shuffle_batch_op.h b/paddle/fluid/operators/shuffle_batch_op.h index 2708b4a392d..f56832f9599 100644 --- a/paddle/fluid/operators/shuffle_batch_op.h +++ b/paddle/fluid/operators/shuffle_batch_op.h @@ -21,6 +21,7 @@ #include #include #include + #include "glog/logging.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/lod_tensor.h" diff --git a/paddle/fluid/operators/shuffle_channel_op.cc b/paddle/fluid/operators/shuffle_channel_op.cc index 70fddc9b047..c43d456e94e 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cc +++ b/paddle/fluid/operators/shuffle_channel_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/shuffle_channel_op.h" + #include #include @@ -61,8 +62,9 @@ class ShuffleChannelOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("group", "the number of groups.") .SetDefault(1) .AddCustomChecker([](const int& group) { - PADDLE_ENFORCE_GE(group, 1, platform::errors::InvalidArgument( - "group should be larger than 0.")); + PADDLE_ENFORCE_GE(group, 1, + platform::errors::InvalidArgument( + "group should be larger than 0.")); }); AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") diff --git a/paddle/fluid/operators/shuffle_channel_op.cu b/paddle/fluid/operators/shuffle_channel_op.cu index 582d1ea0f26..d3f6224594b 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cu +++ b/paddle/fluid/operators/shuffle_channel_op.cu @@ -67,10 +67,10 @@ class ShuffleChannelOpCUDAKernel : public framework::OpKernel { const T* input_data = input->data(); T* output_data = output->mutable_data(ctx.GetPlace()); - ShuffleChannel< - T><<>>( - count, feature_map_size, output_data, input_data, group_row, - group_column, sp_sz); + ShuffleChannel + <<>>( + count, feature_map_size, output_data, input_data, group_row, + group_column, sp_sz); } }; @@ -103,10 +103,10 @@ class ShuffleChannelGradOpCUDAKernel : public framework::OpKernel { int threads = kNumCUDAThreads; int count = num * group_column * group_row * sp_sz; - ShuffleChannel< - T><<>>( - count, feature_map_size, input_grad_data, output_grad_data, group_row, - group_column, sp_sz); + ShuffleChannel + <<>>( + count, feature_map_size, input_grad_data, output_grad_data, + group_row, group_column, sp_sz); } }; } // namespace operators diff --git a/paddle/fluid/operators/shuffle_channel_op.h b/paddle/fluid/operators/shuffle_channel_op.h index aeaac486f3f..409acdfdff7 100644 --- a/paddle/fluid/operators/shuffle_channel_op.h +++ b/paddle/fluid/operators/shuffle_channel_op.h @@ -12,6 +12,7 @@ limitations under the License. */ #pragma once #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc index 016ff54645b..0cf1296fce6 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/similarity_focus_op.h b/paddle/fluid/operators/similarity_focus_op.h index 4fa4d772aa3..17ea30277b8 100644 --- a/paddle/fluid/operators/similarity_focus_op.h +++ b/paddle/fluid/operators/similarity_focus_op.h @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -67,16 +68,16 @@ class SimilarityFocusKernel : public framework::OpKernel { std::vector> array(array_size); - bool (*cmp)(std::pair, std::pair) = []( - std::pair x, std::pair y) { - return x.first > y.first; - }; + bool (*cmp)(std::pair, std::pair) = + [](std::pair x, std::pair y) { + return x.first > y.first; + }; - int64_t (*compute_index)(int64_t*, int, int, int, int) = []( - int64_t* dim, int d1, int d2, int d3, int d4) { - return d1 * dim[1] * dim[2] * dim[3] + d2 * dim[2] * dim[3] + - d3 * dim[3] + d4; - }; + int64_t (*compute_index)(int64_t*, int, int, int, int) = + [](int64_t* dim, int d1, int d2, int d3, int d4) { + return d1 * dim[1] * dim[2] * dim[3] + d2 * dim[2] * dim[3] + + d3 * dim[3] + d4; + }; PADDLE_ENFORCE_GT( axis, 0, diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index c6432d00e9d..a815e12d061 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -13,10 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/slice_op.h" + #include #include #include #include + #include "paddle/phi/kernels/funcs/slice_utils.h" namespace paddle { @@ -85,8 +87,9 @@ class SliceOp : public framework::OperatorWithKernel { } if (ctx->HasInputs("EndsTensorList")) { ends_size = ctx->Inputs("EndsTensorList").size(); - PADDLE_ENFORCE_GT(ends_size, 0, platform::errors::InvalidArgument( - "EndsTensorList size can't be zero")); + PADDLE_ENFORCE_GT(ends_size, 0, + platform::errors::InvalidArgument( + "EndsTensorList size can't be zero")); } if (!ctx->HasInput("StartsTensor")) { diff --git a/paddle/fluid/operators/slice_op.h b/paddle/fluid/operators/slice_op.h index a9a98b46d5e..f18ffef3f58 100644 --- a/paddle/fluid/operators/slice_op.h +++ b/paddle/fluid/operators/slice_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/utils.h" diff --git a/paddle/fluid/operators/slice_op_mlu.cc b/paddle/fluid/operators/slice_op_mlu.cc index 43322e4b2e7..7645232ec0c 100644 --- a/paddle/fluid/operators/slice_op_mlu.cc +++ b/paddle/fluid/operators/slice_op_mlu.cc @@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/slice_op.h" - #include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/operators/slice_op.h" #include "paddle/phi/kernels/funcs/slice_utils.h" namespace paddle { diff --git a/paddle/fluid/operators/slice_op_npu.cc b/paddle/fluid/operators/slice_op_npu.cc index 0d0d9ab19df..3441453430e 100644 --- a/paddle/fluid/operators/slice_op_npu.cc +++ b/paddle/fluid/operators/slice_op_npu.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
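Note: the similarity_focus hunk above shows the new treatment of long lambda initializers. Rather than leaving "= [](" dangling and wrapping inside the parameter list, clang-format breaks after the '=' and indents the whole lambda as one continuation. A compilable reduction of that exact change, assuming <float, int> as the pair's element types for the sketch:

#include <cstdio>
#include <utility>

int main() {
  // Break after '=', keep the lambda introducer and parameter list intact:
  bool (*cmp)(std::pair<float, int>, std::pair<float, int>) =
      [](std::pair<float, int> x, std::pair<float, int> y) {
        return x.first > y.first;
      };
  std::printf("%d\n", cmp({2.0f, 0}, {1.0f, 1}));
  return 0;
}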
*/ #include "paddle/fluid/operators/slice_op.h" - #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/phi/kernels/funcs/slice_utils.h" diff --git a/paddle/fluid/operators/slice_op_xpu.cc b/paddle/fluid/operators/slice_op_xpu.cc index 6ac1027b0ce..8f2dfd38d49 100644 --- a/paddle/fluid/operators/slice_op_xpu.cc +++ b/paddle/fluid/operators/slice_op_xpu.cc @@ -13,11 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/slice_op.h" #include #include #include #include + +#include "paddle/fluid/operators/slice_op.h" #include "xpu/refactor/math.h" namespace paddle { @@ -53,8 +54,9 @@ class SliceXPUKernel : public framework::OpKernel { start = std::max(start, 0); end = std::max(end, 0); end = std::min(end, dim_value); - PADDLE_ENFORCE_GT(end, start, platform::errors::InvalidArgument( - "end should greater than start")); + PADDLE_ENFORCE_GT( + end, start, + platform::errors::InvalidArgument("end should greater than start")); starts[i] = start; ends[i] = end; } diff --git a/paddle/fluid/operators/smooth_l1_loss_op.cc b/paddle/fluid/operators/smooth_l1_loss_op.cc index c0318d344ae..05204354d09 100644 --- a/paddle/fluid/operators/smooth_l1_loss_op.cc +++ b/paddle/fluid/operators/smooth_l1_loss_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/smooth_l1_loss_op.h" + #include namespace paddle { diff --git a/paddle/fluid/operators/smooth_l1_loss_op_npu.cc b/paddle/fluid/operators/smooth_l1_loss_op_npu.cc index 136ea68ac9e..bdc46abff2a 100644 --- a/paddle/fluid/operators/smooth_l1_loss_op_npu.cc +++ b/paddle/fluid/operators/smooth_l1_loss_op_npu.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/smooth_l1_loss_op.h" #include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/smooth_l1_loss_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 3840b99dd17..7304467833a 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -61,8 +61,9 @@ class SoftmaxOp : public framework::OperatorWithKernel { if (input_data_type == framework::proto::VarType::FP16) { PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()) || platform::is_xpu_place(ctx.GetPlace()), - true, platform::errors::InvalidArgument( - "float16 can only be used on GPU/XPU place")); + true, + platform::errors::InvalidArgument( + "float16 can only be used on GPU/XPU place")); } #endif diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc index c07467a9b0b..4b55f5af09d 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc @@ -44,8 +44,9 @@ class SoftmaxWithCrossEntropyXPUKernel : public framework::OpKernel { Tensor* loss = context.Output("Loss"); const int rank = logits->dims().size(); const int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); - PADDLE_ENFORCE_EQ(axis, rank - 1, platform::errors::InvalidArgument( - "axis should == rank - 1")); + PADDLE_ENFORCE_EQ( + axis, rank - 1, + platform::errors::InvalidArgument("axis should == rank - 1")); softmax->mutable_data(context.GetPlace()); loss->mutable_data(context.GetPlace()); const int n = phi::funcs::SizeToAxis(axis, logits->dims()); @@ -140,8 +141,9 @@ class SoftmaxWithCrossEntropyGradXPUKernel : public framework::OpKernel { const int rank = logit_grad->dims().size(); const int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); - PADDLE_ENFORCE_EQ(axis, rank - 1, platform::errors::InvalidArgument( - "axis should == rank - 1")); + PADDLE_ENFORCE_EQ( + axis, rank - 1, + platform::errors::InvalidArgument("axis should == rank - 1")); const int n = phi::funcs::SizeToAxis(axis, logit_grad->dims()); const int d = phi::funcs::SizeFromAxis(axis, logit_grad->dims()); diff --git a/paddle/fluid/operators/solve_op.cc b/paddle/fluid/operators/solve_op.cc index 57302ae0342..4d23f1ce209 100644 --- a/paddle/fluid/operators/solve_op.cc +++ b/paddle/fluid/operators/solve_op.cc @@ -13,10 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/solve_op.h" + #include #include #include #include + #include "paddle/phi/core/ddim.h" namespace paddle { diff --git a/paddle/fluid/operators/solve_op.h b/paddle/fluid/operators/solve_op.h index 7f3a5748666..928fbf755d7 100644 --- a/paddle/fluid/operators/solve_op.h +++ b/paddle/fluid/operators/solve_op.h @@ -92,9 +92,10 @@ static framework::DDim GetOutputShapeUnsqueeze( for (int axis : unsqz_dims) { int cur = axis < 0 ? 
axis + cur_output_size + 1 : axis; // Vaildity Check: the axis bound - PADDLE_ENFORCE_GE(cur, 0, platform::errors::InvalidArgument( - "The insert dimension value should " - "not be less than 0")); + PADDLE_ENFORCE_GE( + cur, 0, + platform::errors::InvalidArgument("The insert dimension value should " + "not be less than 0")); PADDLE_ENFORCE_LE(cur, cur_output_size, platform::errors::InvalidArgument( "The insert dimension value shoule not be larger " diff --git a/paddle/fluid/operators/space_to_depth_op.cc b/paddle/fluid/operators/space_to_depth_op.cc index 013467396b3..6a6972f3293 100644 --- a/paddle/fluid/operators/space_to_depth_op.cc +++ b/paddle/fluid/operators/space_to_depth_op.cc @@ -38,8 +38,9 @@ class SpaceToDepthOp : public framework::OperatorWithKernel { "Output(Out) of SpaceToDepthOp should not be null.")); auto x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ(x_dims.size(), 4, platform::errors::InvalidArgument( - "input should be a 4D tensor")); + PADDLE_ENFORCE_EQ( + x_dims.size(), 4, + platform::errors::InvalidArgument("input should be a 4D tensor")); auto blocksize = ctx->Attrs().Get("blocksize"); PADDLE_ENFORCE_GT(blocksize, 1, diff --git a/paddle/fluid/operators/sparse_attention_op.cc b/paddle/fluid/operators/sparse_attention_op.cc index a6534543a65..14d1ffe3f11 100644 --- a/paddle/fluid/operators/sparse_attention_op.cc +++ b/paddle/fluid/operators/sparse_attention_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/sparse_attention_op.cu b/paddle/fluid/operators/sparse_attention_op.cu index 49f8263ab28..2949642d2f3 100644 --- a/paddle/fluid/operators/sparse_attention_op.cu +++ b/paddle/fluid/operators/sparse_attention_op.cu @@ -13,9 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include + #include #include #include + #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" #if defined(PADDLE_WITH_CUDA) @@ -90,17 +92,15 @@ __global__ void BlockSparseSoftmaxForward(T* softmax, const T* src, T scale, if (cur_block_col < cur_block_nnz) { // read kp mask T cur_kp_mask; - if ((kp_mask != nullptr) && - std::abs(kp_mask[colindex[cur_block_col]]) < - std::numeric_limits::epsilon()) { + if ((kp_mask != nullptr) && std::abs(kp_mask[colindex[cur_block_col]]) < + std::numeric_limits::epsilon()) { cur_kp_mask = -std::numeric_limits::infinity(); } else { cur_kp_mask = 0; } // do mask operation - if ((attnptr != nullptr) && - std::abs(attnptr[colindex[cur_block_col]]) < - std::numeric_limits::epsilon()) { + if ((attnptr != nullptr) && std::abs(attnptr[colindex[cur_block_col]]) < + std::numeric_limits::epsilon()) { srcdata[cur_reg_index] = -std::numeric_limits::infinity() * scale + cur_kp_mask; } else { @@ -280,37 +280,37 @@ void SparseSoftmaxBackward(const platform::CUDADeviceContext& ctx, T scaling = static_cast(1.0) / sqrt(static_cast(num_cols)); if (num_cols <= 4) { - BlockSparseSoftmaxBackward<<>>( - dx_data, dout_data, out_data, scaling, offset_data, columns_data, - num_rows); + BlockSparseSoftmaxBackward + <<>>(dx_data, dout_data, out_data, scaling, offset_data, + columns_data, num_rows); } else if (num_cols > 4 && num_cols <= 8) { - BlockSparseSoftmaxBackward<<>>( - dx_data, dout_data, out_data, scaling, offset_data, columns_data, - num_rows); + BlockSparseSoftmaxBackward + <<>>(dx_data, dout_data, out_data, scaling, offset_data, + columns_data, num_rows); } else if (num_cols > 8 && num_cols <= 16) { - BlockSparseSoftmaxBackward<<>>( - dx_data, dout_data, out_data, scaling, offset_data, columns_data, - num_rows); + BlockSparseSoftmaxBackward + <<>>(dx_data, dout_data, out_data, scaling, offset_data, + columns_data, num_rows); } else if (num_cols > 16 && num_cols <= 32) { - BlockSparseSoftmaxBackward<<>>( - dx_data, dout_data, out_data, scaling, offset_data, columns_data, - num_rows); + BlockSparseSoftmaxBackward + <<>>(dx_data, dout_data, out_data, scaling, offset_data, + columns_data, num_rows); } else if (num_cols > 32 && num_cols <= 64) { - BlockSparseSoftmaxBackward<<>>( - dx_data, dout_data, out_data, scaling, offset_data, columns_data, - num_rows); + BlockSparseSoftmaxBackward + <<>>(dx_data, dout_data, out_data, scaling, offset_data, + columns_data, num_rows); } else if (num_cols > 64 && num_cols <= 128) { - BlockSparseSoftmaxBackward<<>>( - dx_data, dout_data, out_data, scaling, offset_data, columns_data, - num_rows); + BlockSparseSoftmaxBackward + <<>>(dx_data, dout_data, out_data, scaling, offset_data, + columns_data, num_rows); } else if (num_cols > 128 && num_cols <= 256) { - BlockSparseSoftmaxBackward<<>>( - dx_data, dout_data, out_data, scaling, offset_data, columns_data, - num_rows); + BlockSparseSoftmaxBackward + <<>>(dx_data, dout_data, out_data, scaling, offset_data, + columns_data, num_rows); } else if (num_cols > 256 && num_cols <= 512) { - BlockSparseSoftmaxBackward<<>>( - dx_data, dout_data, out_data, scaling, offset_data, columns_data, - num_rows); + BlockSparseSoftmaxBackward + <<>>(dx_data, dout_data, out_data, scaling, offset_data, + columns_data, num_rows); } else { PADDLE_THROW(platform::errors::InvalidArgument( "The head_dim of query in sparse_attention op should less or equal " diff --git a/paddle/fluid/operators/spectral_norm_op.h b/paddle/fluid/operators/spectral_norm_op.h index 
ee75c96c23a..765b9a4dbfa 100644 --- a/paddle/fluid/operators/spectral_norm_op.h +++ b/paddle/fluid/operators/spectral_norm_op.h @@ -11,6 +11,7 @@ #pragma once #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/fluid/operators/spectral_op.cc b/paddle/fluid/operators/spectral_op.cc index 0270f7e0576..cd2053b4ef0 100644 --- a/paddle/fluid/operators/spectral_op.cc +++ b/paddle/fluid/operators/spectral_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/spectral_op.h" + #include "paddle/fluid/operators/spectral_helper.h" namespace paddle { diff --git a/paddle/fluid/operators/spectral_op.h b/paddle/fluid/operators/spectral_op.h index 71b54caf5ee..4900e88fbe1 100644 --- a/paddle/fluid/operators/spectral_op.h +++ b/paddle/fluid/operators/spectral_op.h @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type_transform.h" diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc index dc20952903a..6c60c1a17e0 100644 --- a/paddle/fluid/operators/split_op.cc +++ b/paddle/fluid/operators/split_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/split_op.h" + #include #include "paddle/fluid/framework/infershape_utils.h" diff --git a/paddle/fluid/operators/split_op.h b/paddle/fluid/operators/split_op.h index cf44f341b2b..143e1d72868 100644 --- a/paddle/fluid/operators/split_op.h +++ b/paddle/fluid/operators/split_op.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/utils.h" #include "paddle/phi/kernels/split_kernel.h" diff --git a/paddle/fluid/operators/split_op_mlu.cc b/paddle/fluid/operators/split_op_mlu.cc index adc3ea14e32..0d438854673 100644 --- a/paddle/fluid/operators/split_op_mlu.cc +++ b/paddle/fluid/operators/split_op_mlu.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/split_op.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/operators/split_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/split_op_xpu.cc b/paddle/fluid/operators/split_op_xpu.cc index 8f02d8157b2..b24d0a70b05 100644 --- a/paddle/fluid/operators/split_op_xpu.cc +++ b/paddle/fluid/operators/split_op_xpu.cc @@ -12,9 +12,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/split_op.h" #include #include + +#include "paddle/fluid/operators/split_op.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" namespace paddle { diff --git a/paddle/fluid/operators/spp_op.cc b/paddle/fluid/operators/spp_op.cc index b1e0127f4cf..05230399b30 100644 --- a/paddle/fluid/operators/spp_op.cc +++ b/paddle/fluid/operators/spp_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/spp_op.h" + #include #include namespace paddle { diff --git a/paddle/fluid/operators/spp_op.h b/paddle/fluid/operators/spp_op.h index aa944cfcfbb..cd81ade1f9d 100644 --- a/paddle/fluid/operators/spp_op.h +++ b/paddle/fluid/operators/spp_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/operators/strided_memcpy.h" diff --git a/paddle/fluid/operators/stack_op.cc b/paddle/fluid/operators/stack_op.cc index 6fc80ca379f..6b0a0657afb 100644 --- a/paddle/fluid/operators/stack_op.cc +++ b/paddle/fluid/operators/stack_op.cc @@ -14,6 +14,7 @@ #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/stack_op_npu.cc b/paddle/fluid/operators/stack_op_npu.cc index 9d4ef0ffa20..c3e6e333e4c 100644 --- a/paddle/fluid/operators/stack_op_npu.cc +++ b/paddle/fluid/operators/stack_op_npu.cc @@ -30,8 +30,9 @@ class StackNPUKernel : public framework::OpKernel { if (axis < 0) axis += (x[0]->dims().size() + 1); int num = static_cast(x.size()); - PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument( - "number of input Tensor <= 0")); + PADDLE_ENFORCE_GT( + num, 0, + platform::errors::InvalidArgument("number of input Tensor <= 0")); auto stream = ctx.template device_context() @@ -59,8 +60,9 @@ class StackGradNPUKernel : public framework::OpKernel { if (axis < 0) axis += dy->dims().size(); int num = dy->dims()[axis]; - PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument( - "number of input Tensor <= 0")); + PADDLE_ENFORCE_GT( + num, 0, + platform::errors::InvalidArgument("number of input Tensor <= 0")); auto stream = ctx.template device_context() diff --git a/paddle/fluid/operators/stack_op_xpu.cc b/paddle/fluid/operators/stack_op_xpu.cc index baaa2b4884c..925fcc08615 100644 --- a/paddle/fluid/operators/stack_op_xpu.cc +++ b/paddle/fluid/operators/stack_op_xpu.cc @@ -15,6 +15,7 @@ #ifdef PADDLE_WITH_XPU #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/concat_op.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" diff --git a/paddle/fluid/operators/stft_op.cc b/paddle/fluid/operators/stft_op.cc index 7d4103ddf38..36e86741729 100644 --- a/paddle/fluid/operators/stft_op.cc +++ b/paddle/fluid/operators/stft_op.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/operators/stft_op.h" + #include "paddle/fluid/operators/spectral_helper.h" namespace paddle { diff --git a/paddle/fluid/operators/stft_op.h b/paddle/fluid/operators/stft_op.h index e75c59232bc..cc17ed9a43c 100644 --- a/paddle/fluid/operators/stft_op.h +++ b/paddle/fluid/operators/stft_op.h @@ -17,7 +17,6 @@ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" - #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/frame_op.h" #include "paddle/fluid/operators/spectral_op.h" diff --git a/paddle/fluid/operators/strided_slice_op_npu.cc b/paddle/fluid/operators/strided_slice_op_npu.cc index b142b8f099b..80952e9b556 100644 --- a/paddle/fluid/operators/strided_slice_op_npu.cc +++ b/paddle/fluid/operators/strided_slice_op_npu.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/kernels/funcs/strided_slice.h" #include "paddle/fluid/operators/slice_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/phi/kernels/funcs/strided_slice.h" namespace paddle { namespace operators { @@ -186,14 +186,16 @@ class StridedSliceNPUKernel : public framework::OpKernel { out->Resize(out_dims); out->mutable_data(place); - const auto& runner = NpuOpRunner( - "StridedSlice", {*in, starts_indices_tensor, ends_indices_tensor, - strides_indices_tensor}, - {*out}, {{"begin_mask", 0}, - {"end_mask", 0}, - {"ellipsis_mask", 0}, - {"new_axis_mask", 0}, - {"shrink_axis_mask", 0}}); + const auto& runner = + NpuOpRunner("StridedSlice", + {*in, starts_indices_tensor, ends_indices_tensor, + strides_indices_tensor}, + {*out}, + {{"begin_mask", 0}, + {"end_mask", 0}, + {"ellipsis_mask", 0}, + {"new_axis_mask", 0}, + {"shrink_axis_mask", 0}}); runner.Run(stream); if (need_reverse) { diff --git a/paddle/fluid/operators/string/faster_tokenizer_op.cc b/paddle/fluid/operators/string/faster_tokenizer_op.cc index 42047021b40..9e4089680f4 100644 --- a/paddle/fluid/operators/string/faster_tokenizer_op.cc +++ b/paddle/fluid/operators/string/faster_tokenizer_op.cc @@ -9,9 +9,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/string/faster_tokenizer_op.h" + #include #include +#include #include #include #include @@ -22,10 +25,7 @@ limitations under the License. 
*/ #include #include -#include - #include "paddle/fluid/framework/string_array.h" -#include "paddle/fluid/operators/string/faster_tokenizer_op.h" namespace paddle { namespace operators { @@ -38,12 +38,11 @@ using std::ifstream; using std::int64_t; using std::min; using std::runtime_error; -using std::unordered_map; -using std::unordered_set; using std::shared_ptr; using std::size_t; -using std::int64_t; using std::string; +using std::unordered_map; +using std::unordered_set; using std::vector; using std::wstring; diff --git a/paddle/fluid/operators/string/faster_tokenizer_op.h b/paddle/fluid/operators/string/faster_tokenizer_op.h index 446be3a1999..a6b8bfea59c 100644 --- a/paddle/fluid/operators/string/faster_tokenizer_op.h +++ b/paddle/fluid/operators/string/faster_tokenizer_op.h @@ -26,15 +26,14 @@ namespace operators { using std::endl; using std::int64_t; +using std::shared_ptr; using std::size_t; using std::string; -using std::shared_ptr; -using std::vector; using std::unordered_map; using std::unordered_set; using std::vector; -using std::wstring; using std::wcout; +using std::wstring; inline bool IsControl(const wchar_t& ch); inline bool IsChineseChar(const wchar_t& ch); diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 51040544fac..bc6997e36eb 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -36,9 +36,8 @@ class SumOp : public framework::OperatorWithKernel { OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "sum"); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "sum"); - if (ctx->IsRuntime() && - ctx->GetOutputsVarType("Out")[0] == - framework::proto::VarType::LOD_TENSOR_ARRAY) { + if (ctx->IsRuntime() && ctx->GetOutputsVarType("Out")[0] == + framework::proto::VarType::LOD_TENSOR_ARRAY) { return; // skip runtime infershape when is tensor array; } @@ -47,11 +46,12 @@ class SumOp : public framework::OperatorWithKernel { auto N = x_dims.size(); PADDLE_ENFORCE_GT( - N, 0, platform::errors::InvalidArgument( - "The input tensor X's dimensions of SumOp " - "should be larger than 0. But received X's dimensions %d, " - "X's shape = [%s].", - N, &x_dims)); + N, 0, + platform::errors::InvalidArgument( + "The input tensor X's dimensions of SumOp " + "should be larger than 0. But received X's dimensions %d, " + "X's shape = [%s].", + N, &x_dims)); if (N == 1) { VLOG(3) << "Warning: SumOp have only one input, may waste memory"; } @@ -115,8 +115,9 @@ class SumOp : public framework::OperatorWithKernel { framework::LibraryType library{framework::LibraryType::kPlain}; framework::DataLayout layout{framework::DataLayout::kAnyLayout}; - PADDLE_ENFORCE_GT(x_vars.size(), 0, platform::errors::InvalidArgument( - "Input[X] should not be empty")); + PADDLE_ENFORCE_GT( + x_vars.size(), 0, + platform::errors::InvalidArgument("Input[X] should not be empty")); PADDLE_ENFORCE_NOT_NULL( x_vars[0], platform::errors::NotFound( diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu index 8c6c083cde8..3bf249425c2 100644 --- a/paddle/fluid/operators/sum_op.cu +++ b/paddle/fluid/operators/sum_op.cu @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. 
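Note: the faster_tokenizer hunks above are pure using-declaration cleanup. The block is re-sorted alphabetically (clang-format's SortUsingDeclarations), and the duplicate "using std::int64_t;" disappears in the same hunk. What remains is a single ordered block, as in this compilable sketch:

#include <cstdint>
#include <string>
#include <vector>

using std::int64_t;
using std::string;
using std::vector;

int main() {
  vector<string> toks{"using", "decls", "sorted"};
  int64_t n = static_cast<int64_t>(toks.size());
  return n == 3 ? 0 : 1;
}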
*/ #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/sum_op.h" @@ -205,8 +206,8 @@ void SumToLoDTensor(const framework::ExecutionContext &context) { reinterpret_cast(tmp_sr_in_out_array->ptr()); ComputeKernelParameter(length); - SumSelectedRowsCUDAKernel<<>>( - sr_in_out_array_data, length, rows); + SumSelectedRowsCUDAKernel + <<>>(sr_in_out_array_data, length, rows); dst_write = true; } } diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h index 3c51b3398be..8c1e3a3dbf1 100644 --- a/paddle/fluid/operators/sum_op.h +++ b/paddle/fluid/operators/sum_op.h @@ -11,6 +11,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/sum_op_mlu.cc b/paddle/fluid/operators/sum_op_mlu.cc index 179c038e837..68e31c364b6 100644 --- a/paddle/fluid/operators/sum_op_mlu.cc +++ b/paddle/fluid/operators/sum_op_mlu.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/sum_op.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/operators/sum_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sum_op_xpu.cc b/paddle/fluid/operators/sum_op_xpu.cc index 5899591549e..a1cdaddd11b 100644 --- a/paddle/fluid/operators/sum_op_xpu.cc +++ b/paddle/fluid/operators/sum_op_xpu.cc @@ -11,8 +11,9 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/sum_op.h" #include + +#include "paddle/fluid/operators/sum_op.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" namespace paddle { diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h index 166f49999d5..468c658e5e6 100644 --- a/paddle/fluid/operators/svd_helper.h +++ b/paddle/fluid/operators/svd_helper.h @@ -15,9 +15,11 @@ #pragma once #include + #include #include #include + #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/diag_op.h" @@ -101,20 +103,22 @@ struct RealMulComplexFunctor { // y: complex number (c+0j) pretend to be a real number // out: complex number (ac+bcj) inline HOSTDEVICE T operator()(T x, T y) { - PADDLE_ENFORCE_LT(y.imag, 1e-6, platform::errors::InvalidArgument( - "The image part of y must to be 0" - "but got [%d]", - y.imag)); + PADDLE_ENFORCE_LT( + y.imag, 1e-6, + platform::errors::InvalidArgument("The image part of y must to be 0" + "but got [%d]", + y.imag)); return platform::complex>(x.real * y.real, x.imag * y.real); } }; static std::vector GetBroadcastShape(InTensors ins) { - PADDLE_ENFORCE_EQ(ins.size(), 2, platform::errors::InvalidArgument( - "GetBroadcastShape Receive 2 tensors" - "but got [%d]", - ins.size())); + PADDLE_ENFORCE_EQ( + ins.size(), 2, + platform::errors::InvalidArgument("GetBroadcastShape Receive 2 tensors" + "but got [%d]", + ins.size())); auto x_dim = ins[0]->dims(); auto y_dim = ins[1]->dims(); std::vector broadcast_shape = @@ -596,8 +600,9 @@ struct DeviceIndependenceTensorOperations { attrs["lower"] = lower; NameInTensorMap inputs({{"X", {&x}}}); int x_rank = x.dims().size(); - PADDLE_ENFORCE_GE(x_rank, 2, platform::errors::InvalidArgument( - "Rank must be at least 
2.")); + PADDLE_ENFORCE_GE( + x_rank, 2, + platform::errors::InvalidArgument("Rank must be at least 2.")); std::vector out_shape = phi::vectorize(x.dims()); return CreateOpRunAndReturnTensor("tril_triu", inputs, attrs, out_shape); } diff --git a/paddle/fluid/operators/svd_op.cc b/paddle/fluid/operators/svd_op.cc index 3ca7320114a..e68b013d2fb 100644 --- a/paddle/fluid/operators/svd_op.cc +++ b/paddle/fluid/operators/svd_op.cc @@ -13,10 +13,12 @@ // limitations under the License. #include "paddle/fluid/operators/svd_op.h" + #include #include #include #include + #include "paddle/phi/core/ddim.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" diff --git a/paddle/fluid/operators/svd_op.cu b/paddle/fluid/operators/svd_op.cu index e987589e83c..317ea7c5363 100644 --- a/paddle/fluid/operators/svd_op.cu +++ b/paddle/fluid/operators/svd_op.cu @@ -16,8 +16,10 @@ limitations under the License. */ // HIP not support cusolver #include + #include #include + #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/svd_op.h" #include "paddle/fluid/platform/dynload/cusolver.h" diff --git a/paddle/fluid/operators/svd_op.h b/paddle/fluid/operators/svd_op.h index 42a847206a3..1008a69e6de 100644 --- a/paddle/fluid/operators/svd_op.h +++ b/paddle/fluid/operators/svd_op.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/svd_helper.h" diff --git a/paddle/fluid/operators/sync_batch_norm_op.cu.h b/paddle/fluid/operators/sync_batch_norm_op.cu.h index 17c96544988..9818aa3651b 100644 --- a/paddle/fluid/operators/sync_batch_norm_op.cu.h +++ b/paddle/fluid/operators/sync_batch_norm_op.cu.h @@ -137,7 +137,7 @@ void SyncBatchNormFunctor(const framework::ExecutionContext &ctx, const float momentum, const bool is_test, const bool use_global_stats - ) { +) { const auto &x_dims = x->dims(); PADDLE_ENFORCE_GE(x_dims.size(), 2, platform::errors::InvalidArgument( @@ -178,13 +178,11 @@ void SyncBatchNormFunctor(const framework::ExecutionContext &ctx, const int threads = 256; int grid = std::min(C, (max_threads + threads - 1) / threads); if (layout == framework::DataLayout::kNCHW) { - KeLocalStats<<>>( - x_d, N, H * W * D, C, stats); + KeLocalStats + <<>>(x_d, N, H * W * D, C, stats); } else { - KeLocalStats<<>>( - x_d, N, H * W * D, C, stats); + KeLocalStats + <<>>(x_d, N, H * W * D, C, stats); } #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) @@ -221,13 +219,13 @@ void SyncBatchNormFunctor(const framework::ExecutionContext &ctx, int grid2 = (std::min(x_numel, max_threads) + block - 1) / block; if (layout == framework::DataLayout::kNCHW) { - KeNormAffine<<>>( - x_d, s_d, b_d, mean_data, var_data, epsilon, C, H * W * D, x_numel, - y_d); + KeNormAffine + <<>>(x_d, s_d, b_d, mean_data, var_data, + epsilon, C, H * W * D, x_numel, y_d); } else { - KeNormAffine<<>>( - x_d, s_d, b_d, mean_data, var_data, epsilon, C, H * W * D, x_numel, - y_d); + KeNormAffine + <<>>(x_d, s_d, b_d, mean_data, var_data, + epsilon, C, H * W * D, x_numel, y_d); } } @@ -436,30 +434,30 @@ void SyncBatchNormGradFunctor( if (is_inplace) { if (layout == framework::DataLayout::kNCHW) { - KeBNRestoreData< - T, framework::DataLayout::kNCHW><<>>( - px.mutable_data(ctx.GetPlace()), - scale->data>(), - bias->data>(), saved_mean, saved_inv_var, - epsilon, C, H * W * D, x_numel, x->data()); + KeBNRestoreData + <<>>(px.mutable_data(ctx.GetPlace()), + scale->data>(), + bias->data>(), + saved_mean, 
saved_inv_var, epsilon, C, + H * W * D, x_numel, x->data()); } else { - KeBNRestoreData< - T, framework::DataLayout::kNHWC><<>>( - px.mutable_data(ctx.GetPlace()), - scale->data>(), - bias->data>(), saved_mean, saved_inv_var, - epsilon, C, H * W * D, x_numel, x->data()); + KeBNRestoreData + <<>>(px.mutable_data(ctx.GetPlace()), + scale->data>(), + bias->data>(), + saved_mean, saved_inv_var, epsilon, C, + H * W * D, x_numel, x->data()); } } if (layout == framework::DataLayout::kNCHW) { - KeBackwardLocalStats< - T, threads, framework::DataLayout::kNCHW><<>>( - dy_d, x_d, saved_mean, N, fsize, C, stats); + KeBackwardLocalStats + <<>>(dy_d, x_d, saved_mean, N, fsize, C, + stats); } else { - KeBackwardLocalStats< - T, threads, framework::DataLayout::kNHWC><<>>( - dy_d, x_d, saved_mean, N, fsize, C, stats); + KeBackwardLocalStats + <<>>(dy_d, x_d, saved_mean, N, fsize, C, + stats); } #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) @@ -476,35 +474,33 @@ void SyncBatchNormGradFunctor( if (layout == framework::DataLayout::kNCHW) { if (d_scale && d_bias) { - KeBNBackwardScaleBias< - T, threads, - framework::DataLayout::kNCHW><<>>( - dy_d, x_d, saved_mean, saved_inv_var, epsilon, N, C, fsize, - d_scale->data>(), - d_bias->data>()); + KeBNBackwardScaleBias + <<>>(dy_d, x_d, saved_mean, saved_inv_var, + epsilon, N, C, fsize, + d_scale->data>(), + d_bias->data>()); } if (d_x) { - KeBNBackwardData< - T, framework::DataLayout::kNCHW><<>>( - dy_d, x_d, scale->data>(), saved_mean, - saved_inv_var, stats, stats + C, stats + 2 * C, epsilon, C, fsize, - x->numel(), d_x->data()); + KeBNBackwardData + <<>>( + dy_d, x_d, scale->data>(), saved_mean, + saved_inv_var, stats, stats + C, stats + 2 * C, epsilon, C, fsize, + x->numel(), d_x->data()); } } else { if (d_scale && d_bias) { - KeBNBackwardScaleBias< - T, threads, - framework::DataLayout::kNHWC><<>>( - dy_d, x_d, saved_mean, saved_inv_var, epsilon, N, C, fsize, - d_scale->data>(), - d_bias->data>()); + KeBNBackwardScaleBias + <<>>(dy_d, x_d, saved_mean, saved_inv_var, + epsilon, N, C, fsize, + d_scale->data>(), + d_bias->data>()); } if (d_x) { - KeBNBackwardData< - T, framework::DataLayout::kNHWC><<>>( - dy_d, x_d, scale->data>(), saved_mean, - saved_inv_var, stats, stats + C, stats + 2 * C, epsilon, C, fsize, - x->numel(), d_x->data()); + KeBNBackwardData + <<>>( + dy_d, x_d, scale->data>(), saved_mean, + saved_inv_var, stats, stats + C, stats + 2 * C, epsilon, C, fsize, + x->numel(), d_x->data()); } } } diff --git a/paddle/fluid/operators/sync_batch_norm_op_npu.cc b/paddle/fluid/operators/sync_batch_norm_op_npu.cc index b5632f4fe4a..604f8f97a3f 100644 --- a/paddle/fluid/operators/sync_batch_norm_op_npu.cc +++ b/paddle/fluid/operators/sync_batch_norm_op_npu.cc @@ -566,8 +566,9 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { paddle::framework::TensorToVector( device_count_tensor, ctx.device_context(), &device_count_vec); device_counts = device_count_vec[0]; - PADDLE_ENFORCE_GE(device_counts, 2, platform::errors::PreconditionNotMet( - "device_counts should >= 2.")); + PADDLE_ENFORCE_GE( + device_counts, 2, + platform::errors::PreconditionNotMet("device_counts should >= 2.")); } // cacl var_ref diff --git a/paddle/fluid/operators/tdm_child_op.cc b/paddle/fluid/operators/tdm_child_op.cc index a60fc537e32..a7a218972ec 100644 --- a/paddle/fluid/operators/tdm_child_op.cc +++ b/paddle/fluid/operators/tdm_child_op.cc @@ -13,7 +13,9 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/tdm_child_op.h" + #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/sampler.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/operators/tdm_child_op.h b/paddle/fluid/operators/tdm_child_op.h index e437975320c..c39d8260a8b 100644 --- a/paddle/fluid/operators/tdm_child_op.h +++ b/paddle/fluid/operators/tdm_child_op.h @@ -20,6 +20,7 @@ #include #include #include + #include "gflags/gflags.h" #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/tdm_sampler_op.cc b/paddle/fluid/operators/tdm_sampler_op.cc index 6aad72a0d9c..68d079e6793 100644 --- a/paddle/fluid/operators/tdm_sampler_op.cc +++ b/paddle/fluid/operators/tdm_sampler_op.cc @@ -13,7 +13,9 @@ limitations under the License. */ #include "paddle/fluid/operators/tdm_sampler_op.h" + #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/sampler.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/operators/tdm_sampler_op.h b/paddle/fluid/operators/tdm_sampler_op.h index bf752a9c8ad..c3ed90ae68e 100644 --- a/paddle/fluid/operators/tdm_sampler_op.h +++ b/paddle/fluid/operators/tdm_sampler_op.h @@ -20,6 +20,7 @@ #include #include #include + #include "gflags/gflags.h" #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/temporal_shift_op.cc b/paddle/fluid/operators/temporal_shift_op.cc index 3bdb9cb972f..12d0f288d97 100644 --- a/paddle/fluid/operators/temporal_shift_op.cc +++ b/paddle/fluid/operators/temporal_shift_op.cc @@ -10,12 +10,13 @@ limitations under the License. */ #include "paddle/fluid/operators/temporal_shift_op.h" + #include #include #include -#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/unary.h" diff --git a/paddle/fluid/operators/temporal_shift_op.cu b/paddle/fluid/operators/temporal_shift_op.cu index 1d7aeec142f..f8e642cdb89 100644 --- a/paddle/fluid/operators/temporal_shift_op.cu +++ b/paddle/fluid/operators/temporal_shift_op.cu @@ -179,13 +179,13 @@ class TemporalShiftOpCUDAKernel : public framework::OpKernel { grid = std::min(dev_ctx.GetSMCount() * blocks_per_sm, grid); if (data_layout == DataLayout::kNCHW) { - KeTemporalShiftFwNCHW< - T><<>>( - input_data, output_data, ntchw, tchw, chw, hw, t, c1, c2); + KeTemporalShiftFwNCHW + <<>>( + input_data, output_data, ntchw, tchw, chw, hw, t, c1, c2); } else { - KeTemporalShiftFwNHWC< - T><<>>( - input_data, output_data, ntchw, tchw, chw, t, c, c1, c2); + KeTemporalShiftFwNHWC + <<>>( + input_data, output_data, ntchw, tchw, chw, t, c, c1, c2); } } }; @@ -233,13 +233,15 @@ class TemporalShiftGradOpCUDAKernel : public framework::OpKernel { grid = std::min(dev_ctx.GetSMCount() * blocks_per_sm, grid); if (data_layout == DataLayout::kNCHW) { - KeTemporalShiftBwNCHW< - T><<>>( - output_grad_data, input_grad_data, ntchw, tchw, chw, hw, t, c1, c2); + KeTemporalShiftBwNCHW + <<>>( + output_grad_data, input_grad_data, ntchw, tchw, chw, hw, t, c1, + c2); } else { - KeTemporalShiftBwNHWC< - T><<>>( - output_grad_data, input_grad_data, ntchw, tchw, chw, t, c, c1, c2); + KeTemporalShiftBwNHWC + <<>>( + output_grad_data, input_grad_data, ntchw, tchw, chw, t, c, c1, + c2); } } }; diff --git 
a/paddle/fluid/operators/tensor_array_to_tensor_op.cc b/paddle/fluid/operators/tensor_array_to_tensor_op.cc index 95ae32fa2ea..41d1fc2356e 100644 --- a/paddle/fluid/operators/tensor_array_to_tensor_op.cc +++ b/paddle/fluid/operators/tensor_array_to_tensor_op.cc @@ -96,10 +96,11 @@ class LoDTensorArray2TensorOp : public framework::OperatorBase { *scope.FindVar(Output("OutIndex"))->GetMutable(); const size_t n = inx.size(); - PADDLE_ENFORCE_GT(n, 0, platform::errors::InvalidArgument( - "Input tensorarray size should > 0," - "but the received is %d", - n)); + PADDLE_ENFORCE_GT( + n, 0, + platform::errors::InvalidArgument("Input tensorarray size should > 0," + "but the received is %d", + n)); std::string base_name = Inputs("X")[0]; std::vector names; @@ -235,10 +236,11 @@ class LoDTensorArray2TensorGradOp : public framework::OperatorBase { auto &inx = scope.FindVar(Input("X"))->Get(); const size_t n = inx.size(); - PADDLE_ENFORCE_GT(n, 0, platform::errors::InvalidArgument( - "Input tensorarray size should > 0, " - "but the received is: %d. ", - n)); + PADDLE_ENFORCE_GT( + n, 0, + platform::errors::InvalidArgument("Input tensorarray size should > 0, " + "but the received is: %d. ", + n)); std::string base_name = Inputs("X")[0]; std::vector names; diff --git a/paddle/fluid/operators/tensor_formatter.cc b/paddle/fluid/operators/tensor_formatter.cc index ef46ee25156..8f02bc870e2 100644 --- a/paddle/fluid/operators/tensor_formatter.cc +++ b/paddle/fluid/operators/tensor_formatter.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/operators/tensor_formatter.h" #include + #include "paddle/fluid/framework/convert_utils.h" namespace paddle { diff --git a/paddle/fluid/operators/tensor_to_string.h b/paddle/fluid/operators/tensor_to_string.h index bd9e7f6219b..c1ca1dff9ff 100644 --- a/paddle/fluid/operators/tensor_to_string.h +++ b/paddle/fluid/operators/tensor_to_string.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/device_context.h" @@ -56,7 +57,7 @@ static std::vector ToVector(const framework::Tensor &src) { } template -static std::string FlattenToString(Args &&... 
args) { +static std::string FlattenToString(Args &&...args) { const auto &vec = ToVector(std::forward(args)...); return "[" + string::join_strings(vec, ',') + "]"; } diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 0a71875d893..1e5ce6fa3e8 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -52,9 +52,9 @@ namespace operators { using inference::Singleton; using inference::tensorrt::TensorRTEngine; -using inference::tensorrt::TRTInt8Calibrator; using inference::tensorrt::TRTCalibratorEngine; using inference::tensorrt::TRTCalibratorEngineManager; +using inference::tensorrt::TRTInt8Calibrator; static void RuntimeStaticShapeCheck(std::vector runtime_input_shape, std::vector model_input_shape) { @@ -111,10 +111,10 @@ static void RuntimeDynamicShapeCheck( // "TRT engine runtime input %s dims size(%d) inconsistent " // "with the dynamic shape size(%d)", // x, runtime_input_shape.size(), min_input_shape.size())); - auto is_input_shape_valid = [&]( - const std::vector &runtime_input_shape, - const std::vector &min_input_shape, - const std::vector &max_input_shape) -> bool { + auto is_input_shape_valid = + [&](const std::vector &runtime_input_shape, + const std::vector &min_input_shape, + const std::vector &max_input_shape) -> bool { for (size_t i = 0; i < runtime_input_shape.size(); i++) { if (runtime_input_shape[i] <= max_input_shape[i] && runtime_input_shape[i] >= min_input_shape[i]) { diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index 243ae757277..c4278cfeb58 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
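Note: the tensor_to_string hunk above is a single-token reflow: "Args &&... args" becomes "Args &&...args", the upgraded binary's spelling for a parameter-pack expansion, with the ellipsis attached to the introduced name. A compilable sketch of the same signature; the fold-expression body is invented so the sketch does something observable:

#include <string>
#include <utility>

template <typename... Args>
static std::string FlattenToString(Args &&...args) {
  std::string out;
  // Fold over the pack; the formatting point is the '...args' spelling above.
  ((out += std::forward<Args>(args)), ...);
  return out;
}

int main() {
  return FlattenToString(std::string("a"), std::string("b")) == "ab" ? 0 : 1;
}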
*/ #include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h" + #include + #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_desc.h" diff --git a/paddle/fluid/operators/tile_op_npu.cc b/paddle/fluid/operators/tile_op_npu.cc index cea6b458aec..ee2d38fea70 100644 --- a/paddle/fluid/operators/tile_op_npu.cc +++ b/paddle/fluid/operators/tile_op_npu.cc @@ -27,10 +27,11 @@ class TileNPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto rank = context.Input("X")->dims().size(); PADDLE_ENFORCE_GE( - rank, 1, platform::errors::InvalidArgument( - "The rank of the input 'x' for tile op must be a positive " - "integer, but the value received is %d.", - rank)); + rank, 1, + platform::errors::InvalidArgument( + "The rank of the input 'x' for tile op must be a positive " + "integer, but the value received is %d.", + rank)); PADDLE_ENFORCE_LE( rank, MAX_RANK_SUPPORTED, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/tile_op_xpu.cc b/paddle/fluid/operators/tile_op_xpu.cc index 598377587d6..a0ce4a2bebe 100644 --- a/paddle/fluid/operators/tile_op_xpu.cc +++ b/paddle/fluid/operators/tile_op_xpu.cc @@ -25,10 +25,11 @@ class TileXPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto rank = context.Input("X")->dims().size(); PADDLE_ENFORCE_GE( - rank, 1, platform::errors::InvalidArgument( - "The rank of the input 'x' for tile op must be a positive " - "integer, but the value received is %d.", - rank)); + rank, 1, + platform::errors::InvalidArgument( + "The rank of the input 'x' for tile op must be a positive " + "integer, but the value received is %d.", + rank)); PADDLE_ENFORCE_LE( rank, MAX_RANK_SUPPORTED, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h index 848ab1cb556..a7981c86c45 100644 --- a/paddle/fluid/operators/top_k_function_cuda.h +++ b/paddle/fluid/operators/top_k_function_cuda.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include #include #ifdef __NVCC__ diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc index cce5ad26317..d8fc129588a 100644 --- a/paddle/fluid/operators/top_k_op.cc +++ b/paddle/fluid/operators/top_k_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/top_k_op.h" + #include namespace paddle { @@ -39,8 +40,9 @@ class TopkOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GE(k, 1, platform::errors::InvalidArgument( "Attribute k must be >= 1, but got k is %d.", k)); - PADDLE_ENFORCE_GE(input_dims.size(), 1, platform::errors::InvalidArgument( - "input must have >= 1d shape")); + PADDLE_ENFORCE_GE( + input_dims.size(), 1, + platform::errors::InvalidArgument("input must have >= 1d shape")); if (ctx->IsRuntime()) { PADDLE_ENFORCE_GE( diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index 30a5a802a53..fc8f08ca480 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -96,8 +96,8 @@ class TopkOpCUDAKernel : public framework::OpKernel { int gridx = input_height < kMaxHeight ? 
input_height : kMaxHeight; switch (GetDesiredBlockDim(input_width)) { FIXED_BLOCK_DIM( - KeMatrixTopK<<>>( + KeMatrixTopK + <<>>( output_data, k, indices_data, input_data, input_width, input_width, static_cast(k), gridx, input_height)); default: @@ -133,8 +133,8 @@ class TopkOpGradCUDAKernel : public framework::OpKernel { int gridx = row < kMaxHeight ? row : kMaxHeight; switch (GetDesiredBlockDim(col)) { FIXED_BLOCK_DIM( - AssignGrad<<>>( + AssignGrad + <<>>( x_grad_data, indices_data, out_grad_data, row, col, k)); default: PADDLE_THROW( diff --git a/paddle/fluid/operators/top_k_op.h b/paddle/fluid/operators/top_k_op.h index aad2f096a53..9d933eb5c47 100644 --- a/paddle/fluid/operators/top_k_op.h +++ b/paddle/fluid/operators/top_k_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/top_k_op_mlu.cc b/paddle/fluid/operators/top_k_op_mlu.cc index 102902bdaaa..16b2ac9807e 100644 --- a/paddle/fluid/operators/top_k_op_mlu.cc +++ b/paddle/fluid/operators/top_k_op_mlu.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/top_k_op.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/operators/top_k_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/top_k_v2_op_npu.cc b/paddle/fluid/operators/top_k_v2_op_npu.cc index 04e4d88b008..051cb9611ba 100644 --- a/paddle/fluid/operators/top_k_v2_op_npu.cc +++ b/paddle/fluid/operators/top_k_v2_op_npu.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/operators/trace_op.cc b/paddle/fluid/operators/trace_op.cc index c6c0fa3c001..36ad2d74869 100644 --- a/paddle/fluid/operators/trace_op.cc +++ b/paddle/fluid/operators/trace_op.cc @@ -118,19 +118,16 @@ REGISTER_OPERATOR(trace_grad, ops::TraceGradOp, ops::TraceGradNoNeedBufferVarsInferer); /* ========================== register checkpoint ===========================*/ -REGISTER_OP_VERSION(trace) - .AddCheckpoint( - R"ROC(Upgrade trace add a new attribute [axis2])ROC", - paddle::framework::compatible::OpVersionDesc() - .NewAttr("axis1", - "The added attribute 'axis1' is not yet registered.", - std::vector{0.0f}) - .NewAttr("axis2", - "The added attribute 'axis2' is not yet registered.", - std::vector{1.0f}) - .DeleteAttr("dim1", - "The attribute 'dim1' is not recommend according to " - "the specification 2.0.") - .DeleteAttr("dim2", - "The attribute 'dim2' is not recommend according to " - "the specification 2.0.")); +REGISTER_OP_VERSION(trace).AddCheckpoint( + R"ROC(Upgrade trace add a new attribute [axis2])ROC", + paddle::framework::compatible::OpVersionDesc() + .NewAttr("axis1", "The added attribute 'axis1' is not yet registered.", + std::vector{0.0f}) + .NewAttr("axis2", "The added attribute 'axis2' is not yet registered.", + std::vector{1.0f}) + .DeleteAttr("dim1", + "The attribute 'dim1' is not recommend according to " + "the specification 2.0.") + .DeleteAttr("dim2", + "The attribute 'dim2' is not recommend according to " + "the specification 2.0.")); diff --git a/paddle/fluid/operators/transfer_layout_op.cc b/paddle/fluid/operators/transfer_layout_op.cc index f26bcdca4a7..3b55631900d 100644 --- a/paddle/fluid/operators/transfer_layout_op.cc +++ b/paddle/fluid/operators/transfer_layout_op.cc @@ -146,7 +146,7 @@ REGISTER_OPERATOR( REGISTER_OP_CPU_KERNEL_FUNCTOR(transfer_layout, float, ops::TransferLayoutKernel); REGISTER_OP_VERSION(transfer_layout) - .AddCheckpoint( - R"ROC(refine transfer_layout, add src_layout attribute)ROC", - paddle::framework::compatible::OpVersionDesc().NewAttr( - "src_layout", "(int, the layout of the input tensor", -1)); + .AddCheckpoint(R"ROC(refine transfer_layout, add src_layout attribute)ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "src_layout", "(int, the layout of the input tensor", + -1)); diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc index a45d32b34b9..4eceb69e8ce 100644 --- a/paddle/fluid/operators/transpose_op.cc +++ b/paddle/fluid/operators/transpose_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/transpose_op.h" + #include #include #include diff --git a/paddle/fluid/operators/transpose_op.cu.h b/paddle/fluid/operators/transpose_op.cu.h index a31ac28c991..40a967b11f7 100644 --- a/paddle/fluid/operators/transpose_op.cu.h +++ b/paddle/fluid/operators/transpose_op.cu.h @@ -96,12 +96,15 @@ __global__ void TilingSwapDim1And2(const T* __restrict__ input, Dim3 input_dims, int x = threadIdx.x; Dim3 output_dims = { - input_dims[0], input_dims[2], input_dims[1], + input_dims[0], + input_dims[2], + input_dims[1], }; // Align dim to Tiles Dim3 tile_aligned_input_dim = { - input_dims[0], (input_dims[1] + TileX - 1) / TileX, + input_dims[0], + (input_dims[1] + TileX - 1) / TileX, (input_dims[2] + TileY - 1) / TileY, }; @@ -111,7 +114,8 @@ __global__ void TilingSwapDim1And2(const T* __restrict__ input, Dim3 input_dims, // Compute real index align to tile:0, 32, 64... Index3 block_tile_index_in_input = { - input_block_tile_index[0], input_block_tile_index[1] * TileX, + input_block_tile_index[0], + input_block_tile_index[1] * TileX, input_block_tile_index[2] * TileY, }; @@ -165,12 +169,14 @@ __global__ void TilingSwapDim1And2(const T* __restrict__ input, Dim3 input_dims, // Store sm value back to out Index3 output_block_tile_index = { - input_block_tile_index[0], input_block_tile_index[2], + input_block_tile_index[0], + input_block_tile_index[2], input_block_tile_index[1], }; Index3 block_tile_index_in_output = { - output_block_tile_index[0], output_block_tile_index[1] * TileY, + output_block_tile_index[0], + output_block_tile_index[1] * TileY, output_block_tile_index[2] * TileX, }; @@ -265,15 +271,13 @@ void LaunchNarrowDims2TransposeKernel(const phi::GPUContext& d, int tile_size_i, T* output) { constexpr int NumThreads = tile_long; if (tile_size_i <= tile_long && tile_size_j <= tile_short) { - TilingSwapDim1And2< - T, NumThreads, tile_long, - tile_short><<>>( - input, input_dims, output); + TilingSwapDim1And2 + <<>>(input, input_dims, + output); } else { - TilingSwapDim1And2< - T, NumThreads, tile_short, - tile_long><<>>( - input, input_dims, output); + TilingSwapDim1And2 + <<>>(input, input_dims, + output); } } @@ -392,10 +396,10 @@ void SwapDim1And2InNarrow(const phi::GPUContext& d, const T* input, // data may not aligned to tile, so some threads wasted, we need // to find least wasted threads, which means we need to find tile // can split input properly, in another words: num_wasted_threads=0. - int num_wasted_threads = input_long_edge - - framework::CeilOrFloor( - input_long_edge, proposed_tile_long_edge) * - proposed_tile_long_edge; + int num_wasted_threads = + input_long_edge - framework::CeilOrFloor( + input_long_edge, proposed_tile_long_edge) * + proposed_tile_long_edge; int num_full_tiles = framework::CeilOrFloor( input_long_edge, proposed_tile_long_edge); @@ -499,10 +503,9 @@ void SendSwapDim1And2InTranspose(const phi::GPUContext& d, const T* input, int total_tiles_count = input_dims_aligned[0] * input_dims_aligned[1] * input_dims_aligned[2]; - TilingSwapDim1And2< - T, kNumThreads, kTileSize, - kTileSize><<>>( - input, input_dims, output); + TilingSwapDim1And2 + <<>>(input, input_dims, + output); } else if (narrow_tile) { // If input shape is like Rect, such as 2X100, use Narrow tile size. 
@@ -513,9 +516,9 @@ void SendSwapDim1And2InTranspose(const phi::GPUContext& d, const T* input, // If input shape is small, such as 8X8, just do simple copy int total_elements = input_dims[0] * input_dims[1] * input_dims[2]; auto config = phi::backends::gpu::GetGpuLaunchConfig1D(d, total_elements); - TransposeSimpleKernel<<< - config.block_per_grid.x, config.thread_per_block.x, 0, d.stream()>>>( - total_elements, input, input_dims, output); + TransposeSimpleKernel + <<>>( + total_elements, input, input_dims, output); } } @@ -543,9 +546,9 @@ struct SwapDim0And2InTranspose { size_t total_size = combined_dims[0] * combined_dims[1] * combined_dims[2]; auto config = phi::backends::gpu::GetGpuLaunchConfig1D(d, total_size); - TransposeSimpleKernel<<< - config.block_per_grid.x, config.thread_per_block.x, 0, d.stream()>>>( - total_size, in, input_dims, out); + TransposeSimpleKernel + <<>>( + total_size, in, input_dims, out); } }; diff --git a/paddle/fluid/operators/transpose_op.h b/paddle/fluid/operators/transpose_op.h index a9e4876cc82..891aa312f69 100644 --- a/paddle/fluid/operators/transpose_op.h +++ b/paddle/fluid/operators/transpose_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/transpose_op_mlu.cc b/paddle/fluid/operators/transpose_op_mlu.cc index 40cb22bab50..38f6114e48d 100644 --- a/paddle/fluid/operators/transpose_op_mlu.cc +++ b/paddle/fluid/operators/transpose_op_mlu.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/operators/transpose_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/transpose_op_xpu.cc b/paddle/fluid/operators/transpose_op_xpu.cc index 00a43c74d87..32b303238ab 100644 --- a/paddle/fluid/operators/transpose_op_xpu.cc +++ b/paddle/fluid/operators/transpose_op_xpu.cc @@ -13,10 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/transpose_op.h" #include #include #include + +#include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" namespace paddle { diff --git a/paddle/fluid/operators/tree_conv_op.h b/paddle/fluid/operators/tree_conv_op.h index afe5379dc3f..8c479076175 100644 --- a/paddle/fluid/operators/tree_conv_op.h +++ b/paddle/fluid/operators/tree_conv_op.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/tree2col.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/fluid/operators/tril_indices_op.cc b/paddle/fluid/operators/tril_indices_op.cc index be42f53dd23..63b5c1a2431 100644 --- a/paddle/fluid/operators/tril_indices_op.cc +++ b/paddle/fluid/operators/tril_indices_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/tril_triu_op.cc b/paddle/fluid/operators/tril_triu_op.cc index b941fa3d03a..8ca83ed8810 100644 --- a/paddle/fluid/operators/tril_triu_op.cc +++ b/paddle/fluid/operators/tril_triu_op.cc @@ -13,9 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" - #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/unary.h" diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cc b/paddle/fluid/operators/truncated_gaussian_random_op.cc index dc5a66dce16..21e2061e73b 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op.cc @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/truncated_gaussian_random_op.h" + #include #include #include @@ -19,7 +21,6 @@ limitations under the License. */ #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/truncated_gaussian_random_op.h" #include "paddle/phi/infermeta/nullary.h" namespace paddle { diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc index 261d9cee2d5..363d909d84d 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc @@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/truncated_gaussian_random_op.h" #include #include + #include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/operators/truncated_gaussian_random_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc index 803b61fbe81..45a4b6a3bab 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc @@ -14,11 +14,12 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/truncated_gaussian_random_op.h" #include #include + #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/truncated_gaussian_random_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/unbind_op.cc b/paddle/fluid/operators/unbind_op.cc index f2fc08308c6..739fc98f3f0 100644 --- a/paddle/fluid/operators/unbind_op.cc +++ b/paddle/fluid/operators/unbind_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/unbind_op.h" + #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/unary.h" diff --git a/paddle/fluid/operators/unbind_op.h b/paddle/fluid/operators/unbind_op.h index 6e35f262de4..8e6cd391578 100644 --- a/paddle/fluid/operators/unbind_op.h +++ b/paddle/fluid/operators/unbind_op.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index 3e27402c869..a988c684389 100644 --- a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -16,10 +16,12 @@ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #if defined(__NVCC__) || defined(__HIPCC__) #include + #include "paddle/fluid/framework/generator.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/distribution_helper.h" @@ -102,8 +104,9 @@ inline std::vector GetNewDataFromShapeTensorList( "Expected dtype of ShapeTensorList of %d-th must be int32, int64. " "But got " "unsupport dtype: %s.", - i, paddle::framework::DataTypeToString( - framework::TransToProtoVarType(tensor->dtype())))); + i, + paddle::framework::DataTypeToString( + framework::TransToProtoVarType(tensor->dtype())))); } } diff --git a/paddle/fluid/operators/uniform_random_op_mlu.cc b/paddle/fluid/operators/uniform_random_op_mlu.cc index 2c5f13f5a93..fdf1252eb0d 100644 --- a/paddle/fluid/operators/uniform_random_op_mlu.cc +++ b/paddle/fluid/operators/uniform_random_op_mlu.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/uniform_random_op.h" #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/operators/uniform_random_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/uniform_random_op_xpu.cc b/paddle/fluid/operators/uniform_random_op_xpu.cc index ae2adf83419..23d0f61c2bd 100644 --- a/paddle/fluid/operators/uniform_random_op_xpu.cc +++ b/paddle/fluid/operators/uniform_random_op_xpu.cc @@ -14,11 +14,12 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/uniform_random_op.h" #include + #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/uniform_random_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/unique_consecutive_op.cc b/paddle/fluid/operators/unique_consecutive_op.cc index 24ef3a85ee2..567f7bac34b 100644 --- a/paddle/fluid/operators/unique_consecutive_op.cc +++ b/paddle/fluid/operators/unique_consecutive_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/unique_consecutive_op.h" + #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/unique_consecutive_op.cu b/paddle/fluid/operators/unique_consecutive_op.cu index fbffb01ed19..9db14e82b25 100644 --- a/paddle/fluid/operators/unique_consecutive_op.cu +++ b/paddle/fluid/operators/unique_consecutive_op.cu @@ -18,8 +18,10 @@ limitations under the License. */ #include #include #include + #include #include + #include "paddle/fluid/framework/tensor_util.h" // TensorToVector() #include "paddle/fluid/operators/unique_consecutive_op.h" // TransComute() diff --git a/paddle/fluid/operators/unique_consecutive_op.h b/paddle/fluid/operators/unique_consecutive_op.h index b31c2aa67a5..4dc1871b5d1 100644 --- a/paddle/fluid/operators/unique_consecutive_op.h +++ b/paddle/fluid/operators/unique_consecutive_op.h @@ -20,6 +20,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/transpose_op.h" diff --git a/paddle/fluid/operators/unique_op.cc b/paddle/fluid/operators/unique_op.cc index 5c103e088b5..fbbd562c1b8 100644 --- a/paddle/fluid/operators/unique_op.cc +++ b/paddle/fluid/operators/unique_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/unique_op.h" + #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/fluid/operators/unique_op.h b/paddle/fluid/operators/unique_op.h index 01439d21824..d59e6590a88 100644 --- a/paddle/fluid/operators/unique_op.h +++ b/paddle/fluid/operators/unique_op.h @@ -20,6 +20,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/transpose_op.h" diff --git a/paddle/fluid/operators/unique_with_counts_op.h b/paddle/fluid/operators/unique_with_counts_op.h index af8bfe813a6..227fdef2224 100644 --- a/paddle/fluid/operators/unique_with_counts_op.h +++ b/paddle/fluid/operators/unique_with_counts_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/unique_op.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/unpool_op.cc b/paddle/fluid/operators/unpool_op.cc index 36e9d894541..b18c4e4de44 100644 --- a/paddle/fluid/operators/unpool_op.cc +++ b/paddle/fluid/operators/unpool_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/unpool_op.h" + #include #include #include diff --git a/paddle/fluid/operators/unpool_op.h b/paddle/fluid/operators/unpool_op.h index 35aeb4e0d61..062008f95ea 100644 --- a/paddle/fluid/operators/unpool_op.h +++ b/paddle/fluid/operators/unpool_op.h @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/unpooling.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc index 445e8cd468b..82edcd5a9fc 100644 --- a/paddle/fluid/operators/unsqueeze_op.cc +++ b/paddle/fluid/operators/unsqueeze_op.cc @@ -101,9 +101,10 @@ class UnsqueezeOp : public framework::OperatorWithKernel { for (int axis : unsqz_dims) { int cur = axis < 0 ? axis + cur_output_size + 1 : axis; // Vaildity Check: the axis bound - PADDLE_ENFORCE_GE(cur, 0, platform::errors::InvalidArgument( - "The insert dimension value should " - "not be less than 0")); + PADDLE_ENFORCE_GE( + cur, 0, + platform::errors::InvalidArgument("The insert dimension value should " + "not be less than 0")); PADDLE_ENFORCE_LE(cur, cur_output_size, platform::errors::InvalidArgument( "The insert dimension value shoud not be larger " diff --git a/paddle/fluid/operators/unsqueeze_op.h b/paddle/fluid/operators/unsqueeze_op.h index f6112fb59c1..86038aced38 100644 --- a/paddle/fluid/operators/unsqueeze_op.h +++ b/paddle/fluid/operators/unsqueeze_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/device_context.h" @@ -72,9 +73,10 @@ class UnsqueezeKernel : public framework::OpKernel { for (int axis : unsqz_dims) { int cur = axis < 0 ? axis + cur_output_size + 1 : axis; // Vaildity Check: the axis bound - PADDLE_ENFORCE_GE(cur, 0, platform::errors::InvalidArgument( - "The insert dimension value should " - "not be less than 0")); + PADDLE_ENFORCE_GE( + cur, 0, + platform::errors::InvalidArgument("The insert dimension value should " + "not be less than 0")); PADDLE_ENFORCE_LE(cur, cur_output_size, platform::errors::InvalidArgument( "The insert dimension value shoule not be larger " diff --git a/paddle/fluid/operators/unstack_op.cc b/paddle/fluid/operators/unstack_op.cc index 8c8684bf4b0..df2325f5dc5 100644 --- a/paddle/fluid/operators/unstack_op.cc +++ b/paddle/fluid/operators/unstack_op.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/for_range.h" diff --git a/paddle/fluid/operators/utils.h b/paddle/fluid/operators/utils.h index d84f7b165fd..009e883ccb6 100644 --- a/paddle/fluid/operators/utils.h +++ b/paddle/fluid/operators/utils.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include #include diff --git a/paddle/fluid/operators/var_conv_2d_op.cc b/paddle/fluid/operators/var_conv_2d_op.cc index 3dffa0be2e2..977cd99984c 100644 --- a/paddle/fluid/operators/var_conv_2d_op.cc +++ b/paddle/fluid/operators/var_conv_2d_op.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/var_conv_2d_op.h" + #include #include + #include "paddle/fluid/platform/dynload/mklml.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/platform/aligned_vector.h b/paddle/fluid/platform/aligned_vector.h index 6d48917ba1f..b42ae15405e 100644 --- a/paddle/fluid/platform/aligned_vector.h +++ b/paddle/fluid/platform/aligned_vector.h @@ -43,11 +43,11 @@ HOSTDEVICE inline void Store(const AlignedVector& vec, T* addr) { } /* -* Only the address of input data is the multiplier of 1,2,4, vectorized load -* with corresponding multiplier-value is possible. Moreover, the maximum length -* of vectorized load is 128 bits once. Hence, valid length of vectorized load -* shall be determined under both former constraints. -*/ + * Only the address of input data is the multiplier of 1,2,4, vectorized load + * with corresponding multiplier-value is possible. Moreover, the maximum length + * of vectorized load is 128 bits once. Hence, valid length of vectorized load + * shall be determined under both former constraints. + */ template int GetVectorizedSize(const T* pointer) { constexpr int max_load_bits = 128; @@ -58,11 +58,11 @@ int GetVectorizedSize(const T* pointer) { constexpr int vec2 = std::alignment_of>::value; // NOLINT if (address % vec8 == 0) { /* - * Currently, decide to deal with no more than 4 data once while adopting - * vectorization load/store, if performance test shows that dealing with - * 8 data once in vectorization load/store does get optimized, return code - * below can be changed into " return std::min(8, valid_vec_size); " . - */ + * Currently, decide to deal with no more than 4 data once while adopting + * vectorization load/store, if performance test shows that dealing with + * 8 data once in vectorization load/store does get optimized, return code + * below can be changed into " return std::min(8, valid_vec_size); " . + */ return std::min(4, valid_vec_size); } else if (address % vec4 == 0) { return std::min(4, valid_vec_size); diff --git a/paddle/fluid/platform/bfloat16_test.cc b/paddle/fluid/platform/bfloat16_test.cc index 794c1ff684c..f824716ab92 100644 --- a/paddle/fluid/platform/bfloat16_test.cc +++ b/paddle/fluid/platform/bfloat16_test.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/bfloat16.h" + #include "paddle/phi/kernels/funcs/eigen/extensions.h" #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h diff --git a/paddle/fluid/platform/bfloat16_test.cu b/paddle/fluid/platform/bfloat16_test.cu index 391b91487fa..c5f38cf94ee 100644 --- a/paddle/fluid/platform/bfloat16_test.cu +++ b/paddle/fluid/platform/bfloat16_test.cu @@ -17,7 +17,9 @@ limitations under the License. */ #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include #include + #include + #include "paddle/fluid/framework/lod_tensor.h" #if defined(PADDLE_CUDA_BF16) diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index d05de900e5e..8f0e4204772 100644 --- a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/platform/collective_helper.h" + #include #include "paddle/fluid/memory/allocation/allocator_facade.h" diff --git a/paddle/fluid/platform/complex_test.cc b/paddle/fluid/platform/complex_test.cc index c7ded758717..3547631064d 100644 --- a/paddle/fluid/platform/complex_test.cc +++ b/paddle/fluid/platform/complex_test.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/platform/complex.h" + #include + #include "paddle/phi/kernels/funcs/eigen/extensions.h" #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h diff --git a/paddle/fluid/platform/complex_test.cu b/paddle/fluid/platform/complex_test.cu index 08ec75878b8..b814bcde684 100644 --- a/paddle/fluid/platform/complex_test.cu +++ b/paddle/fluid/platform/complex_test.cu @@ -18,6 +18,7 @@ #include #include #include + #include #include diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index e4860444865..c32af3b37a4 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -31,6 +31,7 @@ limitations under the License. */ #endif // _WIN32 #include + #include "paddle/fluid/platform/flags.h" DECLARE_double(fraction_of_cpu_memory_to_use); diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc index c40a43dbfb8..4ef2a9709a5 100644 --- a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc +++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" + #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.h b/paddle/fluid/platform/cuda_graph_with_memory_pool.h index 81b68a5c678..b8831126be0 100644 --- a/paddle/fluid/platform/cuda_graph_with_memory_pool.h +++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.h @@ -24,10 +24,9 @@ namespace paddle { namespace platform { #ifdef PADDLE_WITH_CUDA -#define PD_RECORD_CUDA_GRAPH_RANDOM_KERNEL(__cond, __kernel_func, __grid, \ - __block, __sm_size, __stream, \ - __seed_inc, __seed_expr, \ - __offset_expr, ...) \ +#define PD_RECORD_CUDA_GRAPH_RANDOM_KERNEL( \ + __cond, __kernel_func, __grid, __block, __sm_size, __stream, __seed_inc, \ + __seed_expr, __offset_expr, ...) \ do { \ if (::paddle::platform::CUDAGraph::IsThisThreadCapturing() && (__cond)) { \ using __Helper = \ @@ -36,9 +35,9 @@ namespace platform { auto *dev_ctx = \ ::paddle::platform::DeviceContextPool::Instance().GetByPlace( \ ::paddle::platform::CUDAGraph::CapturingPlace()); \ - auto __set_seed_func = [=]( \ - ::paddle::platform::CUDAKernelParams *__params, \ - bool __check_only) -> bool { \ + auto __set_seed_func = \ + [=](::paddle::platform::CUDAKernelParams *__params, \ + bool __check_only) -> bool { \ if (__check_only) { \ return __params->func() == &__kernel_func && \ __Helper::Compare(*__params, __VA_ARGS__); \ @@ -56,12 +55,11 @@ namespace platform { __kernel_func<<<__grid, __block, __sm_size, __stream>>>(__VA_ARGS__); \ } while (0) #else -#define PD_RECORD_CUDA_GRAPH_RANDOM_KERNEL(__cond, __kernel_func, __grid, \ - __block, __sm_size, __stream, \ - __seed_inc, __seed_expr, \ - __offset_expr, ...) 
\ - do { \ - __kernel_func<<<__grid, __block, __sm_size, __stream>>>(__VA_ARGS__); \ +#define PD_RECORD_CUDA_GRAPH_RANDOM_KERNEL( \ + __cond, __kernel_func, __grid, __block, __sm_size, __stream, __seed_inc, \ + __seed_expr, __offset_expr, ...) \ + do { \ + __kernel_func<<<__grid, __block, __sm_size, __stream>>>(__VA_ARGS__); \ } while (0) #endif diff --git a/paddle/fluid/platform/denormal.cc b/paddle/fluid/platform/denormal.cc index 4af156d1577..4cfb0825443 100644 --- a/paddle/fluid/platform/denormal.cc +++ b/paddle/fluid/platform/denormal.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/platform/denormal.h" + #include #include diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc index ae6d90510f4..c5a515ce436 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h" + #include #include #include diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h index 083478ed7e6..b3704fc628a 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h @@ -20,10 +20,10 @@ #include #include #include + #include "cuda.h" // NOLINT #include "cuda_runtime.h" // NOLINT #include "paddle/fluid/platform/device/gpu/gpu_types.h" - #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h index a32db3a9921..7185d2356aa 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h @@ -68,7 +68,7 @@ namespace platform { * } * } * -*/ + */ #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x; \ diff --git a/paddle/fluid/platform/device/gpu/cuda/cudnn_helper_test.cc b/paddle/fluid/platform/device/gpu/cuda/cudnn_helper_test.cc index 851d0d18c60..86c72769eb5 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cudnn_helper_test.cc +++ b/paddle/fluid/platform/device/gpu/cuda/cudnn_helper_test.cc @@ -15,13 +15,13 @@ limitations under the License. 
*/ #define GLOG_NO_ABBREVIATED_SEVERITIES #define GOOGLE_GLOG_DLL_DECL -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" - #include +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" + TEST(CudnnHelper, ScopedTensorDescriptor) { - using paddle::platform::ScopedTensorDescriptor; using paddle::platform::DataLayout; + using paddle::platform::ScopedTensorDescriptor; ScopedTensorDescriptor tensor_desc; std::vector shape = {2, 4, 6, 6}; @@ -65,8 +65,8 @@ TEST(CudnnHelper, ScopedTensorDescriptor) { } TEST(CudnnHelper, ScopedFilterDescriptor) { - using paddle::platform::ScopedFilterDescriptor; using paddle::platform::DataLayout; + using paddle::platform::ScopedFilterDescriptor; ScopedFilterDescriptor filter_desc; std::vector shape = {2, 3, 3}; @@ -129,8 +129,8 @@ TEST(CudnnHelper, ScopedConvolutionDescriptor) { } TEST(CudnnHelper, ScopedPoolingDescriptor) { - using paddle::platform::ScopedPoolingDescriptor; using paddle::platform::PoolingMode; + using paddle::platform::ScopedPoolingDescriptor; ScopedPoolingDescriptor pool_desc; std::vector src_kernel = {2, 2, 5}; diff --git a/paddle/fluid/platform/device/gpu/cuda_helper_test.cu b/paddle/fluid/platform/device/gpu/cuda_helper_test.cu index ab8bb2cad8c..28c0e0ef9ac 100644 --- a/paddle/fluid/platform/device/gpu/cuda_helper_test.cu +++ b/paddle/fluid/platform/device/gpu/cuda_helper_test.cu @@ -13,6 +13,7 @@ // limitations under the License. #include + #include #include #ifdef _WIN32 @@ -22,13 +23,12 @@ #define PADDLE_CUDA_FP16 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_helper.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/device/gpu/gpu_helper.h" - -using paddle::platform::PADDLE_CUDA_NUM_THREADS; using paddle::platform::float16; +using paddle::platform::PADDLE_CUDA_NUM_THREADS; template __global__ void AddKernel(const T* data_a, T* data_b, size_t num) { diff --git a/paddle/fluid/platform/device/gpu/cudnn_desc_test.cc b/paddle/fluid/platform/device/gpu/cudnn_desc_test.cc index 8ea30027e8a..2e58e71cc2c 100644 --- a/paddle/fluid/platform/device/gpu/cudnn_desc_test.cc +++ b/paddle/fluid/platform/device/gpu/cudnn_desc_test.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" - #include +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" + namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index 8c04e935134..6b302d2449d 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -19,6 +19,7 @@ limitations under the License. */ #include #include #include + #include "gflags/gflags.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/cuda_device_guard.h" @@ -100,8 +101,9 @@ static size_t GpuAllocSize(bool realloc) { size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb : FLAGS_initial_gpu_memory_in_mb; size_t alloc_bytes = - (flag_mb > 0ul ? flag_mb << 20 : available_to_alloc * - FLAGS_fraction_of_gpu_memory_to_use); + (flag_mb > 0ul + ? 
flag_mb << 20 + : available_to_alloc * FLAGS_fraction_of_gpu_memory_to_use); PADDLE_ENFORCE_GE( available_to_alloc, alloc_bytes, platform::errors::ResourceExhausted("Not enough available GPU memory.")); diff --git a/paddle/fluid/platform/device/gpu/gpu_info.h b/paddle/fluid/platform/device/gpu/gpu_info.h index 94b47cca948..3a97797c982 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.h +++ b/paddle/fluid/platform/device/gpu/gpu_info.h @@ -14,6 +14,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include + #include #include #include diff --git a/paddle/fluid/platform/device/gpu/gpu_launch_config.h b/paddle/fluid/platform/device/gpu/gpu_launch_config.h index 80d60ca95bf..5cacdfcb12f 100644 --- a/paddle/fluid/platform/device/gpu/gpu_launch_config.h +++ b/paddle/fluid/platform/device/gpu/gpu_launch_config.h @@ -25,9 +25,11 @@ #endif #include + #include #include #include + #include "paddle/fluid/platform/device_context.h" #ifdef __HIPCC__ @@ -93,9 +95,9 @@ struct GpuLaunchConfig { }; /* According to NVIDIA, if number of threads per block is 64/128/256/512, - * cuda performs better. And number of blocks should be greater (at least - * 2x~4x) than number of SMs. Hence, SM count is took into account within - * this function to determine the right number of threads per block. */ + * cuda performs better. And number of blocks should be greater (at least + * 2x~4x) than number of SMs. Hence, SM count is took into account within + * this function to determine the right number of threads per block. */ inline GpuLaunchConfig GetGpuLaunchConfig1D( const platform::CUDADeviceContext& context, int64_t numel, int vec_size = 1) { @@ -143,14 +145,16 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D( inline GpuLaunchConfig GetGpuLaunchConfig2D( const platform::CUDADeviceContext& context, int x_dim, int y_dim) { - PADDLE_ENFORCE_GT(x_dim, 0, platform::errors::InvalidArgument( - "x dim number should greater than 0," - " but received value is: %d", - x_dim)); - PADDLE_ENFORCE_GT(y_dim, 0, platform::errors::InvalidArgument( - "y dim number should greater than 0," - " but received value is: %d", - y_dim)); + PADDLE_ENFORCE_GT( + x_dim, 0, + platform::errors::InvalidArgument("x dim number should greater than 0," + " but received value is: %d", + x_dim)); + PADDLE_ENFORCE_GT( + y_dim, 0, + platform::errors::InvalidArgument("y dim number should greater than 0," + " but received value is: %d", + y_dim)); const int kThreadsPerBlock = 256; int block_cols = (std::min)(x_dim, kThreadsPerBlock); diff --git a/paddle/fluid/platform/device/gpu/gpu_primitives.h b/paddle/fluid/platform/device/gpu/gpu_primitives.h index 803674779e7..a0e9d459721 100644 --- a/paddle/fluid/platform/device/gpu/gpu_primitives.h +++ b/paddle/fluid/platform/device/gpu/gpu_primitives.h @@ -20,6 +20,7 @@ limitations under the License. 
*/ #include #endif #include + #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" diff --git a/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc b/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc index 2c55eb972b7..56fdb0da340 100644 --- a/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc +++ b/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc @@ -14,6 +14,7 @@ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" + #include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { diff --git a/paddle/fluid/platform/device/gpu/gpu_types.h b/paddle/fluid/platform/device/gpu/gpu_types.h index d0b48eca502..2cadd55d2dc 100644 --- a/paddle/fluid/platform/device/gpu/gpu_types.h +++ b/paddle/fluid/platform/device/gpu/gpu_types.h @@ -19,11 +19,13 @@ #ifdef PADDLE_WITH_HIP #include + #include "paddle/fluid/platform/dynload/miopen.h" #include "paddle/fluid/platform/dynload/rocblas.h" #else #include + #include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cublasLt.h" #include "paddle/fluid/platform/dynload/cudnn.h" diff --git a/paddle/fluid/platform/device/gpu/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h index d0cb9c953a5..b9e612b98de 100644 --- a/paddle/fluid/platform/device/gpu/nccl_helper.h +++ b/paddle/fluid/platform/device/gpu/nccl_helper.h @@ -16,6 +16,7 @@ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include + #include #include #include // NOLINT @@ -264,7 +265,7 @@ class NCCLCommunicator { *allreduce ophandle and sync_batch_norm_op use ncclallreduce parallelly. So *create a new nccl comm for sync_batch_norm_op. And these codes should be *polished with a unified nccl management. - */ + */ NCCLContextMap *GetSyncBatchNormCtx( framework::Scope *scope, const std::vector &places) { auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); diff --git a/paddle/fluid/platform/device/gpu/rocm/miopen_helper_test.cc b/paddle/fluid/platform/device/gpu/rocm/miopen_helper_test.cc index 13cf52dc2c6..e99fc7f37a8 100644 --- a/paddle/fluid/platform/device/gpu/rocm/miopen_helper_test.cc +++ b/paddle/fluid/platform/device/gpu/rocm/miopen_helper_test.cc @@ -15,13 +15,13 @@ limitations under the License. 
*/ #define GLOG_NO_ABBREVIATED_SEVERITIES #define GOOGLE_GLOG_DLL_DECL -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" - #include +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" + TEST(MIOpenHelper, ScopedTensorDescriptor) { - using paddle::platform::ScopedTensorDescriptor; using paddle::platform::DataLayout; + using paddle::platform::ScopedTensorDescriptor; ScopedTensorDescriptor tensor_desc; std::vector shape = {2, 4, 6, 6}; diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h b/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h index a0f3fb0f73b..c0f6f173a79 100644 --- a/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h +++ b/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h @@ -65,7 +65,7 @@ namespace platform { * } * } * -*/ + */ #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ int64_t __index__ = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; \ diff --git a/paddle/fluid/platform/device/ipu/ipu_device.cc b/paddle/fluid/platform/device/ipu/ipu_device.cc index 2d0381cb8b3..f6de526c900 100644 --- a/paddle/fluid/platform/device/ipu/ipu_device.cc +++ b/paddle/fluid/platform/device/ipu/ipu_device.cc @@ -45,9 +45,10 @@ int GetNumDevices() { } int num_devices = popart::DeviceManager::createDeviceManager().enumerateDevices().size(); - PADDLE_ENFORCE_GT(num_devices, 0, platform::errors::Unavailable( - "Do not found any IPU devices, please " - "make sure Poplar sdk is enabled")); + PADDLE_ENFORCE_GT( + num_devices, 0, + platform::errors::Unavailable("Do not found any IPU devices, please " + "make sure Poplar sdk is enabled")); return num_devices; } diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.cc b/paddle/fluid/platform/device/ipu/ipu_executor.cc index d490334ee33..30c9bc2094a 100644 --- a/paddle/fluid/platform/device/ipu/ipu_executor.cc +++ b/paddle/fluid/platform/device/ipu/ipu_executor.cc @@ -243,7 +243,8 @@ void Executor::AcquireDevice() { VLOG(10) << "Create IPU model device..."; std::map deviceOpts{ { - "numIPUs", std::to_string(ipu_strategy_->num_ipus), + "numIPUs", + std::to_string(ipu_strategy_->num_ipus), }, {"ipuVersion", "ipu2"}, }; @@ -254,7 +255,8 @@ void Executor::AcquireDevice() { VLOG(10) << "Create offline device..."; std::map deviceOpts{ { - "numIPUs", std::to_string(ipu_strategy_->num_ipus), + "numIPUs", + std::to_string(ipu_strategy_->num_ipus), }, {"ipuVersion", "ipu2"}, }; diff --git a/paddle/fluid/platform/device/ipu/ipu_info.h b/paddle/fluid/platform/device/ipu/ipu_info.h index fe7076e0b50..06ef070ed65 100644 --- a/paddle/fluid/platform/device/ipu/ipu_info.h +++ b/paddle/fluid/platform/device/ipu/ipu_info.h @@ -13,6 +13,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_IPU #include #include + #include "glog/logging.h" namespace paddle { diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.h b/paddle/fluid/platform/device/ipu/ipu_strategy.h index da08c76fb90..0e17a485afb 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.h +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.h @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include #include #include + #include "paddle/fluid/platform/device/ipu/ipu_utils.h" #include "paddle/fluid/platform/enforce.h" @@ -143,10 +144,11 @@ class IpuStrategy { std::map> &options, // NOLINT const std::string &type_str) { auto it = options.find(key); - PADDLE_ENFORCE_NE(it, options.end(), platform::errors::InvalidArgument( - "Cannot find option: %s, type: %s " - "when setting IpuStrategy options", - key, type_str)); + PADDLE_ENFORCE_NE( + it, options.end(), + platform::errors::InvalidArgument("Cannot find option: %s, type: %s " + "when setting IpuStrategy options", + key, type_str)); it->second(value); } diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc index 254e5665674..1d5fe8c329f 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc @@ -57,14 +57,14 @@ Node *gelu_handler(Graph *graph, Node *node) { {{"value", std::vector{1.4142135623730951}}, {"dims", std::vector{1}}, {"dtype", GetOutputVarDType(node)}}); - auto zero_point_five = - CreateConst(graph, node, {}, {}, {{"value", std::vector{0.5}}, - {"dims", std::vector{1}}, - {"dtype", GetOutputVarDType(node)}}); - auto one = - CreateConst(graph, node, {}, {}, {{"value", std::vector{1}}, - {"dims", std::vector{1}}, - {"dtype", GetOutputVarDType(node)}}); + auto zero_point_five = CreateConst(graph, node, {}, {}, + {{"value", std::vector{0.5}}, + {"dims", std::vector{1}}, + {"dtype", GetOutputVarDType(node)}}); + auto one = CreateConst(graph, node, {}, {}, + {{"value", std::vector{1}}, + {"dims", std::vector{1}}, + {"dtype", GetOutputVarDType(node)}}); auto div = CreateBaseOp(graph, node, "popart_div", {GetInputVarNode("X", node), sqrt2->outputs[0]}, {}, {}); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc index af72f84c9d7..9b91abc4a67 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc @@ -44,9 +44,10 @@ Node *pow_handler(Graph *graph, Node *node) { MakeConstAttrMapFromValue(value_, {1}, GetOutputVarDType(node)); auto new_node_const = CreateConst(graph, node, {}, {}, attrs); - return CreateBaseOp(graph, node, "popart_pow", {GetInputVarNode("X", node), - new_node_const->outputs[0]}, - node->outputs); + return CreateBaseOp( + graph, node, "popart_pow", + {GetInputVarNode("X", node), new_node_const->outputs[0]}, + node->outputs); } } @@ -380,10 +381,10 @@ Node *cumsum_handler(Graph *graph, Node *node) { auto reverse = BOOST_GET_CONST(bool, op->GetAttr("reverse")); int64_t popart_reverse = 1 ? 
reverse : 0; auto axis = BOOST_GET_CONST(int, op->GetAttr("axis")); - auto axis_node = - CreateConst(graph, node, {}, {}, {{"value", std::vector{axis}}, - {"dims", std::vector{1}}, - {"dtype", ONNXDataType::INT64}}); + auto axis_node = CreateConst(graph, node, {}, {}, + {{"value", std::vector{axis}}, + {"dims", std::vector{1}}, + {"dtype", ONNXDataType::INT64}}); return CreateBaseOp( graph, node, "popart_cumsum", {GetInputVarNode("X", node), axis_node->outputs[0]}, diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc index 2e9913f58ef..bce6bac88e2 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc @@ -35,20 +35,20 @@ Node *conv2d_handler(Graph *graph, Node *node) { auto stride_ = BOOST_GET_CONST(std::vector, op->GetAttr("strides")); auto stride = std::vector{stride_.begin(), stride_.end()}; if (!op->Input("Bias").empty()) { - return CreateConv( - graph, node, - { - GetInputVarNode("Input", node), GetInputVarNode("Filter", node), - GetInputVarNode("Bias", node), - }, - node->outputs, dilations, group_, {}, pads, stride); + return CreateConv(graph, node, + { + GetInputVarNode("Input", node), + GetInputVarNode("Filter", node), + GetInputVarNode("Bias", node), + }, + node->outputs, dilations, group_, {}, pads, stride); } else { - return CreateConv( - graph, node, - { - GetInputVarNode("Input", node), GetInputVarNode("Filter", node), - }, - node->outputs, dilations, group_, {}, pads, stride); + return CreateConv(graph, node, + { + GetInputVarNode("Input", node), + GetInputVarNode("Filter", node), + }, + node->outputs, dilations, group_, {}, pads, stride); } } @@ -148,15 +148,16 @@ Node *pool2d_handler(Graph *graph, Node *node) { auto dilations = std::vector{}; int64_t storage_order = 0; return CreateBaseOp(graph, node, "popart_maxpool", node->inputs, - node->outputs, { - {"num_outputs", num_outputs}, - {"kernel_shape", kernel_shape}, - {"ceil_mode", ceil_mode}, - {"dilations", dilations}, - {"pads", pads}, - {"storage_order", storage_order}, - {"strides", strides}, - }); + node->outputs, + { + {"num_outputs", num_outputs}, + {"kernel_shape", kernel_shape}, + {"ceil_mode", ceil_mode}, + {"dilations", dilations}, + {"pads", pads}, + {"storage_order", storage_order}, + {"strides", strides}, + }); } else if (pooling_type == "avg") { int64_t count_include_pad = 0; return CreateBaseOp(graph, node, "popart_averagepool", node->inputs, diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc index 0525bb66f16..b51d923bfcf 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc @@ -173,8 +173,9 @@ Node *CreateConv(Graph *graph, Node *node, const std::vector &inputs, Node *CreateSoftmaxOpset11(Graph *graph, Node *node, const std::vector &inputs, const std::vector &outputs, int64_t axis) { - PADDLE_ENFORCE_EQ(inputs.size(), 1, platform::errors::InvalidArgument( - "Softmax op only support one input")); + PADDLE_ENFORCE_EQ( + inputs.size(), 1, + platform::errors::InvalidArgument("Softmax op only support one input")); auto x_shape = inputs[0]->Var()->GetShape(); int x_rank = x_shape.size(); if (axis < 0) { diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/search_ops.cc 
b/paddle/fluid/platform/device/ipu/popart_canonicalization/search_ops.cc index aec89a1cf0d..77ce2f31669 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/search_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/search_ops.cc @@ -69,10 +69,10 @@ Node *topk_handler(Graph *graph, Node *node) { var_k = GetInputVarNode("K", node); } else { auto k = BOOST_GET_CONST(int, op->GetAttr("k")); - auto *op_k = - CreateConst(graph, node, {}, {}, {{"value", std::vector{k}}, - {"dims", std::vector{1}}, - {"dtype", ONNXDataType::INT64}}); + auto *op_k = CreateConst(graph, node, {}, {}, + {{"value", std::vector{k}}, + {"dims", std::vector{1}}, + {"dtype", ONNXDataType::INT64}}); var_k = op_k->outputs[0]; } diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc index 00926ee7a0b..bf32744d5a5 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc @@ -61,7 +61,9 @@ Node *fill_constant_handler(Graph *graph, Node *node) { } return CreateConst(graph, node, node->inputs, node->outputs, AttributeMap{ - {"value", value}, {"dims", dims}, {"dtype", dtype}, + {"value", value}, + {"dims", dims}, + {"dtype", dtype}, }); } @@ -76,13 +78,14 @@ Node *gaussian_random_handler(Graph *graph, Node *node) { auto seed_ = BOOST_GET_CONST(int, op->GetAttr("seed")); auto seed = static_cast(seed_); return CreateBaseOp(graph, node, "popart_randomnormal", node->inputs, - node->outputs, { - {"shape", shape}, - {"dtype", dtype}, - {"mean", mean}, - {"scale", scale}, - {"seed", seed}, - }); + node->outputs, + { + {"shape", shape}, + {"dtype", dtype}, + {"mean", mean}, + {"scale", scale}, + {"seed", seed}, + }); } Node *uniform_random_handler(Graph *graph, Node *node) { @@ -96,13 +99,14 @@ Node *uniform_random_handler(Graph *graph, Node *node) { auto seed_ = BOOST_GET_CONST(int, op->GetAttr("seed")); auto seed = static_cast(seed_); return CreateBaseOp(graph, node, "popart_randomuniform", node->inputs, - node->outputs, { - {"shape", shape}, - {"dtype", dtype}, - {"high", high}, - {"low", low}, - {"seed", seed}, - }); + node->outputs, + { + {"shape", shape}, + {"dtype", dtype}, + {"high", high}, + {"low", low}, + {"seed", seed}, + }); } Node *transpose_handler(Graph *graph, Node *node) { @@ -204,32 +208,33 @@ Node *lookup_table_op_handler(Graph *graph, Node *node, if (padding_idx_ >= 0 && padding_idx_ < table_size_) { std::vector const_value_(emb_size_, 0); std::vector const_shape_{1, emb_size_}; - auto concat_const = - CreateConst(graph, node, {}, {}, {{"value", const_value_}, - {"dims", const_shape_}, - {"dtype", GetOutputVarDType(node)}}); - auto axes = - CreateConst(graph, node, {}, {}, {{"value", std::vector{0}}, - {"dims", std::vector{1}}, - {"dtype", ONNXDataType::INT64}}); - auto step = - CreateConst(graph, node, {}, {}, {{"value", std::vector{1}}, - {"dims", std::vector{1}}, - {"dtype", ONNXDataType::INT64}}); - - auto left_start = - CreateConst(graph, node, {}, {}, {{"value", std::vector{0}}, - {"dims", std::vector{1}}, - {"dtype", ONNXDataType::INT64}}); + auto concat_const = CreateConst(graph, node, {}, {}, + {{"value", const_value_}, + {"dims", const_shape_}, + {"dtype", GetOutputVarDType(node)}}); + auto axes = CreateConst(graph, node, {}, {}, + {{"value", std::vector{0}}, + {"dims", std::vector{1}}, + {"dtype", ONNXDataType::INT64}}); + auto step = CreateConst(graph, node, {}, {}, + 
{{"value", std::vector{1}}, + {"dims", std::vector{1}}, + {"dtype", ONNXDataType::INT64}}); + + auto left_start = CreateConst(graph, node, {}, {}, + {{"value", std::vector{0}}, + {"dims", std::vector{1}}, + {"dtype", ONNXDataType::INT64}}); auto left_end = CreateConst(graph, node, {}, {}, {{"value", std::vector{padding_idx_}}, {"dims", std::vector{1}}, {"dtype", ONNXDataType::INT64}}); - auto right_start = CreateConst( - graph, node, {}, {}, {{"value", std::vector{padding_idx_ + 1}}, - {"dims", std::vector{1}}, - {"dtype", ONNXDataType::INT64}}); + auto right_start = + CreateConst(graph, node, {}, {}, + {{"value", std::vector{padding_idx_ + 1}}, + {"dims", std::vector{1}}, + {"dtype", ONNXDataType::INT64}}); auto right_end = CreateConst(graph, node, {}, {}, {{"value", std::vector{table_size_}}, {"dims", std::vector{1}}, @@ -471,7 +476,9 @@ Node *assign_value_handler(Graph *graph, Node *node) { } return CreateConst(graph, node, node->inputs, node->outputs, AttributeMap{ - {"value", values}, {"dims", dims}, {"dtype", dtype}, + {"value", values}, + {"dims", dims}, + {"dtype", dtype}, }); } @@ -529,10 +536,10 @@ Node *one_hot_handler(Graph *graph, Node *node) { {{"value", std::vector{depth}}, {"dims", std::vector{1}}, {"dtype", ONNXDataType::INT64}}); - auto value_tensor = - CreateConst(graph, node, {}, {}, {{"value", std::vector{0, 1}}, - {"dims", std::vector{2}}, - {"dtype", ONNXDataType::FLOAT}}); + auto value_tensor = CreateConst(graph, node, {}, {}, + {{"value", std::vector{0, 1}}, + {"dims", std::vector{2}}, + {"dtype", ONNXDataType::FLOAT}}); return CreateBaseOp(graph, node, "popart_onehot", {GetInputVarNode("X", node), depth_tensor->outputs[0], value_tensor->outputs[0]}, @@ -550,21 +557,21 @@ Node *one_hot_v2_handler(Graph *graph, Node *node) { PADDLE_THROW(platform::errors::Unimplemented( "Do not support allow_out_of_range=True")); } else { - auto depth_tensor = - CreateConst(graph, node, {}, {}, {{"value", std::vector{depth}}, - {"dims", std::vector{1}}, - {"dtype", ONNXDataType::INT32}}); + auto depth_tensor = CreateConst(graph, node, {}, {}, + {{"value", std::vector{depth}}, + {"dims", std::vector{1}}, + {"dtype", ONNXDataType::INT32}}); Node *value_tensor = nullptr; if (GetOutputVarNode("Out", node)->Var()->GetDataType() == VarType::FP16) { - value_tensor = - CreateConst(graph, node, {}, {}, {{"value", std::vector{0, 1}}, - {"dims", std::vector{2}}, - {"dtype", ONNXDataType::FLOAT16}}); + value_tensor = CreateConst(graph, node, {}, {}, + {{"value", std::vector{0, 1}}, + {"dims", std::vector{2}}, + {"dtype", ONNXDataType::FLOAT16}}); } else { - value_tensor = - CreateConst(graph, node, {}, {}, {{"value", std::vector{0, 1}}, - {"dims", std::vector{2}}, - {"dtype", ONNXDataType::FLOAT}}); + value_tensor = CreateConst(graph, node, {}, {}, + {{"value", std::vector{0, 1}}, + {"dims", std::vector{2}}, + {"dtype", ONNXDataType::FLOAT}}); } return CreateBaseOp(graph, node, "popart_onehot", diff --git a/paddle/fluid/platform/device/mlu/cncl_helper.h b/paddle/fluid/platform/device/mlu/cncl_helper.h index 2f9bed01426..634e420d5ce 100644 --- a/paddle/fluid/platform/device/mlu/cncl_helper.h +++ b/paddle/fluid/platform/device/mlu/cncl_helper.h @@ -16,8 +16,8 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_CNCL #include - #include + #include #include #include // NOLINT diff --git a/paddle/fluid/platform/device/mlu/device_context.h b/paddle/fluid/platform/device/mlu/device_context.h index 120916b4f5c..d607b1e12f5 100644 --- a/paddle/fluid/platform/device/mlu/device_context.h +++ b/paddle/fluid/platform/device/mlu/device_context.h @@ -12,6 +12,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_MLU #include + #include "paddle/fluid/platform/device/mlu/enforce.h" #include "paddle/fluid/platform/device/mlu/mlu_stream.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/platform/device/mlu/device_context_test.cc b/paddle/fluid/platform/device/mlu/device_context_test.cc index 5caaa9dec1e..41f79c7092e 100644 --- a/paddle/fluid/platform/device/mlu/device_context_test.cc +++ b/paddle/fluid/platform/device/mlu/device_context_test.cc @@ -20,9 +20,9 @@ limitations under the License. */ TEST(Device, Init) { using paddle::platform::DeviceContext; + using paddle::platform::MLUContext; using paddle::platform::MLUDeviceContext; using paddle::platform::MLUPlace; - using paddle::platform::MLUContext; int count = paddle::platform::GetMLUDeviceCount(); for (int i = 0; i < count; i++) { @@ -34,9 +34,9 @@ TEST(Device, Init) { } TEST(Device, MLUDeviceContext) { + using paddle::mluCnnlHandle; using paddle::platform::MLUDeviceContext; using paddle::platform::MLUPlace; - using paddle::mluCnnlHandle; int count = paddle::platform::GetMLUDeviceCount(); for (int i = 0; i < count; i++) { @@ -48,9 +48,9 @@ TEST(Device, MLUDeviceContext) { } TEST(Device, MLUStream) { + using paddle::mluStream; using paddle::platform::MLUDeviceContext; using paddle::platform::MLUPlace; - using paddle::mluStream; int count = paddle::platform::GetMLUDeviceCount(); for (int i = 0; i < count; i++) { @@ -62,11 +62,11 @@ TEST(Device, MLUStream) { } TEST(Device, DeviceContextPool) { + using paddle::platform::CPUPlace; using paddle::platform::DeviceContextPool; using paddle::platform::MLUDeviceContext; - using paddle::platform::Place; - using paddle::platform::CPUPlace; using paddle::platform::MLUPlace; + using paddle::platform::Place; DeviceContextPool& pool = DeviceContextPool::Instance(); auto cpu_dev_ctx1 = pool.Get(CPUPlace()); diff --git a/paddle/fluid/platform/device/mlu/mlu_collective_helper.cc b/paddle/fluid/platform/device/mlu/mlu_collective_helper.cc index 7708267c1bc..4051caac1c8 100644 --- a/paddle/fluid/platform/device/mlu/mlu_collective_helper.cc +++ b/paddle/fluid/platform/device/mlu/mlu_collective_helper.cc @@ -14,6 +14,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_CNCL) #include + #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/mlu/enforce.h" diff --git a/paddle/fluid/platform/device/mlu/mlu_info.cc b/paddle/fluid/platform/device/mlu/mlu_info.cc index 7cad99bf5d2..e3672707210 100644 --- a/paddle/fluid/platform/device/mlu/mlu_info.cc +++ b/paddle/fluid/platform/device/mlu/mlu_info.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/device/mlu/mlu_info.h" + #include #include + #include "gflags/gflags.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/device/mlu/enforce.h" @@ -187,8 +189,9 @@ static size_t MLUAllocSize(bool realloc) { size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb : FLAGS_initial_gpu_memory_in_mb; size_t alloc_bytes = - (flag_mb > 0ul ? 
flag_mb << 20 : available_to_alloc * - FLAGS_fraction_of_gpu_memory_to_use); + (flag_mb > 0ul + ? flag_mb << 20 + : available_to_alloc * FLAGS_fraction_of_gpu_memory_to_use); PADDLE_ENFORCE_GE( available_to_alloc, alloc_bytes, platform::errors::ResourceExhausted("Not enough available MLU memory.")); diff --git a/paddle/fluid/platform/device/mlu/mlu_stream.cc b/paddle/fluid/platform/device/mlu/mlu_stream.cc index 7a27a49250a..f570cc77e5a 100644 --- a/paddle/fluid/platform/device/mlu/mlu_stream.cc +++ b/paddle/fluid/platform/device/mlu/mlu_stream.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/device/mlu/mlu_stream.h" + #include "paddle/fluid/platform/device/mlu/device_context.h" namespace paddle { diff --git a/paddle/fluid/platform/device/npu/ascend_npu_info.cc b/paddle/fluid/platform/device/npu/ascend_npu_info.cc index c100b2d0a17..a9204ac3fca 100644 --- a/paddle/fluid/platform/device/npu/ascend_npu_info.cc +++ b/paddle/fluid/platform/device/npu/ascend_npu_info.cc @@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/device/npu/ascend_npu_info.h" + #include + #include "acl/acl_rt.h" namespace paddle { diff --git a/paddle/fluid/platform/device/npu/dynload/hccl.h b/paddle/fluid/platform/device/npu/dynload/hccl.h index 3d7587bfa26..ae140dd2950 100644 --- a/paddle/fluid/platform/device/npu/dynload/hccl.h +++ b/paddle/fluid/platform/device/npu/dynload/hccl.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include + #include // NOLINT #include "paddle/fluid/platform/dynload/dynamic_loader.h" diff --git a/paddle/fluid/platform/device/npu/enforce_npu.h b/paddle/fluid/platform/device/npu/enforce_npu.h index 3887ee4866a..24392686863 100644 --- a/paddle/fluid/platform/device/npu/enforce_npu.h +++ b/paddle/fluid/platform/device/npu/enforce_npu.h @@ -17,10 +17,9 @@ limitations under the License. 
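The MLUAllocSize hunk above (and the matching NPUAllocSize one below) only re-breaks a conditional expression: once a ternary overflows the line, clang-format now starts the ? and : branches on their own lines. A sketch of the underlying size rule, with local stand-ins for the FLAGS_* globals (parameter names are mine, not Paddle's):

#include <cstddef>

// If an explicit flag in MB is set, use it (<< 20 converts MB to bytes);
// otherwise take a fraction of the memory currently available.
std::size_t AllocBytes(std::size_t flag_mb, std::size_t available_to_alloc,
                       double fraction) {
  return flag_mb > 0ul
             ? flag_mb << 20
             : static_cast<std::size_t>(available_to_alloc * fraction);
}

int main() { return AllocBytes(1, 0, 0.0) == (1u << 20) ? 0 : 1; }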
*/ #ifdef PADDLE_WITH_ASCEND_CL #include -#include "paddle/fluid/platform/enforce.h" - #include "acl/acl.h" #include "hccl/hccl_types.h" +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/device/npu/hccl_helper.h b/paddle/fluid/platform/device/npu/hccl_helper.h index 134ec04030d..107fe5989dd 100644 --- a/paddle/fluid/platform/device/npu/hccl_helper.h +++ b/paddle/fluid/platform/device/npu/hccl_helper.h @@ -17,6 +17,7 @@ #ifdef PADDLE_WITH_ASCEND_CL #include + #include #include #include // NOLINT @@ -24,11 +25,10 @@ #include #include -#include "paddle/fluid/platform/device/npu/dynload/hccl.h" -#include "paddle/fluid/platform/device/npu/enforce_npu.h" - #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/npu/dynload/hccl.h" +#include "paddle/fluid/platform/device/npu/enforce_npu.h" #include "paddle/fluid/platform/float16.h" #define HCCL_ID_VARNAME "HCCLID" diff --git a/paddle/fluid/platform/device/npu/npu_collective_helper.cc b/paddle/fluid/platform/device/npu/npu_collective_helper.cc index cdec3519a23..77528fe19fc 100644 --- a/paddle/fluid/platform/device/npu/npu_collective_helper.cc +++ b/paddle/fluid/platform/device/npu/npu_collective_helper.cc @@ -14,6 +14,7 @@ #if defined(PADDLE_WITH_ASCEND_CL) #include + #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/npu/enforce_npu.h" diff --git a/paddle/fluid/platform/device/npu/npu_info.cc b/paddle/fluid/platform/device/npu/npu_info.cc index b5516944b75..2688c88f557 100644 --- a/paddle/fluid/platform/device/npu/npu_info.cc +++ b/paddle/fluid/platform/device/npu/npu_info.cc @@ -13,12 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/device/npu/npu_info.h" + #include #include #include #include "gflags/gflags.h" - #include "paddle/fluid/platform/lock_guard_ptr.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/monitor.h" @@ -153,8 +153,9 @@ static size_t NPUAllocSize(bool realloc) { size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb : FLAGS_initial_gpu_memory_in_mb; size_t alloc_bytes = - (flag_mb > 0ul ? flag_mb << 20 : available_to_alloc * - FLAGS_fraction_of_gpu_memory_to_use); + (flag_mb > 0ul + ? flag_mb << 20 + : available_to_alloc * FLAGS_fraction_of_gpu_memory_to_use); PADDLE_ENFORCE_GE( available_to_alloc, alloc_bytes, platform::errors::ResourceExhausted("Not enough available NPU memory.")); diff --git a/paddle/fluid/platform/device/npu/npu_op_runner.cc b/paddle/fluid/platform/device/npu/npu_op_runner.cc index 72169ae303b..d38443acca3 100644 --- a/paddle/fluid/platform/device/npu/npu_op_runner.cc +++ b/paddle/fluid/platform/device/npu/npu_op_runner.cc @@ -23,7 +23,6 @@ limitations under the License. 
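The NPU header hunks above are pure include regrouping: each block is alphabetized, and the blank lines inserted after the bare #include lines separate C system headers, C++ standard headers, and project headers into distinct groups, per the Google-derived style the repo's clang-format config appears to follow. An illustrative translation unit (the specific headers are my choice, not taken from the patch):

#include <stdio.h>  // C system headers first

#include <string>  // C++ standard headers next, alphabetized
#include <thread>  // NOLINT (mirrors the marker used throughout the patch)

// Project headers such as "paddle/fluid/platform/enforce.h" would follow
// as a third group.

int main() {
  std::string group = "include blocks";
  printf("%zu\n", group.size());
  return 0;
}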
*/ #include "acl/acl.h" #include "acl/acl_op_compiler.h" - #include "paddle/fluid/framework/framework.pb.h" DECLARE_string(npu_precision_mode); diff --git a/paddle/fluid/platform/device/npu/npu_resource_pool.cc b/paddle/fluid/platform/device/npu/npu_resource_pool.cc index d837e90c3c4..e7c302289db 100644 --- a/paddle/fluid/platform/device/npu/npu_resource_pool.cc +++ b/paddle/fluid/platform/device/npu/npu_resource_pool.cc @@ -14,6 +14,7 @@ #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/platform/device/npu/npu_resource_pool.h" + #include "paddle/fluid/platform/device/npu/npu_info.h" namespace paddle { diff --git a/paddle/fluid/platform/device/npu/npu_stream.cc b/paddle/fluid/platform/device/npu/npu_stream.cc index 0b15a0d937e..55a73146815 100644 --- a/paddle/fluid/platform/device/npu/npu_stream.cc +++ b/paddle/fluid/platform/device/npu/npu_stream.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/device/npu/npu_stream.h" + #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/platform/device/xpu/bkcl_helper.h b/paddle/fluid/platform/device/xpu/bkcl_helper.h index 24fd8b5faa4..a7a3e4f0605 100644 --- a/paddle/fluid/platform/device/xpu/bkcl_helper.h +++ b/paddle/fluid/platform/device/xpu/bkcl_helper.h @@ -17,6 +17,7 @@ #pragma once #include + #include #include #include // NOLINT @@ -217,7 +218,7 @@ class BKCLCommunicator { *bkcl_all_reduce *parallelly. So create a new bkcl comm for sync_batch_norm_op. And these *codes should be polished with a unified bkcl management. - */ + */ BKCLContextMap *GetSyncBatchNormCtx( framework::Scope *scope, const std::vector &places) { auto *bkcl_id_var = scope->FindVar(BKCL_ID_VARNAME); diff --git a/paddle/fluid/platform/device/xpu/enforce_xpu.h b/paddle/fluid/platform/device/xpu/enforce_xpu.h index c55d91c3015..77d14aa712e 100644 --- a/paddle/fluid/platform/device/xpu/enforce_xpu.h +++ b/paddle/fluid/platform/device/xpu/enforce_xpu.h @@ -15,7 +15,6 @@ limitations under the License. */ #pragma once #include "paddle/fluid/platform/device/xpu/xpu_header.h" - #include "paddle/phi/backends/xpu/enforce_xpu.h" namespace paddle { diff --git a/paddle/fluid/platform/device/xpu/tests/enforce_xpu_test.cc b/paddle/fluid/platform/device/xpu/tests/enforce_xpu_test.cc index 8cba98f3fb3..0b528c3999e 100644 --- a/paddle/fluid/platform/device/xpu/tests/enforce_xpu_test.cc +++ b/paddle/fluid/platform/device/xpu/tests/enforce_xpu_test.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/device/xpu/enforce_xpu.h" + #include "gtest/gtest.h" template diff --git a/paddle/fluid/platform/device/xpu/xpu_info.cc b/paddle/fluid/platform/device/xpu/xpu_info.cc index cdd7ee7f806..dbc8ed4a51a 100644 --- a/paddle/fluid/platform/device/xpu/xpu_info.cc +++ b/paddle/fluid/platform/device/xpu/xpu_info.cc @@ -13,14 +13,13 @@ limitations under the License. 
*/ #include #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/xpu/enforce_xpu.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" - #include "paddle/phi/backends/xpu/xpu_info.h" namespace paddle { diff --git a/paddle/fluid/platform/device/xpu/xpu_info.h b/paddle/fluid/platform/device/xpu/xpu_info.h index 38b4defadc6..2dd0f327530 100644 --- a/paddle/fluid/platform/device/xpu/xpu_info.h +++ b/paddle/fluid/platform/device/xpu/xpu_info.h @@ -12,6 +12,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include + #include "paddle/fluid/platform/place.h" #include "paddle/phi/backends/xpu/xpu_info.h" #include "xpu/runtime.h" diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.cc b/paddle/fluid/platform/device/xpu/xpu_op_list.cc index 07385143362..8ace4d1a32c 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_list.cc +++ b/paddle/fluid/platform/device/xpu/xpu_op_list.cc @@ -9,6 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU +#include "paddle/fluid/platform/device/xpu/xpu_op_list.h" + #include #include #include @@ -17,7 +19,6 @@ limitations under the License. */ #include "paddle/fluid/platform/device/xpu/xpu2_op_list.h" #include "paddle/fluid/platform/device/xpu/xpu_info.h" #include "paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h" -#include "paddle/fluid/platform/device/xpu/xpu_op_list.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/device_code.cc b/paddle/fluid/platform/device_code.cc index a4226dabf9d..4ee32ad5a03 100644 --- a/paddle/fluid/platform/device_code.cc +++ b/paddle/fluid/platform/device_code.cc @@ -12,12 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/platform/device_code.h" + #include + #include #include #include -#include "paddle/fluid/platform/device_code.h" #include "paddle/fluid/platform/enforce.h" DECLARE_string(cuda_dir); diff --git a/paddle/fluid/platform/device_code_test.cc b/paddle/fluid/platform/device_code_test.cc index 7da8c561385..cb2649686ec 100644 --- a/paddle/fluid/platform/device_code_test.cc +++ b/paddle/fluid/platform/device_code_test.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/device_code.h" + #include + #include "gtest/gtest.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/init.h" diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 09a29c3429c..0bd606257f5 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
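xpu_op_list.cc and device_code.cc above both get the related-header-first treatment: the .cc file's own header moves to the very top, on a line of its own. A sketch of the resulting layout with hypothetical file names (kept entirely in comments so the sketch stays self-contained):

// widget.cc -- layout after the same transformation:
//
//   #include "widget.h"         // 1. the file's own header, first
//
//   #include <map>              // 2. standard headers
//   #include <string>
//
//   #include "gflags/gflags.h"  // 3. other project/third-party headers
//
// Putting the own header first makes the compiler verify that widget.h
// compiles stand-alone instead of riding on headers included before it.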
*/ #include "paddle/fluid/platform/device_context.h" + #include #include #include + #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/stream/cuda_stream.h" #include "paddle/phi/backends/gpu/gpu_context.h" diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index a63d41405f1..d0dae706ba5 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -21,13 +21,12 @@ limitations under the License. */ #include #include +#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/custom/custom_context.h" #include "paddle/phi/backends/gpu/gpu_decls.h" #include "paddle/phi/core/device_context.h" - -#include "paddle/fluid/memory/malloc.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/device/gpu/gpu_helper.h" #include "paddle/fluid/platform/dynload/cublas.h" diff --git a/paddle/fluid/platform/device_context_test.cu b/paddle/fluid/platform/device_context_test.cu index 08a04a9565a..2db29dc11ad 100644 --- a/paddle/fluid/platform/device_context_test.cu +++ b/paddle/fluid/platform/device_context_test.cu @@ -11,18 +11,17 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/platform/device_context.h" - #include #include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/fluid/platform/device_context.h" TEST(Device, Init) { - using paddle::platform::DeviceContext; using paddle::platform::CUDADeviceContext; using paddle::platform::CUDAPlace; + using paddle::platform::DeviceContext; int count = paddle::platform::GetGPUDeviceCount(); for (int i = 0; i < count; i++) { @@ -94,11 +93,11 @@ TEST(Device, CUDADeviceContext) { } TEST(Device, DeviceContextPool) { - using paddle::platform::DeviceContextPool; - using paddle::platform::CUDADeviceContext; - using paddle::platform::Place; using paddle::platform::CPUPlace; + using paddle::platform::CUDADeviceContext; using paddle::platform::CUDAPlace; + using paddle::platform::DeviceContextPool; + using paddle::platform::Place; DeviceContextPool& pool = DeviceContextPool::Instance(); auto cpu_dev_ctx1 = pool.Get(CPUPlace()); diff --git a/paddle/fluid/platform/device_context_xpu_test.cc b/paddle/fluid/platform/device_context_xpu_test.cc index 3de2e3957a9..50cb0f98d33 100644 --- a/paddle/fluid/platform/device_context_xpu_test.cc +++ b/paddle/fluid/platform/device_context_xpu_test.cc @@ -11,12 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/platform/device_context.h" - #include #include "glog/logging.h" #include "gtest/gtest.h" +#include "paddle/fluid/platform/device_context.h" TEST(Device, Init) { using paddle::platform::DeviceContext; @@ -33,10 +32,10 @@ TEST(Device, Init) { } TEST(Device, DeviceContextPool) { + using paddle::platform::CPUPlace; using paddle::platform::DeviceContextPool; - using paddle::platform::XPUDeviceContext; using paddle::platform::Place; - using paddle::platform::CPUPlace; + using paddle::platform::XPUDeviceContext; using paddle::platform::XPUPlace; DeviceContextPool& pool = DeviceContextPool::Instance(); diff --git a/paddle/fluid/platform/device_event.h b/paddle/fluid/platform/device_event.h index 463329d32c9..82d93dee398 100644 --- a/paddle/fluid/platform/device_event.h +++ b/paddle/fluid/platform/device_event.h @@ -23,8 +23,8 @@ * for USE_PASS from pass_library. */ -using ::paddle::platform::kCUDA; using ::paddle::platform::kCPU; +using ::paddle::platform::kCUDA; USE_EVENT(kCPU) USE_EVENT_WAIT(kCPU, kCPU) diff --git a/paddle/fluid/platform/device_event_base.cc b/paddle/fluid/platform/device_event_base.cc index 67fad3857f2..374de7d923f 100644 --- a/paddle/fluid/platform/device_event_base.cc +++ b/paddle/fluid/platform/device_event_base.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/platform/device_event_base.h" + #include "paddle/fluid/platform/device_event_cpu.h" #include "paddle/fluid/platform/event.h" diff --git a/paddle/fluid/platform/device_event_base.h b/paddle/fluid/platform/device_event_base.h index 8fe5ef9fcb1..4e751aa6d13 100644 --- a/paddle/fluid/platform/device_event_base.h +++ b/paddle/fluid/platform/device_event_base.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/platform/device_event_cpu.h b/paddle/fluid/platform/device_event_cpu.h index 6e2bf4c7ad1..1620dffdabd 100644 --- a/paddle/fluid/platform/device_event_cpu.h +++ b/paddle/fluid/platform/device_event_cpu.h @@ -16,6 +16,7 @@ #include #include #include + #include "paddle/fluid/platform/device_event_base.h" namespace paddle { diff --git a/paddle/fluid/platform/device_event_gpu.cc b/paddle/fluid/platform/device_event_gpu.cc index f42ccc5a1db..f176d1a0d5d 100644 --- a/paddle/fluid/platform/device_event_gpu.cc +++ b/paddle/fluid/platform/device_event_gpu.cc @@ -101,8 +101,8 @@ void EventResetCUDA(const DeviceEvent* event) { } // namespace platform } // namespace paddle -using ::paddle::platform::kCUDA; using ::paddle::platform::kCPU; +using ::paddle::platform::kCUDA; REGISTER_EVENT_CREATE_FUNCTION(kCUDA, paddle::platform::DeviceEventCreateCUDA) REGISTER_EVENT_RECORD_FUNCTION(kCUDA, paddle::platform::DeviceEventRecordCUDA) REGISTER_EVENT_QUERY_FUNCTION(kCUDA, paddle::platform::DeviceEventQueryCUDA) diff --git a/paddle/fluid/platform/device_event_test.cc b/paddle/fluid/platform/device_event_test.cc index d9f744b2625..92fe7c02bd0 100644 --- a/paddle/fluid/platform/device_event_test.cc +++ b/paddle/fluid/platform/device_event_test.cc @@ -13,15 +13,16 @@ // limitations under the License. 
#include "paddle/fluid/platform/device_event.h" + #include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/fluid/platform/place.h" -using ::paddle::platform::kCUDA; using ::paddle::platform::kCPU; +using ::paddle::platform::kCUDA; -using paddle::platform::DeviceEvent; using paddle::platform::DeviceContextPool; +using paddle::platform::DeviceEvent; #ifdef PADDLE_WITH_CUDA #include diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index 73847ce24aa..fa345ed31cb 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/platform/device_tracer.h" + #include #include #include @@ -20,7 +22,6 @@ limitations under the License. */ #include // NOLINT #include "glog/logging.h" -#include "paddle/fluid/platform/device_tracer.h" DECLARE_bool(enable_host_event_recorder_hook); @@ -255,7 +256,9 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, } break; } - default: { break; } + default: { + break; + } } } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) { // Seems not an error in this case. diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h index d7d43cecc25..496b253dff5 100644 --- a/paddle/fluid/platform/dynload/cublas.h +++ b/paddle/fluid/platform/dynload/cublas.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include // NOLINT #include diff --git a/paddle/fluid/platform/dynload/cublasLt.h b/paddle/fluid/platform/dynload/cublasLt.h index 5157cfdad2e..3a1d28072c5 100644 --- a/paddle/fluid/platform/dynload/cublasLt.h +++ b/paddle/fluid/platform/dynload/cublasLt.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include + #include // NOLINT #include diff --git a/paddle/fluid/platform/dynload/cuda_driver.cc b/paddle/fluid/platform/dynload/cuda_driver.cc index a0f9647f089..c6851594b80 100644 --- a/paddle/fluid/platform/dynload/cuda_driver.cc +++ b/paddle/fluid/platform/dynload/cuda_driver.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/dynload/cuda_driver.h" + #include "paddle/phi/backends/dynload/cuda_driver.h" namespace paddle { diff --git a/paddle/fluid/platform/dynload/cuda_driver.h b/paddle/fluid/platform/dynload/cuda_driver.h index f5550e9f9fe..b696ffc1a3b 100644 --- a/paddle/fluid/platform/dynload/cuda_driver.h +++ b/paddle/fluid/platform/dynload/cuda_driver.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include // NOLINT #include "paddle/phi/backends/dynload/cuda_driver.h" diff --git a/paddle/fluid/platform/dynload/cudnn.cc b/paddle/fluid/platform/dynload/cudnn.cc index 553792d3bbf..05cacb74c86 100644 --- a/paddle/fluid/platform/dynload/cudnn.cc +++ b/paddle/fluid/platform/dynload/cudnn.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/platform/dynload/cudnn.h" + #include "paddle/phi/backends/dynload/cudnn.h" namespace paddle { diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index bf4bb08a696..9af1e8065c4 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -16,6 +16,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #include + #include // NOLINT #include "paddle/phi/backends/dynload/cudnn.h" diff --git a/paddle/fluid/platform/dynload/cufft.cc b/paddle/fluid/platform/dynload/cufft.cc index 1996ab16167..6a06c4bdb6a 100644 --- a/paddle/fluid/platform/dynload/cufft.cc +++ b/paddle/fluid/platform/dynload/cufft.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/dynload/cufft.h" + #include "paddle/phi/backends/dynload/cufft.h" namespace paddle { diff --git a/paddle/fluid/platform/dynload/cufft.h b/paddle/fluid/platform/dynload/cufft.h index 6c3a0992d75..d79603a5a01 100644 --- a/paddle/fluid/platform/dynload/cufft.h +++ b/paddle/fluid/platform/dynload/cufft.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include // NOLINT #include "paddle/phi/backends/dynload/cufft.h" diff --git a/paddle/fluid/platform/dynload/cupti.h b/paddle/fluid/platform/dynload/cupti.h index 854e5a7b9f0..8e08785f209 100644 --- a/paddle/fluid/platform/dynload/cupti.h +++ b/paddle/fluid/platform/dynload/cupti.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include + #include // NOLINT #include "paddle/phi/backends/dynload/cupti.h" diff --git a/paddle/fluid/platform/dynload/curand.h b/paddle/fluid/platform/dynload/curand.h index 1fdd9240284..f4065a196d3 100644 --- a/paddle/fluid/platform/dynload/curand.h +++ b/paddle/fluid/platform/dynload/curand.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include // NOLINT #include "paddle/phi/backends/dynload/curand.h" diff --git a/paddle/fluid/platform/dynload/cusolver.h b/paddle/fluid/platform/dynload/cusolver.h index 212c350ebb2..854de23150c 100644 --- a/paddle/fluid/platform/dynload/cusolver.h +++ b/paddle/fluid/platform/dynload/cusolver.h @@ -15,6 +15,7 @@ limitations under the License. */ #include #include + #include // NOLINT #include "paddle/phi/backends/dynload/cusolver.h" diff --git a/paddle/fluid/platform/dynload/cusparse.h b/paddle/fluid/platform/dynload/cusparse.h index b4b93521678..925852bb415 100644 --- a/paddle/fluid/platform/dynload/cusparse.h +++ b/paddle/fluid/platform/dynload/cusparse.h @@ -15,6 +15,7 @@ limitations under the License. */ #include #include + #include // NOLINT #include "paddle/phi/backends/dynload/cusparse.h" diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 5ce63b244ef..2f24e1b87da 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include + #include "gflags/gflags.h" #include "paddle/phi/backends/dynload/dynamic_loader.h" diff --git a/paddle/fluid/platform/dynload/hiprtc.cc b/paddle/fluid/platform/dynload/hiprtc.cc index 6c4a4bfd0de..d9bb3fd2c42 100644 --- a/paddle/fluid/platform/dynload/hiprtc.cc +++ b/paddle/fluid/platform/dynload/hiprtc.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/platform/dynload/hiprtc.h" + #include "paddle/phi/backends/dynload/hiprtc.h" namespace paddle { diff --git a/paddle/fluid/platform/dynload/hiprtc.h b/paddle/fluid/platform/dynload/hiprtc.h index 851dadbac63..f27d5d808f7 100644 --- a/paddle/fluid/platform/dynload/hiprtc.h +++ b/paddle/fluid/platform/dynload/hiprtc.h @@ -15,7 +15,9 @@ limitations under the License. */ #pragma once #include + #include // NOLINT + #include "paddle/phi/backends/dynload/hiprtc.h" namespace paddle { diff --git a/paddle/fluid/platform/dynload/miopen.cc b/paddle/fluid/platform/dynload/miopen.cc index 9660188b68d..15012531b4c 100644 --- a/paddle/fluid/platform/dynload/miopen.cc +++ b/paddle/fluid/platform/dynload/miopen.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/dynload/miopen.h" + #include "paddle/phi/backends/dynload/cudnn.h" namespace paddle { diff --git a/paddle/fluid/platform/dynload/miopen.h b/paddle/fluid/platform/dynload/miopen.h index b99cd5ebb6e..20b92b17051 100644 --- a/paddle/fluid/platform/dynload/miopen.h +++ b/paddle/fluid/platform/dynload/miopen.h @@ -14,10 +14,11 @@ limitations under the License. */ #pragma once #include - #include #include + #include // NOLINT + #include "paddle/phi/backends/dynload/miopen.h" namespace paddle { diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h index 1c7d0c17a0f..78cae9a0821 100644 --- a/paddle/fluid/platform/dynload/mklml.h +++ b/paddle/fluid/platform/dynload/mklml.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include // NOLINT #include "paddle/phi/backends/dynload/mklml.h" diff --git a/paddle/fluid/platform/dynload/mklrt.h b/paddle/fluid/platform/dynload/mklrt.h index 334b98a1c3d..e1a2bedfa8e 100644 --- a/paddle/fluid/platform/dynload/mklrt.h +++ b/paddle/fluid/platform/dynload/mklrt.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include // NOLINT #include "paddle/phi/backends/dynload/dynamic_loader.h" diff --git a/paddle/fluid/platform/dynload/nccl.h b/paddle/fluid/platform/dynload/nccl.h index a38d1d4272e..c2052719dd5 100644 --- a/paddle/fluid/platform/dynload/nccl.h +++ b/paddle/fluid/platform/dynload/nccl.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include // NOLINT #include "paddle/phi/backends/dynload/nccl.h" diff --git a/paddle/fluid/platform/dynload/nvjpeg.h b/paddle/fluid/platform/dynload/nvjpeg.h index 8aaf672fe67..026a3b64886 100644 --- a/paddle/fluid/platform/dynload/nvjpeg.h +++ b/paddle/fluid/platform/dynload/nvjpeg.h @@ -12,6 +12,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include + #include // NOLINT #include "paddle/phi/backends/dynload/nvjpeg.h" diff --git a/paddle/fluid/platform/dynload/nvrtc.cc b/paddle/fluid/platform/dynload/nvrtc.cc index a0322998277..242aa912ad8 100644 --- a/paddle/fluid/platform/dynload/nvrtc.cc +++ b/paddle/fluid/platform/dynload/nvrtc.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/platform/dynload/nvrtc.h" + #include "paddle/phi/backends/dynload/nvrtc.h" namespace paddle { diff --git a/paddle/fluid/platform/dynload/nvrtc.h b/paddle/fluid/platform/dynload/nvrtc.h index 5ca8860c5ac..e03235e116f 100644 --- a/paddle/fluid/platform/dynload/nvrtc.h +++ b/paddle/fluid/platform/dynload/nvrtc.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include // NOLINT #include "paddle/phi/backends/dynload/nvrtc.h" diff --git a/paddle/fluid/platform/dynload/nvtx.h b/paddle/fluid/platform/dynload/nvtx.h index 3f974eca1d0..c3dc9e31df3 100644 --- a/paddle/fluid/platform/dynload/nvtx.h +++ b/paddle/fluid/platform/dynload/nvtx.h @@ -15,6 +15,7 @@ limitations under the License. */ #ifndef _WIN32 #include #include + #include // NOLINT #include "paddle/phi/backends/dynload/nvtx.h" diff --git a/paddle/fluid/platform/dynload/rccl.h b/paddle/fluid/platform/dynload/rccl.h index 7bb4992c89c..2f874bb59f5 100644 --- a/paddle/fluid/platform/dynload/rccl.h +++ b/paddle/fluid/platform/dynload/rccl.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include // NOLINT + #include "paddle/phi/backends/dynload/rccl.h" namespace paddle { diff --git a/paddle/fluid/platform/dynload/rocblas.h b/paddle/fluid/platform/dynload/rocblas.h index 04f4fdd9506..5cec6fb4879 100644 --- a/paddle/fluid/platform/dynload/rocblas.h +++ b/paddle/fluid/platform/dynload/rocblas.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include // NOLINT #include diff --git a/paddle/fluid/platform/dynload/rocm_driver.cc b/paddle/fluid/platform/dynload/rocm_driver.cc index 088129f3f8d..4fa20c5c4bb 100644 --- a/paddle/fluid/platform/dynload/rocm_driver.cc +++ b/paddle/fluid/platform/dynload/rocm_driver.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/dynload/rocm_driver.h" + #include "paddle/phi/backends/dynload/rocm_driver.h" namespace paddle { diff --git a/paddle/fluid/platform/dynload/rocm_driver.h b/paddle/fluid/platform/dynload/rocm_driver.h index 5a902239fef..5c8e18611c4 100644 --- a/paddle/fluid/platform/dynload/rocm_driver.h +++ b/paddle/fluid/platform/dynload/rocm_driver.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include // NOLINT #include "paddle/phi/backends/dynload/rocm_driver.h" diff --git a/paddle/fluid/platform/dynload/tensorrt.cc b/paddle/fluid/platform/dynload/tensorrt.cc index 8153877b7bb..8d700faac0c 100644 --- a/paddle/fluid/platform/dynload/tensorrt.cc +++ b/paddle/fluid/platform/dynload/tensorrt.cc @@ -13,6 +13,7 @@ limitations under the License. */ #include "paddle/fluid/platform/dynload/tensorrt.h" + #include namespace paddle { diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 772a7750fe9..1106eef4559 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -36,6 +36,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/platform/external_error.pb.h" #endif // PADDLE_WITH_CUDA @@ -77,6 +78,7 @@ limitations under the License. */ #include "paddle/phi/backends/dynload/cusolver.h" #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) #include + #include "paddle/phi/backends/dynload/nccl.h" #endif // __APPLE__ #endif // PADDLE_WITH_CUDA @@ -88,6 +90,7 @@ limitations under the License. 
*/ #include "paddle/phi/backends/dynload/rocblas.h" #if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) #include // NOLINT + #include "paddle/phi/backends/dynload/rccl.h" #endif // __APPLE__ #endif // PADDLE_WITH_HIP diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index b9e42392991..771c4853f6f 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -63,19 +63,22 @@ TEST(ENFORCE, FAILED) { TEST(ENFORCE, NO_ARG_OK) { int a = 2; int b = 2; - PADDLE_ENFORCE_EQ(a, b, paddle::platform::errors::Unavailable( - "PADDLE_ENFORCE_EQ tests failed.")); + PADDLE_ENFORCE_EQ( + a, b, + paddle::platform::errors::Unavailable("PADDLE_ENFORCE_EQ tests failed.")); // test enforce with extra message. - PADDLE_ENFORCE_EQ(a, b, paddle::platform::errors::Unavailable( - "Some %s wrong in PADDLE_ENFORCE_EQ.", "info")); + PADDLE_ENFORCE_EQ(a, b, + paddle::platform::errors::Unavailable( + "Some %s wrong in PADDLE_ENFORCE_EQ.", "info")); } TEST(ENFORCE_EQ, NO_EXTRA_MSG_FAIL) { int a = 2; bool caught_exception = false; try { - PADDLE_ENFORCE_EQ(a, 1 + 3, paddle::platform::errors::InvalidArgument( - "The result is not equal correct result.")); + PADDLE_ENFORCE_EQ(a, 1 + 3, + paddle::platform::errors::InvalidArgument( + "The result is not equal correct result.")); } catch (paddle::platform::EnforceNotMet& error) { caught_exception = true; std::string ex_msg = error.what(); @@ -89,8 +92,9 @@ TEST(ENFORCE_EQ, EXTRA_MSG_FAIL) { int a = 2; bool caught_exception = false; try { - PADDLE_ENFORCE_EQ(a, 1 + 3, paddle::platform::errors::InvalidArgument( - "The result is not equal correct result.")); + PADDLE_ENFORCE_EQ(a, 1 + 3, + paddle::platform::errors::InvalidArgument( + "The result is not equal correct result.")); } catch (paddle::platform::EnforceNotMet& error) { caught_exception = true; std::string ex_msg = error.what(); @@ -102,10 +106,12 @@ TEST(ENFORCE_EQ, EXTRA_MSG_FAIL) { } TEST(ENFORCE_NE, OK) { - PADDLE_ENFORCE_NE(1, 2, paddle::platform::errors::Unavailable( - "PADDLE_ENFORCE_NE tests failed.")); - PADDLE_ENFORCE_NE(1.0, 2UL, paddle::platform::errors::Unavailable( - "PADDLE_ENFORCE_NE tests failed.")); + PADDLE_ENFORCE_NE( + 1, 2, + paddle::platform::errors::Unavailable("PADDLE_ENFORCE_NE tests failed.")); + PADDLE_ENFORCE_NE( + 1.0, 2UL, + paddle::platform::errors::Unavailable("PADDLE_ENFORCE_NE tests failed.")); } TEST(ENFORCE_NE, FAIL) { bool caught_exception = false; @@ -125,14 +131,16 @@ TEST(ENFORCE_NE, FAIL) { } TEST(ENFORCE_GT, OK) { - PADDLE_ENFORCE_GT(2, 1, paddle::platform::errors::Unavailable( - "PADDLE_ENFORCE_GT tests failed.")); + PADDLE_ENFORCE_GT( + 2, 1, + paddle::platform::errors::Unavailable("PADDLE_ENFORCE_GT tests failed.")); } TEST(ENFORCE_GT, FAIL) { bool caught_exception = false; try { - PADDLE_ENFORCE_GT(1, 2, paddle::platform::errors::InvalidArgument( - "Expected 1 > 2, but received 1:1 <= 2:2.")); + PADDLE_ENFORCE_GT(1, 2, + paddle::platform::errors::InvalidArgument( + "Expected 1 > 2, but received 1:1 <= 2:2.")); } catch (paddle::platform::EnforceNotMet& error) { caught_exception = true; std::string ex_msg = error.what(); @@ -143,18 +151,22 @@ TEST(ENFORCE_GT, FAIL) { } TEST(ENFORCE_GE, OK) { - PADDLE_ENFORCE_GE(2, 2, paddle::platform::errors::Unavailable( - "PADDLE_ENFORCE_GE tests failed.")); - PADDLE_ENFORCE_GE(3, 2, paddle::platform::errors::Unavailable( - "PADDLE_ENFORCE_GE tests failed.")); - PADDLE_ENFORCE_GE(3.21, 2.0, paddle::platform::errors::Unavailable( - "PADDLE_ENFORCE_GE tests failed.")); + 
PADDLE_ENFORCE_GE( + 2, 2, + paddle::platform::errors::Unavailable("PADDLE_ENFORCE_GE tests failed.")); + PADDLE_ENFORCE_GE( + 3, 2, + paddle::platform::errors::Unavailable("PADDLE_ENFORCE_GE tests failed.")); + PADDLE_ENFORCE_GE( + 3.21, 2.0, + paddle::platform::errors::Unavailable("PADDLE_ENFORCE_GE tests failed.")); } TEST(ENFORCE_GE, FAIL) { bool caught_exception = false; try { - PADDLE_ENFORCE_GE(1, 2, paddle::platform::errors::InvalidArgument( - "Expected 1 >= 2, but received 1:1 < 2:2.")); + PADDLE_ENFORCE_GE(1, 2, + paddle::platform::errors::InvalidArgument( + "Expected 1 >= 2, but received 1:1 < 2:2.")); } catch (paddle::platform::EnforceNotMet& error) { caught_exception = true; std::string ex_msg = error.what(); @@ -165,22 +177,28 @@ TEST(ENFORCE_GE, FAIL) { } TEST(ENFORCE_LE, OK) { - PADDLE_ENFORCE_LE(1, 1, paddle::platform::errors::Unavailable( - "PADDLE_ENFORCE_LE tests failed.")); - PADDLE_ENFORCE_LE(1UL, 1UL, paddle::platform::errors::Unavailable( - "PADDLE_ENFORCE_LE tests failed.")); - PADDLE_ENFORCE_LE(2, 3, paddle::platform::errors::Unavailable( - "PADDLE_ENFORCE_LE tests failed.")); - PADDLE_ENFORCE_LE(2UL, 3UL, paddle::platform::errors::Unavailable( - "PADDLE_ENFORCE_LE tests failed.")); - PADDLE_ENFORCE_LE(2.0, 3.2, paddle::platform::errors::Unavailable( - "PADDLE_ENFORCE_LE tests failed.")); + PADDLE_ENFORCE_LE( + 1, 1, + paddle::platform::errors::Unavailable("PADDLE_ENFORCE_LE tests failed.")); + PADDLE_ENFORCE_LE( + 1UL, 1UL, + paddle::platform::errors::Unavailable("PADDLE_ENFORCE_LE tests failed.")); + PADDLE_ENFORCE_LE( + 2, 3, + paddle::platform::errors::Unavailable("PADDLE_ENFORCE_LE tests failed.")); + PADDLE_ENFORCE_LE( + 2UL, 3UL, + paddle::platform::errors::Unavailable("PADDLE_ENFORCE_LE tests failed.")); + PADDLE_ENFORCE_LE( + 2.0, 3.2, + paddle::platform::errors::Unavailable("PADDLE_ENFORCE_LE tests failed.")); } TEST(ENFORCE_LE, FAIL) { bool caught_exception = false; try { - PADDLE_ENFORCE_GT(1, 2, paddle::platform::errors::InvalidArgument( - "Expected 1 > 2, but received 1:1 <= 2:2.")); + PADDLE_ENFORCE_GT(1, 2, + paddle::platform::errors::InvalidArgument( + "Expected 1 > 2, but received 1:1 <= 2:2.")); } catch (paddle::platform::EnforceNotMet& error) { caught_exception = true; std::string ex_msg = error.what(); @@ -191,12 +209,15 @@ TEST(ENFORCE_LE, FAIL) { } TEST(ENFORCE_LT, OK) { - PADDLE_ENFORCE_LT(3, 10, paddle::platform::errors::Unavailable( - "PADDLE_ENFORCE_LT tests failed.")); - PADDLE_ENFORCE_LT(2UL, 3UL, paddle::platform::errors::Unavailable( - "PADDLE_ENFORCE_LT tests failed.")); - PADDLE_ENFORCE_LT(2, 3, paddle::platform::errors::Unavailable( - "PADDLE_ENFORCE_LT tests failed.")); + PADDLE_ENFORCE_LT( + 3, 10, + paddle::platform::errors::Unavailable("PADDLE_ENFORCE_LT tests failed.")); + PADDLE_ENFORCE_LT( + 2UL, 3UL, + paddle::platform::errors::Unavailable("PADDLE_ENFORCE_LT tests failed.")); + PADDLE_ENFORCE_LT( + 2, 3, + paddle::platform::errors::Unavailable("PADDLE_ENFORCE_LT tests failed.")); } TEST(ENFORCE_LT, FAIL) { bool caught_exception = false; @@ -263,16 +284,18 @@ std::ostream& operator<<(std::ostream& os, const Dims& d) { TEST(ENFORCE_USER_DEFINED_CLASS, EQ) { Dims a{{1, 2, 3, 4}}, b{{1, 2, 3, 4}}; - PADDLE_ENFORCE_EQ(a, b, paddle::platform::errors::Unavailable( - "PADDLE_ENFORCE_EQ tests failed.")); + PADDLE_ENFORCE_EQ( + a, b, + paddle::platform::errors::Unavailable("PADDLE_ENFORCE_EQ tests failed.")); } TEST(ENFORCE_USER_DEFINED_CLASS, NE) { Dims a{{1, 2, 3, 4}}, b{{5, 6, 7, 8}}; bool caught_exception = false; try { - 
PADDLE_ENFORCE_EQ(a, b, paddle::platform::errors::Unavailable( - "PADDLE_ENFORCE_EQ tests failed.")); + PADDLE_ENFORCE_EQ(a, b, + paddle::platform::errors::Unavailable( + "PADDLE_ENFORCE_EQ tests failed.")); } catch (paddle::platform::EnforceNotMet&) { caught_exception = true; } @@ -481,10 +504,12 @@ TEST(enforce, cannot_to_string_type) { "int can be converted to string"); CannotToStringType obj1(3), obj2(4), obj3(3); - PADDLE_ENFORCE_NE(obj1, obj2, paddle::platform::errors::InvalidArgument( - "Object 1 is not equal to Object 2")); - PADDLE_ENFORCE_EQ(obj1, obj3, paddle::platform::errors::InvalidArgument( - "Object 1 is equal to Object 3")); + PADDLE_ENFORCE_NE(obj1, obj2, + paddle::platform::errors::InvalidArgument( + "Object 1 is not equal to Object 2")); + PADDLE_ENFORCE_EQ(obj1, obj3, + paddle::platform::errors::InvalidArgument( + "Object 1 is equal to Object 3")); std::string msg = "Compare obj1 with obj2"; try { diff --git a/paddle/fluid/platform/errors.h b/paddle/fluid/platform/errors.h index 57f5b3a7c93..758af3e2d91 100644 --- a/paddle/fluid/platform/errors.h +++ b/paddle/fluid/platform/errors.h @@ -18,5 +18,5 @@ namespace paddle { namespace platform { namespace errors = ::phi::errors; using error = ::phi::ErrorCode; -} -} +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/errors_test.cc b/paddle/fluid/platform/errors_test.cc index 712b67a654c..8b11c1d2d24 100644 --- a/paddle/fluid/platform/errors_test.cc +++ b/paddle/fluid/platform/errors_test.cc @@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/platform/errors.h" + #include #include "gtest/gtest.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/errors.h" using namespace paddle::platform::errors; // NOLINT diff --git a/paddle/fluid/platform/fast_divmod.h b/paddle/fluid/platform/fast_divmod.h index bef551078b3..f2a150c3012 100644 --- a/paddle/fluid/platform/fast_divmod.h +++ b/paddle/fluid/platform/fast_divmod.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/phi/kernels/funcs/aligned_vector.h" #define INT_BITS 32 diff --git a/paddle/fluid/platform/flags.h b/paddle/fluid/platform/flags.h index b9d78c2e9dc..0a38d612939 100644 --- a/paddle/fluid/platform/flags.h +++ b/paddle/fluid/platform/flags.h @@ -18,6 +18,7 @@ #include #include #include + #include "gflags/gflags.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/variant.h" diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu index 2c00854e082..dc7fdc6b443 100644 --- a/paddle/fluid/platform/float16_test.cu +++ b/paddle/fluid/platform/float16_test.cu @@ -14,6 +14,7 @@ limitations under the License. */ #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include #include + #include #include diff --git a/paddle/fluid/platform/gen_comm_id_helper.cc b/paddle/fluid/platform/gen_comm_id_helper.cc index bbec743d26f..45ca4a6f277 100644 --- a/paddle/fluid/platform/gen_comm_id_helper.cc +++ b/paddle/fluid/platform/gen_comm_id_helper.cc @@ -22,6 +22,7 @@ limitations under the License. 
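Two recurring fixes end here: PADDLE_ENFORCE_* call sites are re-wrapped so the errors::...() argument stays whole on its own indented lines instead of being split mid-call, and errors.h gets closing-namespace comments (clang-format's FixNamespaceComments, also on in Google style). A self-contained sketch of both, using a stand-in macro and error helper rather than Paddle's real ones:

#include <stdexcept>
#include <string>

namespace demo {
std::string Unavailable(const std::string& msg) { return msg; }
// FixNamespaceComments generates the annotation on the brace below:
}  // namespace demo

#define DEMO_ENFORCE_EQ(a, b, err)                 \
  do {                                             \
    if ((a) != (b)) throw std::runtime_error(err); \
  } while (0)

int main() {
  // New wrapping: break after '(' and keep the error expression intact.
  DEMO_ENFORCE_EQ(
      2, 2,
      demo::Unavailable("DEMO_ENFORCE_EQ tests failed."));
  return 0;
}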
*/ #include #include #include + #include #include #include // NOLINT diff --git a/paddle/fluid/platform/init_test.cc b/paddle/fluid/platform/init_test.cc index 5301dd30759..bc5bd274bf8 100644 --- a/paddle/fluid/platform/init_test.cc +++ b/paddle/fluid/platform/init_test.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/init.h" + #include "gtest/gtest.h" #include "paddle/fluid/platform/device_context.h" #ifdef PADDLE_WITH_MLU diff --git a/paddle/fluid/platform/lock_guard_ptr.h b/paddle/fluid/platform/lock_guard_ptr.h index bff24e74a70..66d6e446d3f 100644 --- a/paddle/fluid/platform/lock_guard_ptr.h +++ b/paddle/fluid/platform/lock_guard_ptr.h @@ -14,6 +14,7 @@ #pragma once #include + #include #include // NOLINT namespace paddle { diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 5476d244f60..382f96e83bf 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -1061,16 +1061,18 @@ static void SetDstMemoryQuantized( const size_t dst_dims = dst_tz.size(); MKLDNNMemoryFormat dst_fmt; - PADDLE_ENFORCE_LE(dst_dims, 5, platform::errors::InvalidArgument( - "Dst memory for quantization can not have " - "dims > 5. But received dst_dims is %d.", - dst_dims)); + PADDLE_ENFORCE_LE(dst_dims, 5, + platform::errors::InvalidArgument( + "Dst memory for quantization can not have " + "dims > 5. But received dst_dims is %d.", + dst_dims)); dst_fmt = platform::MKLDNNFormatForSize(dst_dims, output_format); - auto tmp_dst_md = platform::MKLDNNMemDesc( - {dst_tz}, paddle::framework::ToMKLDNNDataType( - framework::DataTypeTrait<T>::DataType()), - dst_fmt); + auto tmp_dst_md = + platform::MKLDNNMemDesc({dst_tz}, + paddle::framework::ToMKLDNNDataType( + framework::DataTypeTrait<T>::DataType()), + dst_fmt); dst_md.reset(new dnnl::memory::desc(tmp_dst_md)); dst_memory.reset( new dnnl::memory(*dst_md, engine, to_void_cast<T>(output_data))); diff --git a/paddle/fluid/platform/monitor.h b/paddle/fluid/platform/monitor.h index dc9abaf36d8..e7612f6dcb6 100644 --- a/paddle/fluid/platform/monitor.h +++ b/paddle/fluid/platform/monitor.h @@ -15,6 +15,7 @@ #pragma once #include + #include #include #include // NOLINT diff --git a/paddle/fluid/platform/os_info.cc b/paddle/fluid/platform/os_info.cc index 36dd7891d55..694f701b5ad 100644 --- a/paddle/fluid/platform/os_info.cc +++ b/paddle/fluid/platform/os_info.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/os_info.h" + #include #include #include diff --git a/paddle/fluid/platform/os_info_test.cc b/paddle/fluid/platform/os_info_test.cc index b3311f1d19e..149da6ba27a 100644 --- a/paddle/fluid/platform/os_info_test.cc +++ b/paddle/fluid/platform/os_info_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License.
#include "paddle/fluid/platform/os_info.h" + #include + #include "gtest/gtest.h" TEST(ThreadInfo, TestThreadIdUtils) { - using paddle::platform::GetCurrentThreadStdId; - using paddle::platform::GetCurrentThreadId; using paddle::platform::GetAllThreadIds; + using paddle::platform::GetCurrentThreadId; + using paddle::platform::GetCurrentThreadStdId; EXPECT_EQ(std::hash()(std::this_thread::get_id()), GetCurrentThreadId().std_tid); auto ids = GetAllThreadIds(); @@ -26,10 +28,10 @@ TEST(ThreadInfo, TestThreadIdUtils) { } TEST(ThreadInfo, TestThreadNameUtils) { - using paddle::platform::GetCurrentThreadStdId; + using paddle::platform::GetAllThreadNames; using paddle::platform::GetCurrentThreadName; + using paddle::platform::GetCurrentThreadStdId; using paddle::platform::SetCurrentThreadName; - using paddle::platform::GetAllThreadNames; SetCurrentThreadName("MainThread"); EXPECT_FALSE(SetCurrentThreadName("MainThread")); auto names = GetAllThreadNames(); diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 75abf36e676..c573650f179 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/platform/profiler.h" + #include // NOLINT #include #include @@ -20,7 +22,6 @@ limitations under the License. */ #include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/common_event.h" #include "paddle/fluid/platform/profiler/host_event_recorder.h" #include "paddle/fluid/platform/profiler/host_tracer.h" diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index 4ee95a530fb..f728a820bd7 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/platform/profiler/chrometracing_logger.h" + #include #include #include #include "glog/logging.h" - #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/profiler/chrometracing_logger.h" #include "paddle/fluid/platform/profiler/event_node.h" #include "paddle/fluid/platform/profiler/utils.h" @@ -304,9 +304,10 @@ void ChromeTracingLogger::HandleTypeKernel( blocks_per_sm = static_cast(kernel_info.grid_x * kernel_info.grid_y * kernel_info.grid_z) / device_property.multiProcessorCount; - warps_per_sm = blocks_per_sm * (kernel_info.block_x * kernel_info.block_y * - kernel_info.block_z) / - threads_per_warp; + warps_per_sm = + blocks_per_sm * + (kernel_info.block_x * kernel_info.block_y * kernel_info.block_z) / + threads_per_warp; occupancy = CalculateEstOccupancy( device_node.DeviceId(), kernel_info.registers_per_thread, kernel_info.static_shared_memory, kernel_info.dynamic_shared_memory, diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.h b/paddle/fluid/platform/profiler/chrometracing_logger.h index 8977ab748c6..12d98d1ef0c 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.h +++ b/paddle/fluid/platform/profiler/chrometracing_logger.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/platform/profiler/output_logger.h" namespace paddle { diff --git a/paddle/fluid/platform/profiler/common_event.h b/paddle/fluid/platform/profiler/common_event.h index cfdc3be110a..8fe3b150523 100644 --- a/paddle/fluid/platform/profiler/common_event.h +++ b/paddle/fluid/platform/profiler/common_event.h @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/platform/event.h" // import EventRole, TODO(TIEXING): remove later #include "paddle/fluid/platform/profiler/trace_event.h" diff --git a/paddle/fluid/platform/profiler/cpu_utilization.cc b/paddle/fluid/platform/profiler/cpu_utilization.cc index d507153d3f5..4319841c8a9 100644 --- a/paddle/fluid/platform/profiler/cpu_utilization.cc +++ b/paddle/fluid/platform/profiler/cpu_utilization.cc @@ -54,12 +54,13 @@ void CpuUtilization::RecordBeginTimeInfo() { if (stat_file != nullptr) { char temp_str[200]; uint64_t temp_lu; - int retval = fscanf( - stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 - "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64, - temp_str, &system_tms_start_.tms_utime, &nice_time_start_, - &system_tms_start_.tms_stime, &idle_start_, &iowait_start_, &irq_start_, - &softirq_start_, &steal_start_, &temp_lu, &temp_lu); + int retval = + fscanf(stat_file, + "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 + "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64, + temp_str, &system_tms_start_.tms_utime, &nice_time_start_, + &system_tms_start_.tms_stime, &idle_start_, &iowait_start_, + &irq_start_, &softirq_start_, &steal_start_, &temp_lu, &temp_lu); if (retval != 11) { LOG(WARNING) << "Failed to read cpu utilization information at record beginning." 
@@ -87,12 +88,13 @@ void CpuUtilization::RecordEndTimeInfo() { if (stat_file != nullptr) { char temp_str[200]; uint64_t temp_lu; - int retval = fscanf( - stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 - "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64, - temp_str, &system_tms_end_.tms_utime, &nice_time_end_, - &system_tms_end_.tms_stime, &idle_end_, &iowait_end_, &irq_end_, - &softirq_end_, &steal_end_, &temp_lu, &temp_lu); + int retval = + fscanf(stat_file, + "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 + "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64, + temp_str, &system_tms_end_.tms_utime, &nice_time_end_, + &system_tms_end_.tms_stime, &idle_end_, &iowait_end_, &irq_end_, + &softirq_end_, &steal_end_, &temp_lu, &temp_lu); if (retval != 11) { LOG(WARNING) diff --git a/paddle/fluid/platform/profiler/cpu_utilization.h b/paddle/fluid/platform/profiler/cpu_utilization.h index 7b05a6302cd..aa25ae5a43c 100644 --- a/paddle/fluid/platform/profiler/cpu_utilization.h +++ b/paddle/fluid/platform/profiler/cpu_utilization.h @@ -15,8 +15,10 @@ #pragma once #include + #include #include + #include "glog/logging.h" #ifdef _MSC_VER #include diff --git a/paddle/fluid/platform/profiler/cuda_tracer.cc b/paddle/fluid/platform/profiler/cuda_tracer.cc index 2d3e354dc27..9e32f7bbf19 100644 --- a/paddle/fluid/platform/profiler/cuda_tracer.cc +++ b/paddle/fluid/platform/profiler/cuda_tracer.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "paddle/fluid/platform/profiler/cuda_tracer.h" + #include #include + #include "glog/logging.h" #include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h" #include "paddle/fluid/platform/os_info.h" diff --git a/paddle/fluid/platform/profiler/cuda_tracer.h b/paddle/fluid/platform/profiler/cuda_tracer.h index 20a60521266..36c5ab4eb55 100644 --- a/paddle/fluid/platform/profiler/cuda_tracer.h +++ b/paddle/fluid/platform/profiler/cuda_tracer.h @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/platform/dynload/cupti.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/profiler/tracer_base.h" diff --git a/paddle/fluid/platform/profiler/cupti_data_process.cc b/paddle/fluid/platform/profiler/cupti_data_process.cc index da12dccb749..7cb8b597dcd 100644 --- a/paddle/fluid/platform/profiler/cupti_data_process.cc +++ b/paddle/fluid/platform/profiler/cupti_data_process.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/platform/profiler/cupti_data_process.h" + #include + #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/os_info.h" diff --git a/paddle/fluid/platform/profiler/cupti_data_process.h b/paddle/fluid/platform/profiler/cupti_data_process.h index 01b2e72ade4..7b800464734 100644 --- a/paddle/fluid/platform/profiler/cupti_data_process.h +++ b/paddle/fluid/platform/profiler/cupti_data_process.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/platform/dynload/cupti.h" #include "paddle/fluid/platform/profiler/trace_event_collector.h" diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc index de3411579d3..82363fcff63 100644 --- a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc @@ -9,7 +9,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/profiler/dump/deserialization_reader.h" + #include + #include "paddle/fluid/platform/profiler/extra_info.h" namespace paddle { diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.cc b/paddle/fluid/platform/profiler/dump/serialization_logger.cc index 73021f4362a..b8afe2af0e7 100644 --- a/paddle/fluid/platform/profiler/dump/serialization_logger.cc +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.cc @@ -9,9 +9,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "glog/logging.h" - #include "paddle/fluid/platform/profiler/dump/serialization_logger.h" + +#include "glog/logging.h" #include "paddle/fluid/platform/profiler/event_node.h" #include "paddle/fluid/platform/profiler/extra_info.h" #include "paddle/fluid/platform/profiler/utils.h" diff --git a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc index d294bfee58c..5253ecc505d 100644 --- a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc +++ b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc @@ -13,26 +13,25 @@ // limitations under the License. #include "gtest/gtest.h" - #include "paddle/fluid/platform/profiler/dump/deserialization_reader.h" #include "paddle/fluid/platform/profiler/dump/serialization_logger.h" #include "paddle/fluid/platform/profiler/event_node.h" #include "paddle/fluid/platform/profiler/event_python.h" -using paddle::platform::SerializationLogger; -using paddle::platform::DeserializationReader; -using paddle::platform::NodeTrees; -using paddle::platform::HostTraceEventNode; using paddle::platform::CudaRuntimeTraceEventNode; +using paddle::platform::DeserializationReader; +using paddle::platform::DeviceTraceEvent; using paddle::platform::DeviceTraceEventNode; using paddle::platform::HostTraceEvent; -using paddle::platform::RuntimeTraceEvent; -using paddle::platform::DeviceTraceEvent; -using paddle::platform::TracerEventType; +using paddle::platform::HostTraceEventNode; using paddle::platform::KernelEventInfo; using paddle::platform::MemcpyEventInfo; using paddle::platform::MemsetEventInfo; +using paddle::platform::NodeTrees; using paddle::platform::ProfilerResult; +using paddle::platform::RuntimeTraceEvent; +using paddle::platform::SerializationLogger; +using paddle::platform::TracerEventType; TEST(SerializationLoggerTest, dump_case0) { std::list<HostTraceEvent> host_events; diff --git a/paddle/fluid/platform/profiler/event_node.cc b/paddle/fluid/platform/profiler/event_node.cc index b909fb5f25a..e1af63ad890 100644 --- a/paddle/fluid/platform/profiler/event_node.cc +++ b/paddle/fluid/platform/profiler/event_node.cc @@ -12,6 +12,7 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler/event_node.h" #include + #include #include #include diff --git a/paddle/fluid/platform/profiler/event_python.cc b/paddle/fluid/platform/profiler/event_python.cc index 5c42c8e8bf6..abde62c6b14 100644 --- a/paddle/fluid/platform/profiler/event_python.cc +++ b/paddle/fluid/platform/profiler/event_python.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License.
*/ #include "paddle/fluid/platform/profiler/event_python.h" + #include "paddle/fluid/platform/profiler/chrometracing_logger.h" #include "paddle/fluid/platform/profiler/dump/deserialization_reader.h" #include "paddle/fluid/platform/profiler/dump/serialization_logger.h" diff --git a/paddle/fluid/platform/profiler/event_tracing.h b/paddle/fluid/platform/profiler/event_tracing.h index fcaba9a43ca..fd81c15f92a 100644 --- a/paddle/fluid/platform/profiler/event_tracing.h +++ b/paddle/fluid/platform/profiler/event_tracing.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/fluid/platform/event.h" #include "paddle/fluid/platform/profiler/trace_event.h" @@ -70,10 +71,11 @@ class RecordEvent { * @param level: Used to filter events, works like glog VLOG(level). * RecordEvent will works if HostTraceLevel >= level. */ - explicit RecordEvent(const char* name, const TracerEventType type = - TracerEventType::UserDefined, - uint32_t level = kDefaultTraceLevel, - const EventRole role = EventRole::kOrdinary); + explicit RecordEvent( + const char* name, + const TracerEventType type = TracerEventType::UserDefined, + uint32_t level = kDefaultTraceLevel, + const EventRole role = EventRole::kOrdinary); RecordEvent(const std::string& name, const std::string& attr, const TracerEventType type = TracerEventType::UserDefined, diff --git a/paddle/fluid/platform/profiler/host_event_recorder.h b/paddle/fluid/platform/profiler/host_event_recorder.h index afd41352465..1359c3b85a0 100644 --- a/paddle/fluid/platform/profiler/host_event_recorder.h +++ b/paddle/fluid/platform/profiler/host_event_recorder.h @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/os_info.h" @@ -58,7 +59,7 @@ class EventContainer { public: // Record an event template - void Record(Args &&... args) { + void Record(Args &&...args) { DoRecord(ContainsStdString(), std::forward(args)...); } @@ -112,7 +113,7 @@ class EventContainer { // Record an event with string arguments template - void DoRecord(std::true_type, Args &&... args) { + void DoRecord(std::true_type, Args &&...args) { auto *storage = GetEventStorage(); std::function allocator = [this](size_t size) { return GetStrBufFromArena(size); @@ -122,7 +123,7 @@ class EventContainer { // Record an event without any string argument template - void DoRecord(std::false_type, Args &&... args) { + void DoRecord(std::false_type, Args &&...args) { auto *storage = GetEventStorage(); new (storage) EventType(std::forward(args)...); } @@ -199,7 +200,7 @@ class ThreadEventRecorder { public: // Forward call to EventContainer::Record template - void RecordEvent(Args &&... args) { + void RecordEvent(Args &&...args) { base_evt_cntr_.Record(std::forward(args)...); } @@ -237,7 +238,7 @@ class HostEventRecorder { // Do your best to avoid using 'std::string' as the argument type. // It will cause deep-copy to harm performance. template - void RecordEvent(Args &&... args) { + void RecordEvent(Args &&...args) { GetThreadLocalRecorder()->RecordEvent(std::forward(args)...); } diff --git a/paddle/fluid/platform/profiler/host_tracer.cc b/paddle/fluid/platform/profiler/host_tracer.cc index b7eb53331b7..8a36a3a8bab 100644 --- a/paddle/fluid/platform/profiler/host_tracer.cc +++ b/paddle/fluid/platform/profiler/host_tracer.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/platform/profiler/host_tracer.h" + #include "glog/logging.h" #include "paddle/fluid/platform/flags.h" #include "paddle/fluid/platform/profiler/common_event.h" diff --git a/paddle/fluid/platform/profiler/mlu/cnpapi_data_process.cc b/paddle/fluid/platform/profiler/mlu/cnpapi_data_process.cc index 36abf77279d..7afdb5eb2a3 100644 --- a/paddle/fluid/platform/profiler/mlu/cnpapi_data_process.cc +++ b/paddle/fluid/platform/profiler/mlu/cnpapi_data_process.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/platform/profiler/mlu/cnpapi_data_process.h" + #include + #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/os_info.h" diff --git a/paddle/fluid/platform/profiler/mlu/mlu_tracer.cc b/paddle/fluid/platform/profiler/mlu/mlu_tracer.cc index 2d719a8bbfd..bbaafa3faa6 100644 --- a/paddle/fluid/platform/profiler/mlu/mlu_tracer.cc +++ b/paddle/fluid/platform/profiler/mlu/mlu_tracer.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "paddle/fluid/platform/profiler/mlu/mlu_tracer.h" + #include #include + #include "glog/logging.h" #include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h" #include "paddle/fluid/platform/os_info.h" diff --git a/paddle/fluid/platform/profiler/profiler.cc b/paddle/fluid/platform/profiler/profiler.cc index a417eda1509..8bcf856c01a 100644 --- a/paddle/fluid/platform/profiler/profiler.cc +++ b/paddle/fluid/platform/profiler/profiler.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/platform/profiler/profiler.h" + #include "glog/logging.h" #ifdef PADDLE_WITH_CUDA #include diff --git a/paddle/fluid/platform/profiler/profiler.h b/paddle/fluid/platform/profiler/profiler.h index ea346a4fb74..65a3bcc02d8 100644 --- a/paddle/fluid/platform/profiler/profiler.h +++ b/paddle/fluid/platform/profiler/profiler.h @@ -20,6 +20,7 @@ #include #include #include + #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/profiler/cpu_utilization.h" #include "paddle/fluid/platform/profiler/event_node.h" diff --git a/paddle/fluid/platform/profiler/profiler_test.cc b/paddle/fluid/platform/profiler/profiler_test.cc index f2c867ffff2..1f1fbcb71ec 100644 --- a/paddle/fluid/platform/profiler/profiler_test.cc +++ b/paddle/fluid/platform/profiler/profiler_test.cc @@ -14,6 +14,7 @@ #include #include + #include "glog/logging.h" #include "gtest/gtest.h" #ifdef PADDLE_WITH_CUDA @@ -27,11 +28,11 @@ #include "paddle/fluid/platform/profiler/profiler.h" TEST(ProfilerTest, TestHostTracer) { - using paddle::platform::ProfilerOptions; using paddle::platform::Profiler; + using paddle::platform::ProfilerOptions; + using paddle::platform::ProfilerResult; using paddle::platform::RecordInstantEvent; using paddle::platform::TracerEventType; - using paddle::platform::ProfilerResult; ProfilerOptions options; options.trace_level = 2; options.trace_switch = 3; @@ -58,8 +59,8 @@ TEST(ProfilerTest, TestHostTracer) { } TEST(ProfilerTest, TestCudaTracer) { - using paddle::platform::ProfilerOptions; using paddle::platform::Profiler; + using paddle::platform::ProfilerOptions; using paddle::platform::ProfilerResult; ProfilerOptions options; options.trace_level = 0; diff --git a/paddle/fluid/platform/profiler/test_event_node.cc b/paddle/fluid/platform/profiler/test_event_node.cc index b8d1306ad07..23ad917b57d 100644 --- a/paddle/fluid/platform/profiler/test_event_node.cc +++ b/paddle/fluid/platform/profiler/test_event_node.cc @@ -13,22 +13,21 @@ // limitations under the License. 
#include "gtest/gtest.h" - #include "paddle/fluid/platform/profiler/chrometracing_logger.h" #include "paddle/fluid/platform/profiler/event_node.h" using paddle::platform::ChromeTracingLogger; -using paddle::platform::NodeTrees; -using paddle::platform::HostTraceEventNode; using paddle::platform::CudaRuntimeTraceEventNode; +using paddle::platform::DeviceTraceEvent; using paddle::platform::DeviceTraceEventNode; using paddle::platform::HostTraceEvent; -using paddle::platform::RuntimeTraceEvent; -using paddle::platform::DeviceTraceEvent; -using paddle::platform::TracerEventType; +using paddle::platform::HostTraceEventNode; using paddle::platform::KernelEventInfo; using paddle::platform::MemcpyEventInfo; using paddle::platform::MemsetEventInfo; +using paddle::platform::NodeTrees; +using paddle::platform::RuntimeTraceEvent; +using paddle::platform::TracerEventType; TEST(NodeTreesTest, LogMe_case0) { std::list host_events; std::list runtime_events; @@ -194,8 +193,10 @@ TEST(NodeTreesTest, HandleTrees_case0) { } std::function host_event_node_handle( [&](HostTraceEventNode* a) { logger.LogHostTraceEventNode(*a); }); - std::function runtime_event_node_handle([&]( - CudaRuntimeTraceEventNode* a) { logger.LogRuntimeTraceEventNode(*a); }); + std::function runtime_event_node_handle( + [&](CudaRuntimeTraceEventNode* a) { + logger.LogRuntimeTraceEventNode(*a); + }); std::function device_event_node_handle( [&](DeviceTraceEventNode* a) { logger.LogDeviceTraceEventNode(*a); }); tree.HandleTrees(host_event_node_handle, runtime_event_node_handle, diff --git a/paddle/fluid/platform/profiler/trace_event_collector.h b/paddle/fluid/platform/profiler/trace_event_collector.h index 5f2bc9dc90d..d1593bc1bfc 100644 --- a/paddle/fluid/platform/profiler/trace_event_collector.h +++ b/paddle/fluid/platform/profiler/trace_event_collector.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/platform/profiler/trace_event.h" namespace paddle { diff --git a/paddle/fluid/platform/profiler/utils.h b/paddle/fluid/platform/profiler/utils.h index 06d1636c461..433fd0b825a 100644 --- a/paddle/fluid/platform/profiler/utils.h +++ b/paddle/fluid/platform/profiler/utils.h @@ -15,6 +15,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/platform/dynload/cupti.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/os_info.h" @@ -26,8 +27,9 @@ template std::string string_format(const std::string& format, Args... args) { int size_s = std::snprintf(nullptr, 0, format.c_str(), args...) 
+ 1; // Extra space for '\0' - PADDLE_ENFORCE_GE(size_s, 0, platform::errors::Fatal( - "Error during profiler data formatting.")); + PADDLE_ENFORCE_GE( + size_s, 0, + platform::errors::Fatal("Error during profiler data formatting.")); auto size = static_cast(size_s); auto buf = std::make_unique(size); std::snprintf(buf.get(), size, format.c_str(), args...); diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index f64e05504aa..ae856044f8f 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ b/paddle/fluid/platform/profiler_helper.h @@ -390,8 +390,8 @@ void SetEvent(bool merge_thread, const Event &analyze_event, index++; } if (split_pos == -1 && !main_thread_event_name.count(rit->name())) { - event_name = "thread" + std::to_string(rit->thread_id()) + "::" + - rit->name(); + event_name = "thread" + std::to_string(rit->thread_id()) + + "::" + rit->name(); } else { if (!main_thread_event_name.count(rit->name())) { event_name = diff --git a/paddle/fluid/platform/profiler_test.cc b/paddle/fluid/platform/profiler_test.cc index e9f84a49246..18d4b4dc834 100644 --- a/paddle/fluid/platform/profiler_test.cc +++ b/paddle/fluid/platform/profiler_test.cc @@ -36,24 +36,24 @@ TEST(Event, CpuElapsedTime) { TEST(RecordEvent, RecordEvent) { using paddle::platform::Event; + using paddle::platform::EventRole; + using paddle::platform::EventSortingKey; using paddle::platform::EventType; - using paddle::platform::RecordEvent; - using paddle::platform::PushEvent; using paddle::platform::PopEvent; using paddle::platform::ProfilerState; - using paddle::platform::EventSortingKey; - using paddle::platform::EventRole; + using paddle::platform::PushEvent; + using paddle::platform::RecordEvent; ProfilerState state = ProfilerState::kCPU; EnableProfiler(state); /* Usage 1: - * PushEvent(evt_name); - * ... - * code to be analyzed - * ... - * PopEvent(evt_name); - */ + * PushEvent(evt_name); + * ... + * code to be analyzed + * ... + * PopEvent(evt_name); + */ LOG(INFO) << "Usage 1: PushEvent & PopEvent"; for (int loop = 0; loop < 3; ++loop) { for (int i = 1; i < 5; ++i) { diff --git a/paddle/fluid/platform/resource_pool.h b/paddle/fluid/platform/resource_pool.h index f01d006d5b2..737001a50ab 100644 --- a/paddle/fluid/platform/resource_pool.h +++ b/paddle/fluid/platform/resource_pool.h @@ -20,6 +20,7 @@ #include #include #include + #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" diff --git a/paddle/fluid/platform/stream/cuda_stream.cc b/paddle/fluid/platform/stream/cuda_stream.cc index e3e735d03ab..d7f60e4019d 100644 --- a/paddle/fluid/platform/stream/cuda_stream.cc +++ b/paddle/fluid/platform/stream/cuda_stream.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/stream/cuda_stream.h" + #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc index 6fa326d57bc..bb9a405798b 100644 --- a/paddle/fluid/platform/stream_callback_manager.cc +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -13,6 +13,7 @@ // limitations under the License. 
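For reference, the string_format helper whose PADDLE_ENFORCE_GE call is reflowed above uses the standard two-pass snprintf idiom: a first call with a null buffer measures the output, then the real call writes into an exactly-sized buffer. A self-contained sketch that keeps the same logic but swaps the Paddle error macro for a plain exception:

// Two-pass snprintf sizing, as in profiler/utils.h, with simplified errors.
#include <cstdio>
#include <memory>
#include <stdexcept>
#include <string>

template <typename... Args>
std::string string_format(const std::string& format, Args... args) {
  int size_s = std::snprintf(nullptr, 0, format.c_str(), args...) +
               1;  // extra space for '\0'
  if (size_s <= 0) throw std::runtime_error("formatting error");
  auto size = static_cast<size_t>(size_s);
  auto buf = std::make_unique<char[]>(size);
  std::snprintf(buf.get(), size, format.c_str(), args...);
  return std::string(buf.get(), size - 1);  // drop the trailing '\0'
}

int main() { std::printf("%s\n", string_format("%d ms", 42).c_str()); }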
#include "paddle/fluid/platform/stream_callback_manager.h" + #include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { diff --git a/paddle/fluid/platform/transform.h b/paddle/fluid/platform/transform.h index 6f714a67703..32c759d0102 100644 --- a/paddle/fluid/platform/transform.h +++ b/paddle/fluid/platform/transform.h @@ -25,6 +25,7 @@ limitations under the License. */ #if defined(__NVCC__) || defined(__HIPCC__) #include #include + #include "paddle/fluid/platform/details/cuda_transform_iterator_cast.h" #endif diff --git a/paddle/fluid/platform/transform_test.cu b/paddle/fluid/platform/transform_test.cu index 2e7b8b402f6..1caa2e87707 100644 --- a/paddle/fluid/platform/transform_test.cu +++ b/paddle/fluid/platform/transform_test.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memory.h" @@ -38,10 +39,10 @@ class Multiply { using paddle::memory::Alloc; using paddle::memory::Copy; -using paddle::platform::CPUPlace; -using paddle::platform::CUDAPlace; using paddle::platform::CPUDeviceContext; +using paddle::platform::CPUPlace; using paddle::platform::CUDADeviceContext; +using paddle::platform::CUDAPlace; using paddle::platform::Transform; diff --git a/paddle/fluid/pybind/ascend_wrapper_py.cc b/paddle/fluid/pybind/ascend_wrapper_py.cc index fdf3a12a81f..8c1eb2c1b90 100644 --- a/paddle/fluid/pybind/ascend_wrapper_py.cc +++ b/paddle/fluid/pybind/ascend_wrapper_py.cc @@ -26,11 +26,13 @@ limitations under the License. */ #include #include #include + #include #include #include #include #include + #include "paddle/fluid/framework/fleet/ascend_wrapper.h" #include "paddle/fluid/platform/device/npu/ascend_npu_info.h" #include "paddle/fluid/platform/enforce.h" @@ -78,8 +80,9 @@ ge::Status ge_initialize( py::gil_scoped_release release; auto init_options = convert_map(options); ge::Status res = ge::GEInitialize(init_options); - PADDLE_ENFORCE_EQ(res, ge::SUCCESS, platform::errors::Fatal( - "ge initialize not success:%d", res)); + PADDLE_ENFORCE_EQ( + res, ge::SUCCESS, + platform::errors::Fatal("ge initialize not success:%d", res)); py::gil_scoped_acquire acquire; return res; } @@ -253,7 +256,7 @@ void BindAscendGraph(py::module *m) { return std::unique_ptr( new ge::Session(convert_map(options))); })) - .def("add_graph", (ge::Status (Session::*)(uint32_t, const Graph &)) & + .def("add_graph", (ge::Status(Session::*)(uint32_t, const Graph &)) & Session::AddGraph) .def("add_graph", [](Session &ss, uint32_t index, const Graph &graph, @@ -261,14 +264,15 @@ void BindAscendGraph(py::module *m) { return ss.AddGraph(index, graph, convert_map(options)); }) .def("remove_graph", &Session::RemoveGraph) - .def("run_graph", - [](Session &ss, uint32_t graphId, - const std::vector &inputs) -> py::tuple { - std::vector outputs; - ge::Status res = ss.RunGraph(graphId, inputs, outputs); - return py::make_tuple(outputs, res); - }, - py::call_guard()) + .def( + "run_graph", + [](Session &ss, uint32_t graphId, + const std::vector &inputs) -> py::tuple { + std::vector outputs; + ge::Status res = ss.RunGraph(graphId, inputs, outputs); + return py::make_tuple(outputs, res); + }, + py::call_guard()) .def("build_graph", &Session::BuildGraph) .def("run_graph_async", &Session::RunGraphAsync) #ifdef PADDLE_WITH_ASCEND_STRING @@ -385,7 +389,7 @@ void BindAscendGraph(py::module *m) { }) #ifdef 
PADDLE_WITH_ASCEND_STRING .def("get_input_desc", - (TensorDesc (Operator::*)(uint32_t) const) & Operator::GetInputDesc) + (TensorDesc(Operator::*)(uint32_t) const) & Operator::GetInputDesc) .def("get_input_desc", [](Operator &op, const std::string &name) { return op.GetInputDescByName(name.c_str()); @@ -420,7 +424,7 @@ void BindAscendGraph(py::module *m) { return op.GetOutputDescByName(name.c_str()); }) .def("get_output_desc", - (TensorDesc (Operator::*)(uint32_t) const) & Operator::GetOutputDesc) + (TensorDesc(Operator::*)(uint32_t) const) & Operator::GetOutputDesc) .def("update_output_desc", static_cast(&Operator::UpdateOutputDesc)) @@ -779,19 +783,18 @@ void BindAscendGraph(py::module *m) { .def("get_tensor_desc", &Tensor::GetTensorDesc) // .def("set_data", (graphStatus(Tensor::*)(std::vector &&)) & // Tensor::SetData) - .def("set_data", (graphStatus (Tensor::*)(const std::vector &)) & + .def("set_data", (graphStatus(Tensor::*)(const std::vector &)) & Tensor::SetData) .def("set_data", - (graphStatus (Tensor::*)(const uint8_t *, size_t)) & Tensor::SetData) + (graphStatus(Tensor::*)(const uint8_t *, size_t)) & Tensor::SetData) #ifdef PADDLE_WITH_ASCEND_STRING - .def("set_data", - (graphStatus (Tensor::*)(const char *)) & Tensor::SetData) + .def("set_data", (graphStatus(Tensor::*)(const char *)) & Tensor::SetData) #else .def("set_data", (graphStatus (Tensor::*)(const std::string &)) & Tensor::SetData) #endif .def("set_data", - (graphStatus (Tensor::*)(const std::vector &)) & + (graphStatus(Tensor::*)(const std::vector &)) & Tensor::SetData) .def("get_data", @@ -813,8 +816,9 @@ void BindAscendGraph(py::module *m) { .def(py::init(), py::arg("shape"), py::arg("format") = FORMAT_ND, py::arg("dt") = DT_FLOAT) .def(py::init()) - .def("update", (void (TensorDesc::*)(const Shape &, Format, DataType)) & - TensorDesc::Update, + .def("update", + (void(TensorDesc::*)(const Shape &, Format, DataType)) & + TensorDesc::Update, py::arg("shape"), py::arg("format") = FORMAT_ND, py::arg("dt") = DT_FLOAT) .def("set_shape", &TensorDesc::SetShape) diff --git a/paddle/fluid/pybind/bind_cost_model.cc b/paddle/fluid/pybind/bind_cost_model.cc index a4a40f1fd02..ef2fe0dd3d4 100644 --- a/paddle/fluid/pybind/bind_cost_model.cc +++ b/paddle/fluid/pybind/bind_cost_model.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/pybind/bind_cost_model.h" #include + #include "paddle/fluid/framework/ir/cost_model.h" #include "paddle/fluid/framework/program_desc.h" diff --git a/paddle/fluid/pybind/bind_fleet_executor.cc b/paddle/fluid/pybind/bind_fleet_executor.cc index 8491d1e2249..6bd03203744 100644 --- a/paddle/fluid/pybind/bind_fleet_executor.cc +++ b/paddle/fluid/pybind/bind_fleet_executor.cc @@ -13,10 +13,13 @@ // limitations under the License. 
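The repeated edits above, e.g. "(TensorDesc (Operator::*)(uint32_t) const) &" becoming "(TensorDesc(Operator::*)(uint32_t) const) &", only delete a space. The cast itself is what lets the binding pick one overload of an overloaded member function, since a bare "&Operator::GetInputDesc" would be ambiguous. A standalone illustration with a hypothetical Widget class:

// Casting to an exact pointer-to-member type resolves the overload set;
// the clang-format change above only touches the whitespace in the cast.
#include <cassert>
#include <string>

struct Widget {
  int desc(unsigned index) const { return static_cast<int>(index); }
  int desc(const std::string& name) const {
    return static_cast<int>(name.size());
  }
};

int main() {
  auto by_index = (int(Widget::*)(unsigned) const) & Widget::desc;
  Widget w;
  assert((w.*by_index)(3u) == 3);  // the unsigned overload was selected
  return 0;
}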
#include "paddle/fluid/pybind/bind_fleet_executor.h" + #include #include + #include #include + #include "paddle/fluid/distributed/fleet_executor/dist_model.h" #include "paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h" #include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" @@ -62,13 +65,13 @@ struct npy_format_descriptor { namespace paddle { namespace pybind { -using paddle::distributed::FleetExecutor; -using paddle::distributed::TaskNode; -using paddle::distributed::DistModelConfig; using paddle::distributed::DistModel; +using paddle::distributed::DistModelConfig; using paddle::distributed::DistModelDataBuf; -using paddle::distributed::DistModelTensor; using paddle::distributed::DistModelDataType; +using paddle::distributed::DistModelTensor; +using paddle::distributed::FleetExecutor; +using paddle::distributed::TaskNode; using paddle::framework::OpDesc; using paddle::framework::ProgramDesc; @@ -217,33 +220,34 @@ void BindFleetExecutor(py::module* m) { .def("reset", &DistModelDataBufReset) .def("reset", &DistModelDataBufReset) .def("length", &DistModelDataBuf::length) - .def("tolist", [](DistModelDataBuf& self, - const std::string& dtype) -> py::list { - py::list l; - if (dtype == "int32") { - auto* data = static_cast(self.data()); - auto size = self.length() / sizeof(int32_t); - l = py::cast(std::vector(data, data + size)); - } else if (dtype == "int64") { - auto* data = static_cast(self.data()); - auto size = self.length() / sizeof(int64_t); - l = py::cast(std::vector(data, data + size)); - } else if (dtype == "float32") { - auto* data = static_cast(self.data()); - auto size = self.length() / sizeof(float); - l = py::cast(std::vector(data, data + size)); - } else if (dtype == "float16") { - auto* data = static_cast(self.data()); - auto size = self.length() / sizeof(paddle::platform::float16); - l = py::cast( - std::vector(data, data + size)); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported data type. Now only supports INT32, INT64, " - "FLOAT16 and FLOAT32.")); - } - return l; - }); + .def("tolist", + [](DistModelDataBuf& self, const std::string& dtype) -> py::list { + py::list l; + if (dtype == "int32") { + auto* data = static_cast(self.data()); + auto size = self.length() / sizeof(int32_t); + l = py::cast(std::vector(data, data + size)); + } else if (dtype == "int64") { + auto* data = static_cast(self.data()); + auto size = self.length() / sizeof(int64_t); + l = py::cast(std::vector(data, data + size)); + } else if (dtype == "float32") { + auto* data = static_cast(self.data()); + auto size = self.length() / sizeof(float); + l = py::cast(std::vector(data, data + size)); + } else if (dtype == "float16") { + auto* data = + static_cast(self.data()); + auto size = self.length() / sizeof(paddle::platform::float16); + l = py::cast( + std::vector(data, data + size)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported data type. Now only supports INT32, INT64, " + "FLOAT16 and FLOAT32.")); + } + return l; + }); py::class_(*m, "DistModelTensor") .def(py::init<>()) diff --git a/paddle/fluid/pybind/communication.cc b/paddle/fluid/pybind/communication.cc index aef02d65b4d..418804df028 100644 --- a/paddle/fluid/pybind/communication.cc +++ b/paddle/fluid/pybind/communication.cc @@ -12,16 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/pybind/communication.h" + #include #include #include #include #include + #include #include #include "paddle/fluid/distributed/store/tcp_store.h" -#include "paddle/fluid/pybind/communication.h" namespace py = pybind11; @@ -35,22 +37,24 @@ void BindTCPStore(py::module *m) { py::class_>( *m, "Store") .def(py::init<>()) - .def("set", - [](distributed::Store &self, const std::string &key, - const std::string &value) { - std::vector data(value.begin(), value.end()); - self.set(key, data); - }, - py::arg("key"), py::arg("value"), - py::call_guard()) - .def("get", - [](distributed::Store &self, - const std::string &key) -> py::bytes { - auto data = self.get(key); - return py::bytes(reinterpret_cast(data.data()), - data.size()); - }, - py::arg("key"), py::call_guard()) + .def( + "set", + [](distributed::Store &self, const std::string &key, + const std::string &value) { + std::vector data(value.begin(), value.end()); + self.set(key, data); + }, + py::arg("key"), py::arg("value"), + py::call_guard()) + .def( + "get", + [](distributed::Store &self, + const std::string &key) -> py::bytes { + auto data = self.get(key); + return py::bytes(reinterpret_cast(data.data()), + data.size()); + }, + py::arg("key"), py::call_guard()) .def("add", &distributed::Store::add, py::call_guard()) .def("wait", &distributed::Store::wait, diff --git a/paddle/fluid/pybind/communicator_py.cc b/paddle/fluid/pybind/communicator_py.cc index 723d7f31972..0cb5aa6ef70 100644 --- a/paddle/fluid/pybind/communicator_py.cc +++ b/paddle/fluid/pybind/communicator_py.cc @@ -15,16 +15,17 @@ limitations under the License. */ #include "paddle/fluid/pybind/communicator_py.h" #include + #include #include #include #include -#include "paddle/fluid/framework/program_desc.h" -#include "pybind11/pybind11.h" +#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/distributed/communicator.h" #include "paddle/fluid/operators/distributed/large_scale_kv.h" #include "paddle/fluid/operators/distributed/ps/service/communicator/communicator_common.h" +#include "pybind11/pybind11.h" namespace py = pybind11; diff --git a/paddle/fluid/pybind/compatible.cc b/paddle/fluid/pybind/compatible.cc index cfe87a86cf0..013d0cc0c60 100644 --- a/paddle/fluid/pybind/compatible.cc +++ b/paddle/fluid/pybind/compatible.cc @@ -13,23 +13,25 @@ // limitations under the License. 
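The BindTCPStore hunk above shows the transformation applied throughout this patch: clang-format now starts a lambda-based .def on its own line. In isolation, the pattern is a binding whose body runs with the GIL released through py::call_guard. A sketch assuming pybind11 is available, with KVStore as a hypothetical stand-in for distributed::Store:

// A lambda .def with keyword arguments and a GIL-releasing call guard,
// mirroring the reformatted set/get bindings above. Not Paddle code.
#include <pybind11/pybind11.h>

#include <map>
#include <string>

namespace py = pybind11;

struct KVStore {
  std::map<std::string, std::string> data;
};

PYBIND11_MODULE(demo, m) {
  py::class_<KVStore>(m, "KVStore")
      .def(py::init<>())
      .def(
          "set",
          [](KVStore& self, const std::string& key, const std::string& value) {
            self.data[key] = value;  // imagine a blocking network call here
          },
          py::arg("key"), py::arg("value"),
          py::call_guard<py::gil_scoped_release>());
}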
#include "paddle/fluid/pybind/compatible.h" + #include #include + #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/pybind/pybind_boost_headers.h" namespace py = pybind11; -using paddle::framework::compatible::OpAttrVariantT; -using paddle::framework::compatible::OpUpdateInfo; using paddle::framework::compatible::OpAttrInfo; -using paddle::framework::compatible::OpInputOutputInfo; +using paddle::framework::compatible::OpAttrVariantT; using paddle::framework::compatible::OpBugfixInfo; -using paddle::framework::compatible::OpUpdateType; -using paddle::framework::compatible::OpUpdateBase; -using paddle::framework::compatible::OpVersionDesc; using paddle::framework::compatible::OpCheckpoint; +using paddle::framework::compatible::OpInputOutputInfo; +using paddle::framework::compatible::OpUpdateBase; +using paddle::framework::compatible::OpUpdateInfo; +using paddle::framework::compatible::OpUpdateType; using paddle::framework::compatible::OpVersion; +using paddle::framework::compatible::OpVersionDesc; namespace paddle { namespace pybind { diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc index 8b48d0b4e44..89a3904d000 100644 --- a/paddle/fluid/pybind/const_value.cc +++ b/paddle/fluid/pybind/const_value.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/pybind/const_value.h" + #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/pybind/crypto.cc b/paddle/fluid/pybind/crypto.cc index 8fbf395bf18..07a9e4021ce 100644 --- a/paddle/fluid/pybind/crypto.cc +++ b/paddle/fluid/pybind/crypto.cc @@ -97,11 +97,12 @@ void BindAESCipher(py::module* m) { void BindCipherFactory(py::module* m) { py::class_(*m, "CipherFactory") .def(py::init<>()) - .def_static("create_cipher", - [](const std::string& config_file) { - return CipherFactory::CreateCipher(config_file); - }, - py::arg("config_file") = std::string()); + .def_static( + "create_cipher", + [](const std::string& config_file) { + return CipherFactory::CreateCipher(config_file); + }, + py::arg("config_file") = std::string()); } void BindCipherUtils(py::module* m) { diff --git a/paddle/fluid/pybind/cuda_streams_py.cc b/paddle/fluid/pybind/cuda_streams_py.cc index 64c145c94f9..54080d5e096 100644 --- a/paddle/fluid/pybind/cuda_streams_py.cc +++ b/paddle/fluid/pybind/cuda_streams_py.cc @@ -12,13 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/pybind/cuda_streams_py.h" + #include #include #include "paddle/fluid/platform/device_event_base.h" #include "paddle/fluid/platform/event.h" #include "paddle/fluid/platform/stream/cuda_stream.h" -#include "paddle/fluid/pybind/cuda_streams_py.h" namespace py = pybind11; @@ -28,29 +29,31 @@ void BindCudaStream(py::module *m_ptr) { auto &m = *m_ptr; // Bind Methods - m.def("_get_current_stream", - [](int deviceId) { + m.def( + "_get_current_stream", + [](int deviceId) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - return paddle::platform::stream::get_current_stream(deviceId); + return paddle::platform::stream::get_current_stream(deviceId); #else - PADDLE_THROW(platform::errors::Unavailable( - "Paddle is not compiled with CUDA. Cannot visit cuda current" - "stream.")); + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CUDA. 
Cannot visit cuda current" + "stream.")); #endif - }, - py::return_value_policy::reference); + }, + py::return_value_policy::reference); - m.def("_set_current_stream", - [](paddle::platform::stream::CUDAStream &stream) { + m.def( + "_set_current_stream", + [](paddle::platform::stream::CUDAStream &stream) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - return paddle::platform::stream::set_current_stream(&stream); + return paddle::platform::stream::set_current_stream(&stream); #else - PADDLE_THROW(platform::errors::Unavailable( - "Paddle is not compiled with CUDA. Cannot set cuda current " - "stream.")); + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CUDA. Cannot set cuda current " + "stream.")); #endif - }, - py::return_value_policy::reference); + }, + py::return_value_policy::reference); m.def("_device_synchronize", [](int device_id) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -94,12 +97,13 @@ void BindCudaStream(py::module *m_ptr) { )DOC") #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - .def("wait_event", - [](paddle::platform::stream::CUDAStream &self, - paddle::platform::CudaEvent &event) { - self.WaitEvent(event.GetRawCudaEvent()); - }, - R"DOC( + .def( + "wait_event", + [](paddle::platform::stream::CUDAStream &self, + paddle::platform::CudaEvent &event) { + self.WaitEvent(event.GetRawCudaEvent()); + }, + R"DOC( Makes all future work submitted to stream wait for all work captured in event. Parameters: @@ -115,15 +119,16 @@ void BindCudaStream(py::module *m_ptr) { s.wait_event(event) )DOC") - .def("wait_stream", - [](paddle::platform::stream::CUDAStream &self, - paddle::platform::stream::CUDAStream &stream) { - paddle::platform::CudaEvent event; - event.Record(stream.raw_stream()); - - self.WaitEvent(event.GetRawCudaEvent()); - }, - R"DOC( + .def( + "wait_stream", + [](paddle::platform::stream::CUDAStream &self, + paddle::platform::stream::CUDAStream &stream) { + paddle::platform::CudaEvent event; + event.Record(stream.raw_stream()); + + self.WaitEvent(event.GetRawCudaEvent()); + }, + R"DOC( Synchronizes with the given stream. Parameters: @@ -139,11 +144,12 @@ void BindCudaStream(py::module *m_ptr) { s1.wait_stream(s2) )DOC") - .def("query", - [](paddle::platform::stream::CUDAStream &self) { - return self.Query(); - }, - R"DOC( + .def( + "query", + [](paddle::platform::stream::CUDAStream &self) { + return self.Query(); + }, + R"DOC( Return the status whether if all operations in stream have completed. Returns: A boolean value. @@ -157,11 +163,12 @@ void BindCudaStream(py::module *m_ptr) { is_done = s.query() )DOC") - .def("synchronize", - [](paddle::platform::stream::CUDAStream &self) { - self.Synchronize(); - }, - R"DOC( + .def( + "synchronize", + [](paddle::platform::stream::CUDAStream &self) { + self.Synchronize(); + }, + R"DOC( Waits for stream tasks to complete. Examples: @@ -173,16 +180,17 @@ void BindCudaStream(py::module *m_ptr) { s.synchronize() )DOC") - .def("record_event", - [](paddle::platform::stream::CUDAStream &self, - paddle::platform::CudaEvent *event) { - if (event == nullptr) { - event = new paddle::platform::CudaEvent(); - } - event->Record(self.raw_stream()); - return event; - }, - R"DOC( + .def( + "record_event", + [](paddle::platform::stream::CUDAStream &self, + paddle::platform::CudaEvent *event) { + if (event == nullptr) { + event = new paddle::platform::CudaEvent(); + } + event->Record(self.raw_stream()); + return event; + }, + R"DOC( Record a CUDA event in the stream. 
Parameters: @@ -201,7 +209,7 @@ void BindCudaStream(py::module *m_ptr) { event = s.record_event() )DOC", - py::arg("event") = nullptr) + py::arg("event") = nullptr) .def_property_readonly( "cuda_stream", [](paddle::platform::stream::CUDAStream &self) { @@ -225,32 +233,33 @@ void BindCudaStream(py::module *m_ptr) { )DOC") #endif - .def("__init__", - [](paddle::platform::stream::CUDAStream &self, - platform::CUDAPlace *device, int priority) { + .def( + "__init__", + [](paddle::platform::stream::CUDAStream &self, + platform::CUDAPlace *device, int priority) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (priority != 1 && priority != 2) { - PADDLE_THROW(platform::errors::InvalidArgument( - "Priority should be 1(high) or 2(normal) ")); - } - auto prio = paddle::platform::stream::Priority(priority); - auto stream_flag = - paddle::platform::stream::StreamFlag::kStreamNonBlocking; - - if (device == nullptr) { - int curr_device_id = platform::GetCurrentDeviceId(); - auto device_tmp = platform::CUDAPlace(curr_device_id); - device = &device_tmp; - } - - new (&self) paddle::platform::stream::CUDAStream(*device, prio, - stream_flag); + if (priority != 1 && priority != 2) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Priority should be 1(high) or 2(normal) ")); + } + auto prio = paddle::platform::stream::Priority(priority); + auto stream_flag = + paddle::platform::stream::StreamFlag::kStreamNonBlocking; + + if (device == nullptr) { + int curr_device_id = platform::GetCurrentDeviceId(); + auto device_tmp = platform::CUDAPlace(curr_device_id); + device = &device_tmp; + } + + new (&self) paddle::platform::stream::CUDAStream(*device, prio, + stream_flag); #else PADDLE_THROW(platform::errors::Unavailable( "Class CUDAStream can only be initialized on the GPU platform.")); #endif - }, - py::arg("device") = nullptr, py::arg("priority") = 2) + }, + py::arg("device") = nullptr, py::arg("priority") = 2) .def( "__init__", [](paddle::platform::stream::CUDAStream &self, int device, @@ -315,15 +324,16 @@ void BindCudaStream(py::module *m_ptr) { )DOC") #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - .def("record", - [](paddle::platform::CudaEvent &self, - paddle::platform::stream::CUDAStream *stream) { - if (stream == nullptr) { - stream = paddle::platform::stream::get_current_stream(-1); - } - self.Record(stream->raw_stream()); - }, - R"DOC( + .def( + "record", + [](paddle::platform::CudaEvent &self, + paddle::platform::stream::CUDAStream *stream) { + if (stream == nullptr) { + stream = paddle::platform::stream::get_current_stream(-1); + } + self.Record(stream->raw_stream()); + }, + R"DOC( Records the event in the given stream. Parameters: @@ -338,10 +348,11 @@ void BindCudaStream(py::module *m_ptr) { event.record() )DOC", - py::arg("stream") = nullptr) - .def("query", - [](paddle::platform::CudaEvent &self) { return self.Query(); }, - R"DOC( + py::arg("stream") = nullptr) + .def( + "query", + [](paddle::platform::CudaEvent &self) { return self.Query(); }, + R"DOC( Queries the event's status. Returns: A boolean which indicates all work currently captured by the event has been completed. @@ -355,8 +366,9 @@ void BindCudaStream(py::module *m_ptr) { is_done = event.query() )DOC") - .def("synchronize", - [](paddle::platform::CudaEvent &self) { self.Synchronize(); }, R"DOC( + .def( + "synchronize", + [](paddle::platform::CudaEvent &self) { self.Synchronize(); }, R"DOC( Waits for an event to complete. 
Examples: @@ -369,22 +381,23 @@ void BindCudaStream(py::module *m_ptr) { )DOC") #endif - .def("__init__", - [](paddle::platform::CudaEvent &self, bool enable_timing, - bool blocking, bool interprocess) { + .def( + "__init__", + [](paddle::platform::CudaEvent &self, bool enable_timing, + bool blocking, bool interprocess) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - unsigned int flags = platform::GenerateDeviceEventFlag( - enable_timing, blocking, interprocess); - new (&self) paddle::platform::CudaEvent(flags); + unsigned int flags = platform::GenerateDeviceEventFlag( + enable_timing, blocking, interprocess); + new (&self) paddle::platform::CudaEvent(flags); #else - PADDLE_THROW(platform::errors::Unavailable( - "Class CUDAEvent can only be initialized on the GPU " - "platform.")); + PADDLE_THROW(platform::errors::Unavailable( + "Class CUDAEvent can only be initialized on the GPU " + "platform.")); #endif - }, - py::arg("enable_timing") = false, py::arg("blocking") = false, - py::arg("interprocess") = false); + }, + py::arg("enable_timing") = false, py::arg("blocking") = false, + py::arg("interprocess") = false); } } // namespace pybind diff --git a/paddle/fluid/pybind/data_set_py.cc b/paddle/fluid/pybind/data_set_py.cc index 5e2274cb651..700bd458a58 100644 --- a/paddle/fluid/pybind/data_set_py.cc +++ b/paddle/fluid/pybind/data_set_py.cc @@ -24,6 +24,7 @@ limitations under the License. */ #include #include #include + #include "google/protobuf/io/zero_copy_stream_impl.h" #include "google/protobuf/text_format.h" #include "paddle/fluid/framework/async_executor.h" diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index 6636fc8aca5..3d1a81da6f3 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -109,132 +109,141 @@ void BindDistributed(py::module *m) { .def("rank", &distributed::ProcessGroup::GetRank) .def("size", &distributed::ProcessGroup::GetSize) .def("name", &distributed::ProcessGroup::GetBackendName) - .def("allreduce", - [](distributed::ProcessGroup &self, py::handle py_tensor, - distributed::ReduceOp op) { - auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); - distributed::AllreduceOptions opts; - opts.reduce_op = op; - auto dense = - std::dynamic_pointer_cast(tensor.impl()); - std::vector tensors = {*dense}; - return self.AllReduce(tensors, tensors, opts); - }, - py::arg("tensor"), py::arg("op") = distributed::ReduceOp::SUM, - py::call_guard()) - - .def("broadcast", - [](distributed::ProcessGroup &self, py::handle py_tensor, - int source_rank) { - auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); - distributed::BroadcastOptions opts; - opts.source_rank = source_rank; - auto dense = - std::dynamic_pointer_cast(tensor.impl()); - std::vector tensors = {*dense}; - return self.Broadcast(tensors, tensors, opts); - }, - py::arg("tensor"), py::arg("source_rank"), - py::call_guard()) - - .def("barrier", - [](distributed::ProcessGroup &self, std::vector place_ids) { - distributed::BarrierOptions opts; - opts.place_ids = place_ids; - return self.Barrier(opts); - }, - py::arg("place_ids") = std::vector{}, - py::call_guard()) - - .def("send", - [](distributed::ProcessGroup &self, py::handle py_tensor, - int dst) { - auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); - auto dense = - std::dynamic_pointer_cast(tensor.impl()); - std::vector tensors = {*dense}; - return self.Send(tensors, dst); - }, - py::arg("tensor"), py::arg("dst"), - py::call_guard()) - - .def("recv", - 
[](distributed::ProcessGroup &self, py::handle py_tensor, - int src) { - auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); - auto dense = - std::dynamic_pointer_cast(tensor.impl()); - std::vector tensors = {*dense}; - return self.Recv(tensors, src); - }, - py::arg("tensor"), py::arg("src"), - py::call_guard()) - - .def("all_gather", - [](distributed::ProcessGroup &self, py::handle py_in_tensor, - py::handle py_out_tensor) { - auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); - auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); - auto in_dense = std::dynamic_pointer_cast( - in_tensor.impl()); - auto out_dense = std::dynamic_pointer_cast( - out_tensor.impl()); - std::vector in_tensors = {*in_dense}; - std::vector out_tensors = {*out_dense}; - return self.AllGather(in_tensors, out_tensors); - }, - py::arg("in"), py::arg("out"), - py::call_guard()) - - .def("alltoall", - [](distributed::ProcessGroup &self, py::handle py_in_tensor, - py::handle py_out_tensor) { - auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); - auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); - auto in_dense = std::dynamic_pointer_cast( - in_tensor.impl()); - auto out_dense = std::dynamic_pointer_cast( - out_tensor.impl()); - std::vector in_tensors = {*in_dense}; - std::vector out_tensors = {*out_dense}; - return self.AllToAll(in_tensors, out_tensors); - }, - py::arg("in"), py::arg("out"), - py::call_guard()) - - .def("reduce", - [](distributed::ProcessGroup &self, py::handle py_in_tensor, - int dst, distributed::ReduceOp op) { - auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); - distributed::ReduceOptions opts; - opts.reduce_op = op; - opts.root_rank = dst; - auto dense = std::dynamic_pointer_cast( - in_tensor.impl()); - std::vector tensors = {*dense}; - return self.Reduce(tensors, tensors, opts); - }, - py::arg("tensor"), py::arg("dst"), - py::arg("op") = distributed::ReduceOp::SUM, - py::call_guard()) - - .def("scatter", - [](distributed::ProcessGroup &self, py::handle py_in_tensor, - py::handle py_out_tensor, int src) { - auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); - auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); - distributed::ScatterOptions opts; - opts.root_rank = src; - auto in_dense = std::dynamic_pointer_cast( - in_tensor.impl()); - auto out_dense = std::dynamic_pointer_cast( - out_tensor.impl()); - std::vector in_tensors = {*in_dense}; - std::vector out_tensors = {*out_dense}; - return self.Scatter(in_tensors, out_tensors, opts); - }, - py::arg("in"), py::arg("out"), py::arg("src"), - py::call_guard()); + .def( + "allreduce", + [](distributed::ProcessGroup &self, py::handle py_tensor, + distributed::ReduceOp op) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + distributed::AllreduceOptions opts; + opts.reduce_op = op; + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; + return self.AllReduce(tensors, tensors, opts); + }, + py::arg("tensor"), py::arg("op") = distributed::ReduceOp::SUM, + py::call_guard()) + + .def( + "broadcast", + [](distributed::ProcessGroup &self, py::handle py_tensor, + int source_rank) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + distributed::BroadcastOptions opts; + opts.source_rank = source_rank; + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; + return self.Broadcast(tensors, tensors, opts); + }, + py::arg("tensor"), py::arg("source_rank"), + py::call_guard()) + + .def( + "barrier", + 
[](distributed::ProcessGroup &self, std::vector place_ids) { + distributed::BarrierOptions opts; + opts.place_ids = place_ids; + return self.Barrier(opts); + }, + py::arg("place_ids") = std::vector{}, + py::call_guard()) + + .def( + "send", + [](distributed::ProcessGroup &self, py::handle py_tensor, + int dst) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; + return self.Send(tensors, dst); + }, + py::arg("tensor"), py::arg("dst"), + py::call_guard()) + + .def( + "recv", + [](distributed::ProcessGroup &self, py::handle py_tensor, + int src) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; + return self.Recv(tensors, src); + }, + py::arg("tensor"), py::arg("src"), + py::call_guard()) + + .def( + "all_gather", + [](distributed::ProcessGroup &self, py::handle py_in_tensor, + py::handle py_out_tensor) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector in_tensors = {*in_dense}; + std::vector out_tensors = {*out_dense}; + return self.AllGather(in_tensors, out_tensors); + }, + py::arg("in"), py::arg("out"), + py::call_guard()) + + .def( + "alltoall", + [](distributed::ProcessGroup &self, py::handle py_in_tensor, + py::handle py_out_tensor) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector in_tensors = {*in_dense}; + std::vector out_tensors = {*out_dense}; + return self.AllToAll(in_tensors, out_tensors); + }, + py::arg("in"), py::arg("out"), + py::call_guard()) + + .def( + "reduce", + [](distributed::ProcessGroup &self, py::handle py_in_tensor, + int dst, distributed::ReduceOp op) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + distributed::ReduceOptions opts; + opts.reduce_op = op; + opts.root_rank = dst; + auto dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector tensors = {*dense}; + return self.Reduce(tensors, tensors, opts); + }, + py::arg("tensor"), py::arg("dst"), + py::arg("op") = distributed::ReduceOp::SUM, + py::call_guard()) + + .def( + "scatter", + [](distributed::ProcessGroup &self, py::handle py_in_tensor, + py::handle py_out_tensor, int src) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + distributed::ScatterOptions opts; + opts.root_rank = src; + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector in_tensors = {*in_dense}; + std::vector out_tensors = {*out_dense}; + return self.Scatter(in_tensors, out_tensors, opts); + }, + py::arg("in"), py::arg("out"), py::arg("src"), + py::call_guard()); #if defined(PADDLE_WITH_NCCL) py::class_def("eager_assign_group_by_size", - [](py::handle py_tensors, std::vector is_sparse_gradient, - std::vector group_size_limits, - std::vector tensor_indices) { - auto tensors = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0); - return distributed::Eager_AssignGroupBySize( - tensors, is_sparse_gradient, group_size_limits, 
tensor_indices); - }, - py::arg("tensors"), py::arg("is_sparse_gradient"), - py::arg("group_size_limits") = std::vector{25 * 1024 * 1024}, - py::arg("tensor_indices") = std::vector{}, - py::call_guard()); + m->def( + "eager_assign_group_by_size", + [](py::handle py_tensors, std::vector is_sparse_gradient, + std::vector group_size_limits, + std::vector tensor_indices) { + auto tensors = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0); + return distributed::Eager_AssignGroupBySize( + tensors, is_sparse_gradient, group_size_limits, tensor_indices); + }, + py::arg("tensors"), py::arg("is_sparse_gradient"), + py::arg("group_size_limits") = std::vector{25 * 1024 * 1024}, + py::arg("tensor_indices") = std::vector{}, + py::call_guard()); py::class_>(*m, "EagerReducer", R"DOC()DOC") .def(py::init(&CreateEagerReducer)) - .def("prepare_for_backward", - [](distributed::EagerReducer &self, py::handle py_tensors) { - auto params = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0); - self.PrepareForBackward(params); - }, - py::arg("tensors"), py::call_guard()); + .def( + "prepare_for_backward", + [](distributed::EagerReducer &self, py::handle py_tensors) { + auto params = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0); + self.PrepareForBackward(params); + }, + py::arg("tensors"), py::call_guard()); } } // end namespace pybind diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index c1b26ee0b79..f9325d1b9ca 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -9,6 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ // disable numpy compile error +#include "paddle/fluid/pybind/eager.h" + #include #include @@ -22,7 +24,6 @@ limitations under the License. */ #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/eager_utils.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/compat/convert_utils.h" @@ -488,45 +489,45 @@ void AutoInitStringTensorByStringTensor( } /** We should have init function with signature: - * 1. - * def __init__ () - * 2. - * def __init__ ( - * ** dtype: paddle::framework::proto::VarType::Type, - * ** dims: vector, - * ** name: std::string, - * ** type: paddle::framework::proto::VarType::LodTensor, - * ** persistable: bool) - * 3. (multi-place) - * (should have at least one parameter, one parameter equals to case 4, zero - * parameter equals to case 1) - * def __init__ ( - * ** value: ndarray, - * ** place: paddle::platform::Place, - * ** persistable: bool, - * ** zero_copy: bool, - * ** name: std::string, - * ** stop_gradient: bool) - * 4. - * def __init__ ( - * ** value: ndarray) - * 5. - * def __init__ ( - * ** tensor: Tensor) - * 6. (multi-place) - * (should have at least one parameter, one parameter equals to case 5, zero - * parameter equals to case 1.) - * def __init__ ( - * ** tensor: Tensor, - * ** place: paddle::platform::Place, - * ** name: std::string) - * 7. (multi-place) (should have at least one parameter, one parameter similar - * to case 5, zero parameter equals to case 1.) - * def __init__ ( - * ** tensor: FrameworkTensor, - * ** place: paddle::platform::Place, - * ** name: std::string) - * **/ + * 1. + * def __init__ () + * 2. 
+ * def __init__ ( + * ** dtype: paddle::framework::proto::VarType::Type, + * ** dims: vector, + * ** name: std::string, + * ** type: paddle::framework::proto::VarType::LodTensor, + * ** persistable: bool) + * 3. (multi-place) + * (should have at least one parameter, one parameter equals to case 4, zero + * parameter equals to case 1) + * def __init__ ( + * ** value: ndarray, + * ** place: paddle::platform::Place, + * ** persistable: bool, + * ** zero_copy: bool, + * ** name: std::string, + * ** stop_gradient: bool) + * 4. + * def __init__ ( + * ** value: ndarray) + * 5. + * def __init__ ( + * ** tensor: Tensor) + * 6. (multi-place) + * (should have at least one parameter, one parameter equals to case 5, zero + * parameter equals to case 1.) + * def __init__ ( + * ** tensor: Tensor, + * ** place: paddle::platform::Place, + * ** name: std::string) + * 7. (multi-place) (should have at least one parameter, one parameter similar + * to case 5, zero parameter equals to case 1.) + * def __init__ ( + * ** tensor: FrameworkTensor, + * ** place: paddle::platform::Place, + * ** name: std::string) + * **/ int TensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY // set a flag to record use kwargs or not @@ -828,37 +829,37 @@ int TensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { } /** We should have init function with signature: - * 1. - * def __init__ () - * - * 2. - * def __init__ ( - * ** dims: vector, - * ** name: std::string) - * - * 3. - * (should have at least one parameter, one parameter equals to case 4, zero - * parameter equals to case 1) - * def __init__ ( - * ** value: ndarray, - * ** zero_copy: bool, - * ** name: std::string) - * - * 4. - * def __init__ ( - * ** value: ndarray) - * - * 5. - * def __init__ ( - * ** tensor: Tensor) - * - * 6. - * (should have at least one parameter, one parameter equals to case 5, zero - * parameter equals to case 1.) - * def __init__ ( - * ** tensor: Tensor, - * ** name: std::string) - * **/ + * 1. + * def __init__ () + * + * 2. + * def __init__ ( + * ** dims: vector, + * ** name: std::string) + * + * 3. + * (should have at least one parameter, one parameter equals to case 4, zero + * parameter equals to case 1) + * def __init__ ( + * ** value: ndarray, + * ** zero_copy: bool, + * ** name: std::string) + * + * 4. + * def __init__ ( + * ** value: ndarray) + * + * 5. + * def __init__ ( + * ** tensor: Tensor) + * + * 6. + * (should have at least one parameter, one parameter equals to case 5, zero + * parameter equals to case 1.) + * def __init__ ( + * ** tensor: Tensor, + * ** name: std::string) + * **/ int StringTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { // set a flag to record use kwargs or not bool flag_kwargs = false; @@ -916,8 +917,9 @@ int StringTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { // case 1 VLOG(6) << "Calling case1's string initializer."; EmptyStringTensorInitializer( - py_tensor_ptr, egr::Controller::Instance().GenerateUniqueName( - "generated_string_tensor"), + py_tensor_ptr, + egr::Controller::Instance().GenerateUniqueName( + "generated_string_tensor"), egr::Controller::Instance().GetExpectedPlace()); return 0; } else { diff --git a/paddle/fluid/pybind/eager.h b/paddle/fluid/pybind/eager.h index a3eac7ab470..db2b438c3bd 100644 --- a/paddle/fluid/pybind/eager.h +++ b/paddle/fluid/pybind/eager.h @@ -11,11 +11,11 @@ limitations under the License. 
*/ #pragma once #include -#include "pybind11/pybind11.h" -#include "pybind11/stl.h" #include "paddle/fluid/eager/pylayer/py_layer_node.h" #include "paddle/phi/core/dense_tensor.h" +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" namespace paddle { namespace pybind { diff --git a/paddle/fluid/pybind/eager_custom_python_api.h b/paddle/fluid/pybind/eager_custom_python_api.h index a3e996dbcbf..df4920a5e69 100644 --- a/paddle/fluid/pybind/eager_custom_python_api.h +++ b/paddle/fluid/pybind/eager_custom_python_api.h @@ -14,6 +14,7 @@ #pragma once #include + #include "paddle/phi/core/enforce.h" static PyObject *eager_api_run_program(PyObject *self, PyObject *args, diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 628e808ef99..c75ac0b52c5 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -20,9 +20,6 @@ typedef SSIZE_T ssize_t; #include #include -#include "pybind11/numpy.h" -#include "pybind11/pybind11.h" - #include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/autograd_meta.h" @@ -51,6 +48,8 @@ typedef SSIZE_T ssize_t; #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" +#include "pybind11/numpy.h" +#include "pybind11/pybind11.h" namespace paddle { namespace pybind { diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 7831530bff0..ab6b8edd52e 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -21,9 +21,6 @@ typedef SSIZE_T ssize_t; #include #include -#include "pybind11/numpy.h" -#include "pybind11/pybind11.h" - #include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" @@ -47,6 +44,8 @@ typedef SSIZE_T ssize_t; #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" #include "pybind11/detail/internals.h" +#include "pybind11/numpy.h" +#include "pybind11/pybind11.h" #pragma GCC diagnostic ignored "-Wmissing-field-initializers" #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/framework/python_headers.h" @@ -1007,10 +1006,11 @@ static PyObject* tensor_method__setitem_eager_tensor(TensorObject* self, PADDLE_ENFORCE_EQ( egr::egr_utils_api::IsLeafTensor(self->tensor) && !egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient(), - false, platform::errors::InvalidArgument( - "Leaf Tensor (%s) that doesn't stop gradient can't use " - "inplace strategy.", - self->tensor.name())); + false, + platform::errors::InvalidArgument( + "Leaf Tensor (%s) that doesn't stop gradient can't use " + "inplace strategy.", + self->tensor.name())); } paddle::experimental::Tensor value_tensor; @@ -1232,9 +1232,10 @@ static PyObject* tensor_register_reduce_hook(TensorObject* self, PyObject* args, "Only can register backward hook for leaf Tensor.")); PADDLE_ENFORCE_EQ( !egr::EagerUtils::unsafe_autograd_meta(self->tensor)->StopGradient(), - true, platform::errors::InvalidArgument( - "Cannot register backward hook on a Tensor that stop " - "gradient.")); + true, + platform::errors::InvalidArgument( + "Cannot register backward hook on a Tensor that stop " + "gradient.")); PADDLE_ENFORCE( grad_node.get() != nullptr, paddle::platform::errors::Fatal("Detected 
NULL grad_node," @@ -1667,8 +1668,8 @@ PyMethodDef variable_methods[] = { (PyCFunction)(void (*)(void))tensor_method__is_initialized, METH_VARARGS | METH_KEYWORDS, NULL}, {"_is_dense_tensor_hold_allocation", - (PyCFunction)( - void (*)(void))tensor_method__is_dense_tensor_hold_allocation, + (PyCFunction)(void (*)( + void))tensor_method__is_dense_tensor_hold_allocation, METH_VARARGS | METH_KEYWORDS, NULL}, {"_copy_to", (PyCFunction)(void (*)(void))tensor_method__copy_to, METH_VARARGS | METH_KEYWORDS, NULL}, @@ -1793,8 +1794,8 @@ PyMethodDef string_tensor_variable_methods[] = { (PyCFunction)(void (*)(void))tensor_method__is_initialized, METH_VARARGS | METH_KEYWORDS, NULL}, {"_is_string_tensor_hold_allocation", - (PyCFunction)( - void (*)(void))tensor_method__is_string_tensor_hold_allocation, + (PyCFunction)(void (*)( + void))tensor_method__is_string_tensor_hold_allocation, METH_VARARGS | METH_KEYWORDS, NULL}, // TODO(zhoushunjie): Need to add _copy_to, copy_ for StringTensor. {NULL, NULL, 0, NULL}}; diff --git a/paddle/fluid/pybind/eager_op_function_generator.cc b/paddle/fluid/pybind/eager_op_function_generator.cc index b546aa2d76b..f58f3ce9453 100644 --- a/paddle/fluid/pybind/eager_op_function_generator.cc +++ b/paddle/fluid/pybind/eager_op_function_generator.cc @@ -486,7 +486,8 @@ int main(int argc, char* argv[]) { "\"paddle/fluid/pybind/op_function_common.h\"", "\"paddle/fluid/eager/api/generated/fluid_generated/" "dygraph_forward_api.h\"", - "\"paddle/fluid/pybind/exception.h\"", ""}; + "\"paddle/fluid/pybind/exception.h\"", + ""}; std::ofstream out(argv[1], std::ios::out); diff --git a/paddle/fluid/pybind/eager_py_layer.cc b/paddle/fluid/pybind/eager_py_layer.cc index 47a5309d691..a0cef6388c1 100644 --- a/paddle/fluid/pybind/eager_py_layer.cc +++ b/paddle/fluid/pybind/eager_py_layer.cc @@ -16,8 +16,6 @@ limitations under the License. */ #include #pragma GCC diagnostic ignored "-Wattributes" -#include "pybind11/pytypes.h" - #include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/autograd_meta.h" @@ -34,6 +32,7 @@ limitations under the License. 
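The PyMethodDef edits above only re-break the long double casts, but the casts themselves follow the CPython-documented pattern for storing a keyword-accepting method: its three-argument signature is routed through the generic "void (*)(void)" function type before narrowing to the two-argument PyCFunction the table expects. A minimal module showing the same table entry:

// The double cast silences the function-pointer type mismatch between
// PyCFunctionWithKeywords (3 args) and the PyCFunction slot (2 args).
#include <Python.h>

static PyObject* demo_method(PyObject* self, PyObject* args,
                             PyObject* kwargs) {
  Py_RETURN_NONE;
}

static PyMethodDef demo_methods[] = {
    {"demo", (PyCFunction)(void (*)(void))demo_method,
     METH_VARARGS | METH_KEYWORDS, nullptr},
    {nullptr, nullptr, 0, nullptr}};

static struct PyModuleDef demo_module = {PyModuleDef_HEAD_INIT, "demo",
                                         nullptr, -1, demo_methods};

PyMODINIT_FUNC PyInit_demo(void) { return PyModule_Create(&demo_module); }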
*/ #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" #include "pybind11/detail/internals.h" +#include "pybind11/pytypes.h" #pragma GCC diagnostic ignored "-Wwrite-strings" #pragma GCC diagnostic ignored "-Wmissing-field-initializers" @@ -323,10 +322,11 @@ PyObject* pylayer_method_apply(PyObject* cls, PyObject* args, egr::EagerUtils::autograd_meta(dirty_tensor); PADDLE_ENFORCE_EQ(!dirty_tensor_autograd_meta->StopGradient() && egr::egr_utils_api::IsLeafTensor(*dirty_tensor), - false, paddle::platform::errors::InvalidArgument( - "Leaf Var (%s) that doesn't stop gradient " - "can't use inplace strategy.", - dirty_tensor->name())); + false, + paddle::platform::errors::InvalidArgument( + "Leaf Var (%s) that doesn't stop gradient " + "can't use inplace strategy.", + dirty_tensor->name())); dirty_tensor->bump_inplace_version(); VLOG(3) << "Tensor(" << dirty_tensor->name() << ") uses Inplace Strategy."; @@ -466,16 +466,19 @@ PyMethodDef pylayer_methods[] = { METH_O, NULL}, {NULL, NULL, 0, NULL}}; -struct PyGetSetDef pylayer_properties[]{ - {"container", (getter)tensor_properties_get_container, - (setter)tensor_properties_set_container, nullptr, nullptr}, - {"non_differentiable", (getter)tensor_properties_get_non_differentiable, - (setter)tensor_properties_set_non_differentiable, nullptr, nullptr}, - {"dirty_tensors", (getter)tensor_properties_get_dirty_tensors, - (setter)tensor_properties_set_dirty_tensors, nullptr, nullptr}, - {"materialize_grads", nullptr, - (setter)tensor_properties_set_materialize_grads, nullptr, nullptr}, - {nullptr, nullptr, nullptr, nullptr, nullptr}}; +struct PyGetSetDef pylayer_properties[] { + {"container", (getter)tensor_properties_get_container, + (setter)tensor_properties_set_container, nullptr, nullptr}, + {"non_differentiable", (getter)tensor_properties_get_non_differentiable, + (setter)tensor_properties_set_non_differentiable, nullptr, nullptr}, + {"dirty_tensors", (getter)tensor_properties_get_dirty_tensors, + (setter)tensor_properties_set_dirty_tensors, nullptr, nullptr}, + {"materialize_grads", nullptr, + (setter)tensor_properties_set_materialize_grads, nullptr, nullptr}, + { + nullptr, nullptr, nullptr, nullptr, nullptr + } +}; void BindEagerPyLayer(PyObject* module) { auto heap_type = reinterpret_cast( diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index efa0fe2cb58..9bcac35037d 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -14,6 +14,9 @@ limitations under the License. */ #include #include +// clang-format will try to move eager_utils.h in front of other headers +// according to google c++ style, and that cause compiling problems. +// clang-format off #include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/framework/convert_utils.h" @@ -31,6 +34,7 @@ limitations under the License. 
*/ #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" +// clang-format on namespace paddle { namespace pybind { diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index 7f94f6c90e5..beab99877bd 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -16,12 +16,12 @@ typedef SSIZE_T ssize_t; #endif #include + #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" - #include "pybind11/pybind11.h" #include "pybind11/stl.h" namespace paddle { @@ -112,8 +112,9 @@ struct TupleTensorResult { PyObject* args, ssize_t arg_idx) { TupleTensorResult::Run(out, result, value_idx, args, arg_idx); if (N - 1 == value_idx) { - PyTuple_SET_ITEM(result, N - 1, ToPyObject(std::get(out), - value_idx, args, arg_idx)); + PyTuple_SET_ITEM( + result, N - 1, + ToPyObject(std::get(out), value_idx, args, arg_idx)); } else { PyTuple_SET_ITEM(result, N - 1, ToPyObject(std::get(out))); } diff --git a/paddle/fluid/pybind/exception.cc b/paddle/fluid/pybind/exception.cc index 4f25a6f1a5c..934a9ef97fb 100644 --- a/paddle/fluid/pybind/exception.cc +++ b/paddle/fluid/pybind/exception.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/pybind/exception.h" + #include "paddle/phi/api/ext/exception.h" namespace paddle { namespace pybind { diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc index 4ffb513671c..25f2c910028 100644 --- a/paddle/fluid/pybind/fleet_py.cc +++ b/paddle/fluid/pybind/fleet_py.cc @@ -18,8 +18,6 @@ limitations under the License. */ #undef _XOPEN_SOURCE #endif -#include "paddle/fluid/pybind/fleet_py.h" - #include #include #include @@ -35,17 +33,18 @@ limitations under the License. 
*/ #include "paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h" #include "paddle/fluid/distributed/ps/wrapper/fleet.h" #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h" +#include "paddle/fluid/pybind/fleet_py.h" namespace py = pybind11; using paddle::distributed::CommContext; using paddle::distributed::Communicator; +using paddle::distributed::FeatureNode; using paddle::distributed::FleetWrapper; -using paddle::distributed::HeterClient; -using paddle::distributed::GraphPyService; using paddle::distributed::GraphNode; -using paddle::distributed::GraphPyServer; using paddle::distributed::GraphPyClient; -using paddle::distributed::FeatureNode; +using paddle::distributed::GraphPyServer; +using paddle::distributed::GraphPyService; +using paddle::distributed::HeterClient; namespace paddle { namespace pybind { @@ -246,13 +245,13 @@ void BindGraphPyClient(py::module* m) { .def("bind_local_server", &GraphPyClient::bind_local_server); } -using paddle::distributed::TreeIndex; -using paddle::distributed::IndexWrapper; using paddle::distributed::IndexNode; +using paddle::distributed::IndexWrapper; +using paddle::distributed::TreeIndex; #ifdef PADDLE_WITH_HETERPS using paddle::framework::GraphGpuWrapper; -using paddle::framework::NeighborSampleResult; using paddle::framework::NeighborSampleQuery; +using paddle::framework::NeighborSampleResult; using paddle::framework::NodeQueryResult; #endif diff --git a/paddle/fluid/pybind/fleet_wrapper_py.cc b/paddle/fluid/pybind/fleet_wrapper_py.cc index af1c3da727d..0e1d4cd76ad 100644 --- a/paddle/fluid/pybind/fleet_wrapper_py.cc +++ b/paddle/fluid/pybind/fleet_wrapper_py.cc @@ -46,10 +46,10 @@ void BindFleetWrapper(py::module* m) { .def("push_dense", &framework::FleetWrapper::PushDenseVarsSync) .def("pull_dense", &framework::FleetWrapper::PullDenseVarsSync) .def("init_server", &framework::FleetWrapper::InitServer) - .def("run_server", (uint64_t (framework::FleetWrapper::*)(void)) & + .def("run_server", (uint64_t(framework::FleetWrapper::*)(void)) & framework::FleetWrapper::RunServer) - .def("run_server", (uint64_t (framework::FleetWrapper::*)( // NOLINT - const std::string&, uint32_t)) & // NOLINT + .def("run_server", (uint64_t(framework::FleetWrapper::*)( // NOLINT + const std::string&, uint32_t)) & // NOLINT framework::FleetWrapper::RunServer) .def("init_worker", &framework::FleetWrapper::InitWorker) .def("init_model", &framework::FleetWrapper::PushDenseParamSync) diff --git a/paddle/fluid/pybind/generator_py.cc b/paddle/fluid/pybind/generator_py.cc index 6bb85da8c46..e456526f844 100644 --- a/paddle/fluid/pybind/generator_py.cc +++ b/paddle/fluid/pybind/generator_py.cc @@ -8,9 +8,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/phi/core/generator.h" #include +#include "paddle/phi/core/generator.h" + #ifdef _POSIX_C_SOURCE #undef _POSIX_C_SOURCE #endif diff --git a/paddle/fluid/pybind/gloo_context_py.cc b/paddle/fluid/pybind/gloo_context_py.cc index 2314ceac76e..b4ee1bcd02b 100644 --- a/paddle/fluid/pybind/gloo_context_py.cc +++ b/paddle/fluid/pybind/gloo_context_py.cc @@ -43,13 +43,14 @@ void BindGlooContext(py::module *m) { py::class_ gloo_parallel_strategy( *m, "GlooParallelStrategy", ""); gloo_parallel_strategy.def(py::init()) - .def_property("rank_num", - [](const platform::GlooParallelStrategy &self) { - return self.rank_num; - }, - [](platform::GlooParallelStrategy &self, int nranks) { - self.rank_num = nranks; - }) + .def_property( + "rank_num", + [](const platform::GlooParallelStrategy &self) { + return self.rank_num; + }, + [](platform::GlooParallelStrategy &self, int nranks) { + self.rank_num = nranks; + }) .def_property( "rank", [](const platform::GlooParallelStrategy &self) { return self.rank; }, @@ -62,20 +63,22 @@ void BindGlooContext(py::module *m) { [](platform::GlooParallelStrategy &self, const std::string &iface) { self.iface = iface; }) - .def_property("init_seconds", - [](const platform::GlooParallelStrategy &self) { - return self.init_seconds; - }, - [](platform::GlooParallelStrategy &self, int init_seconds) { - self.init_seconds = init_seconds; - }) - .def_property("run_seconds", - [](const platform::GlooParallelStrategy &self) { - return self.run_seconds; - }, - [](platform::GlooParallelStrategy &self, int run_seconds) { - self.run_seconds = run_seconds; - }) + .def_property( + "init_seconds", + [](const platform::GlooParallelStrategy &self) { + return self.init_seconds; + }, + [](platform::GlooParallelStrategy &self, int init_seconds) { + self.init_seconds = init_seconds; + }) + .def_property( + "run_seconds", + [](const platform::GlooParallelStrategy &self) { + return self.run_seconds; + }, + [](platform::GlooParallelStrategy &self, int run_seconds) { + self.run_seconds = run_seconds; + }) .def_property( "ip_address", [](const platform::GlooParallelStrategy &self) { @@ -83,13 +86,14 @@ void BindGlooContext(py::module *m) { }, [](platform::GlooParallelStrategy &self, const std::string &ip_address) { self.ip_address = ip_address; }) - .def_property("ip_port", - [](const platform::GlooParallelStrategy &self) { - return self.ip_port; - }, - [](platform::GlooParallelStrategy &self, int ip_port) { - self.ip_port = ip_port; - }); + .def_property( + "ip_port", + [](const platform::GlooParallelStrategy &self) { + return self.ip_port; + }, + [](platform::GlooParallelStrategy &self, int ip_port) { + self.ip_port = ip_port; + }); py::class_ gloo_ctx(*m, "GlooParallelContext"); gloo_ctx.def(py::init()) diff --git a/paddle/fluid/pybind/gloo_context_py.h b/paddle/fluid/pybind/gloo_context_py.h index 89bd183097b..51f736ed060 100644 --- a/paddle/fluid/pybind/gloo_context_py.h +++ b/paddle/fluid/pybind/gloo_context_py.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "pybind11/pybind11.h" #include "pybind11/stl.h" diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 954bac00ddb..3de6c64617d 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -159,10 +159,9 @@ static const platform::Place PyObjectToPlace(const py::object &place_obj) { // only initialize varbase, but not its tensor. 
static void InitVarBaseOnly(imperative::VarBase *self, const std::string &name, bool persistable = false, int stop_gradient = -1) { - auto name_ = name == "" - ? imperative::GetCurrentTracer()->GenerateUniqueName( - "generated_tensor") - : name; + auto name_ = name == "" ? imperative::GetCurrentTracer()->GenerateUniqueName( + "generated_tensor") + : name; VLOG(5) << "Init Tensor as: / name: " << name_ << " / persistable: " << persistable @@ -274,10 +273,9 @@ static void InitVarBaseFromTensorWithArgDefault(imperative::VarBase *self, const std::string &name) { VLOG(4) << "Init VarBase"; auto place = imperative::GetCurrentTracer()->ExpectedPlace(); - auto name_ = name == "" - ? imperative::GetCurrentTracer()->GenerateUniqueName( - "generated_tensor") - : name; + auto name_ = name == "" ? imperative::GetCurrentTracer()->GenerateUniqueName( + "generated_tensor") + : name; new (self) imperative::VarBase(name_); self->SetPersistable(false); self->SetType(framework::proto::VarType::LOD_TENSOR); @@ -299,10 +297,9 @@ static void InitVarBaseFromTensorWithArg(imperative::VarBase *self, const P &place, const std::string &name) { VLOG(4) << "Init VarBase"; - auto name_ = name == "" - ? imperative::GetCurrentTracer()->GenerateUniqueName( - "generated_tensor") - : name; + auto name_ = name == "" ? imperative::GetCurrentTracer()->GenerateUniqueName( + "generated_tensor") + : name; new (self) imperative::VarBase(name_); self->SetPersistable(false); self->SetType(framework::proto::VarType::LOD_TENSOR); @@ -556,38 +553,39 @@ void BindImperative(py::module *m_ptr) { }, py::return_value_policy::take_ownership); - m.def("_array_to_share_memory_tensor", - [](py::object &obj) { - // 1. cast to python array - auto array = obj.cast(); - PADDLE_ENFORCE_NE( - string::Sprintf("%s", array.dtype()).compare("object"), 0, - platform::errors::InvalidArgument( - "Faild to convert input data to a regular ndarray.\n * " - "Usually this means the input data contains nested " - "lists with different lengths.\n * Check the reader " - "function passed to 'set_(sample/sample_list/batch)" - "_generator' to locate the data causes this issue.")); - // 2. construcct LoDTensor - framework::LoDTensor t; - SetTensorFromPyArray(&t, array, - platform::CPUPlace(), true); - // 3. allocate shared memory - void *data_ptr = t.data(); - size_t data_size = t.numel() * framework::DataTypeSize(t.dtype()); - auto shared_writer_holder = - memory::allocation::AllocateMemoryMapWriterAllocation(data_size); - // 4. maintain mmap fd set & backup ipc_name - const std::string &ipc_name = shared_writer_holder->ipc_name(); - memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name); - // 5. copy data & reset holder - memory::Copy(platform::CPUPlace(), shared_writer_holder->ptr(), - platform::CPUPlace(), data_ptr, data_size); - t.ResetHolder(shared_writer_holder); - - return t; - }, - py::return_value_policy::take_ownership); + m.def( + "_array_to_share_memory_tensor", + [](py::object &obj) { + // 1. cast to python array + auto array = obj.cast(); + PADDLE_ENFORCE_NE( + string::Sprintf("%s", array.dtype()).compare("object"), 0, + platform::errors::InvalidArgument( + "Faild to convert input data to a regular ndarray.\n * " + "Usually this means the input data contains nested " + "lists with different lengths.\n * Check the reader " + "function passed to 'set_(sample/sample_list/batch)" + "_generator' to locate the data causes this issue.")); + // 2. 
construct LoDTensor + framework::LoDTensor t; + SetTensorFromPyArray(&t, array, + platform::CPUPlace(), true); + // 3. allocate shared memory + void *data_ptr = t.data(); + size_t data_size = t.numel() * framework::DataTypeSize(t.dtype()); + auto shared_writer_holder = + memory::allocation::AllocateMemoryMapWriterAllocation(data_size); + // 4. maintain mmap fd set & backup ipc_name + const std::string &ipc_name = shared_writer_holder->ipc_name(); + memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name); + // 5. copy data & reset holder + memory::Copy(platform::CPUPlace(), shared_writer_holder->ptr(), + platform::CPUPlace(), data_ptr, data_size); + t.ResetHolder(shared_writer_holder); + + return t; + }, + py::return_value_policy::take_ownership); m.def("_remove_tensor_list_mmap_fds", [](py::list &tensor_list) { for (size_t i = 0; i < tensor_list.size(); ++i) { @@ -1089,31 +1087,32 @@ void BindImperative(py::module *m_ptr) { self.Name())); return var->CurrentInplaceVersion(); }) - .def("_bump_inplace_version", - [](std::shared_ptr &self) { - // NOTE(liym27): _bump_inplace_version is only used for inplace - // operation - self->BumpInplaceVersion(); - }, - R"DOC( + .def( + "_bump_inplace_version", + [](std::shared_ptr &self) { + // NOTE(liym27): _bump_inplace_version is only used for inplace + // operation + self->BumpInplaceVersion(); + }, + R"DOC( **Notes**: **This API is ONLY available in Dygraph mode.** **This is a very low level API. Users should not use it directly. ** Bump the version whenever the Tensor is modified through an inplace operation. )DOC") .def("numpy", + .def( + "numpy", - [](imperative::VarBase &self) -> py::array { - const auto &tensor = - self.MutableVar()->Get(); - PADDLE_ENFORCE_EQ( - tensor.IsInitialized(), true, - platform::errors::InvalidArgument( - "Tensor of %s is Empty, please check if it has no data.", - self.Name())); - return TensorToPyArray(tensor, true); - }, - R"DOC( + [](imperative::VarBase &self) -> py::array { + const auto &tensor = self.MutableVar()->Get(); + PADDLE_ENFORCE_EQ( + tensor.IsInitialized(), true, + platform::errors::InvalidArgument( + "Tensor of %s is Empty, please check if it has no data.", + self.Name())); + return TensorToPyArray(tensor, true); + }, + R"DOC( Returns a numpy array that shows the value of the current Tensor.
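The five numbered steps in _array_to_share_memory_tensor follow the usual POSIX shared-memory recipe. A rough standalone sketch of steps 3 to 5 (names are hypothetical and error handling is elided; Paddle's AllocateMemoryMapWriterAllocation wraps comparable calls plus its own fd bookkeeping):

#include <cstring>

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

// Copy `size` bytes from `src` into a named segment that another process can
// map via `ipc_name` (which must begin with '/'); returns the local mapping.
void *ShareBuffer(const char *ipc_name, const void *src, size_t size) {
  int fd = shm_open(ipc_name, O_CREAT | O_RDWR, 0600);   // step 3: allocate
  ftruncate(fd, static_cast<off_t>(size));
  void *dst = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  close(fd);                    // step 4 analogue: track/release the fd
  std::memcpy(dst, src, size);  // step 5: copy data into the shared holder
  return dst;
}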
Returns: @@ -1133,68 +1132,69 @@ void BindImperative(py::module *m_ptr) { x = linear(data) print(x.numpy()) )DOC") - .def("detach", - [](const imperative::VarBase - &self) -> std::shared_ptr { - PADDLE_ENFORCE_EQ( - self.Var().IsInitialized(), true, - platform::errors::InvalidArgument( - "Tensor %s has not been initialized!", self.Name())); + .def( + "detach", + [](const imperative::VarBase &self) + -> std::shared_ptr { + PADDLE_ENFORCE_EQ( + self.Var().IsInitialized(), true, + platform::errors::InvalidArgument( + "Tensor %s has not been initialized!", self.Name())); - PADDLE_ENFORCE_EQ( - self.Var().IsType() || - self.Var().IsType(), - true, - platform::errors::InvalidArgument( - "Type of Tensor[%s] must be LoDTensor or SelectedRows!", - self.Name())); + PADDLE_ENFORCE_EQ( + self.Var().IsType() || + self.Var().IsType(), + true, + platform::errors::InvalidArgument( + "Type of Tensor[%s] must be LoDTensor or SelectedRows!", + self.Name())); - auto detach_var = std::make_shared( - true, "detach_" + self.Name()); + auto detach_var = std::make_shared( + true, "detach_" + self.Name()); - detach_var->SetPersistable(self.Persistable()); - detach_var->SetType(self.Type()); - detach_var->SetDataType(self.DataType()); + detach_var->SetPersistable(self.Persistable()); + detach_var->SetType(self.Type()); + detach_var->SetDataType(self.DataType()); - if (self.Var().IsType()) { - const auto &origin_tensor = - self.Var().Get(); - PADDLE_ENFORCE_EQ( - origin_tensor.IsInitialized(), true, - platform::errors::InvalidArgument( - "Tensor %s has not been initialized!", self.Name())); - - auto *detach_tensor = - detach_var->MutableVar()->GetMutable(); - detach_tensor->ShareDataWith(origin_tensor); - // NOTE(liym27): Call ShareInplaceVersionCounterWith to share the - // same TensorInplaceVersion, which is used to check whether - // inplace - // operations are correct. - detach_tensor->ShareInplaceVersionCounterWith(origin_tensor); - } else { - const auto &origin_selected_rows = - self.Var().Get(); - PADDLE_ENFORCE_EQ( - origin_selected_rows.value().IsInitialized(), true, - platform::errors::InvalidArgument( - "Tensor %s has not been initialized!", self.Name())); - - auto *detach_selected_rows = - detach_var->MutableVar()->GetMutable(); - detach_selected_rows->set_height(origin_selected_rows.height()); - detach_selected_rows->set_rows(origin_selected_rows.rows()); - detach_selected_rows->mutable_value()->ShareDataWith( - origin_selected_rows.value()); - detach_selected_rows->mutable_value() - ->ShareInplaceVersionCounterWith( - origin_selected_rows.value()); - } - VLOG(3) << "The detached Tensor(" << detach_var->Name() - << ") share data with " << self.Name(); - return detach_var; - }, - py::return_value_policy::take_ownership, R"DOC( + if (self.Var().IsType()) { + const auto &origin_tensor = + self.Var().Get(); + PADDLE_ENFORCE_EQ( + origin_tensor.IsInitialized(), true, + platform::errors::InvalidArgument( + "Tensor %s has not been initialized!", self.Name())); + + auto *detach_tensor = + detach_var->MutableVar()->GetMutable(); + detach_tensor->ShareDataWith(origin_tensor); + // NOTE(liym27): Call ShareInplaceVersionCounterWith to share the + // same TensorInplaceVersion, which is used to check whether + // inplace + // operations are correct. 
+ detach_tensor->ShareInplaceVersionCounterWith(origin_tensor); + } else { + const auto &origin_selected_rows = + self.Var().Get(); + PADDLE_ENFORCE_EQ( + origin_selected_rows.value().IsInitialized(), true, + platform::errors::InvalidArgument( + "Tensor %s has not been initialized!", self.Name())); + + auto *detach_selected_rows = + detach_var->MutableVar()->GetMutable(); + detach_selected_rows->set_height(origin_selected_rows.height()); + detach_selected_rows->set_rows(origin_selected_rows.rows()); + detach_selected_rows->mutable_value()->ShareDataWith( + origin_selected_rows.value()); + detach_selected_rows->mutable_value() + ->ShareInplaceVersionCounterWith( + origin_selected_rows.value()); + } + VLOG(3) << "The detached Tensor(" << detach_var->Name() + << ") shares data with " << self.Name(); + return detach_var; + }, + py::return_value_policy::take_ownership, R"DOC( Returns a new Tensor, detached from the current graph. It will share data with the origin Tensor and will never make a Tensor copy. @@ -1256,23 +1256,23 @@ void BindImperative(py::module *m_ptr) { .def("_gradient_set_empty", &imperative::VarBase::_GradientSetEmpty, py::arg("set_is_empty") = true) .def("_is_gradient_set_empty", &imperative::VarBase::_IsGradientSetEmpty) - .def("clone", - [](std::shared_ptr &self) { - const auto &tensor = self->Var().Get(); - PADDLE_ENFORCE_EQ( - tensor.IsInitialized(), true, - platform::errors::InvalidArgument( - "%s has not been initialized", self->Name())); - auto tracer = imperative::GetCurrentTracer(); - auto new_var = std::make_shared( - true, tracer->GenerateUniqueName(self->Name() + "_clone")); - framework::AttributeMap attrs; - imperative::NameVarBaseMap ins = {{"X", {self}}}; - imperative::NameVarBaseMap outs = {{"Out", {new_var}}}; - tracer->TraceOp("assign", ins, outs, attrs); - return new_var; - }, - py::return_value_policy::copy, R"DOC( + .def( + "clone", + [](std::shared_ptr &self) { + const auto &tensor = self->Var().Get(); + PADDLE_ENFORCE_EQ(tensor.IsInitialized(), true, + platform::errors::InvalidArgument( + "%s has not been initialized", self->Name())); + auto tracer = imperative::GetCurrentTracer(); + auto new_var = std::make_shared( + true, tracer->GenerateUniqueName(self->Name() + "_clone")); + framework::AttributeMap attrs; + imperative::NameVarBaseMap ins = {{"X", {self}}}; + imperative::NameVarBaseMap outs = {{"Out", {new_var}}}; + tracer->TraceOp("assign", ins, outs, attrs); + return new_var; + }, + py::return_value_policy::copy, R"DOC( Returns a new Tensor, which is a clone of the origin Tensor, and it remains in the current graph. It will always have a Tensor copy. @@ -1305,11 +1305,12 @@ void BindImperative(py::module *m_ptr) { print(x.grad) # None )DOC") .def("_grad_name", &imperative::VarBase::GradVarName) - .def("_grad_value", - [](imperative::VarBase &self) { - return self.MutableGradVar()->Get(); - }, - py::return_value_policy::reference) + .def( + "_grad_value", + [](imperative::VarBase &self) { + return self.MutableGradVar()->Get(); + }, + py::return_value_policy::reference) .def("_set_grad_type", [](imperative::VarBase &self, framework::proto::VarType::Type type) { self.MutableGradVarBase()->SetType(type); @@ -1337,26 +1338,27 @@ void BindImperative(py::module *m_ptr) { } } }) - .def("_grad_ivar", - [](const imperative::VarBase &self) { - auto &grad_var = self.GradVarBase(); - - if (grad_var && grad_var->Var().IsInitialized()) { - auto *tensor = - grad_var->MutableVar()->IsType() - ?
grad_var->MutableVar() - ->GetMutable() - : grad_var->MutableVar() - ->GetMutable() - ->mutable_value(); - - if (tensor->IsInitialized()) { - return grad_var; - } - } - return std::shared_ptr(nullptr); - }, - py::return_value_policy::copy) + .def( + "_grad_ivar", + [](const imperative::VarBase &self) { + auto &grad_var = self.GradVarBase(); + + if (grad_var && grad_var->Var().IsInitialized()) { + auto *tensor = + grad_var->MutableVar()->IsType() + ? grad_var->MutableVar() + ->GetMutable() + : grad_var->MutableVar() + ->GetMutable() + ->mutable_value(); + + if (tensor->IsInitialized()) { + return grad_var; + } + } + return std::shared_ptr(nullptr); + }, + py::return_value_policy::copy) .def("_set_grad_ivar", [](imperative::VarBase &self, imperative::VarBase &grad) { self.SetGradVarBase(grad); @@ -1365,13 +1367,14 @@ void BindImperative(py::module *m_ptr) { [](imperative::VarBase &self) { return self.Var().IsType(); }) - .def("_allreduce", - [](imperative::VarBase &self, - const imperative::ParallelStrategy &strategy) { - if (strategy.nranks_ > 1) { + .def( + "_allreduce", + [](imperative::VarBase &self, + const imperative::ParallelStrategy &strategy) { + if (strategy.nranks_ > 1) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #if NCCL_VERSION_CODE >= 2212 - imperative::AllReduce(self.Var(), self.MutableVar(), strategy); + imperative::AllReduce(self.Var(), self.MutableVar(), strategy); #else if (!self.Var().IsType()) { imperative::AllReduce(self.Var(), self.MutableVar(), strategy); @@ -1388,9 +1391,9 @@ void BindImperative(py::module *m_ptr) { "Imperative allreduce is not supported when paddle is " "not compiled with NCCL.")); #endif // PADDLE_WITH_NCCL or PADDLE_WITH_RCCL - } - }, - py::call_guard()) + } + }, + py::call_guard()) .def("_register_grad_hook", [](imperative::VarBase &self, const py::handle &hook) { PADDLE_ENFORCE_EQ( @@ -1425,22 +1428,23 @@ void BindImperative(py::module *m_ptr) { std::make_shared>(py_func)); } }) - .def("_register_backward_hook", - [](imperative::VarBase &self, const py::handle &hook) { - PADDLE_ENFORCE_EQ( - self.IsLeaf(), true, - platform::errors::InvalidArgument( - "Only can register backward hook for leaf Tensor.")); - PADDLE_ENFORCE_EQ( - !self.OverridedStopGradient() && self.HasGradVar(), true, - platform::errors::InvalidArgument( - "Cannot register backward hook on a Tensor that stop " - "gradient or without gradient.")); - auto py_func = PyObjectCast>(hook.ptr()); - self.GradVarBase()->AddVoidHook( - std::make_shared>(py_func)); - }, - R"DOC( + .def( + "_register_backward_hook", + [](imperative::VarBase &self, const py::handle &hook) { + PADDLE_ENFORCE_EQ( + self.IsLeaf(), true, + platform::errors::InvalidArgument( + "Only can register backward hook for leaf Tensor.")); + PADDLE_ENFORCE_EQ( + !self.OverridedStopGradient() && self.HasGradVar(), true, + platform::errors::InvalidArgument( + "Cannot register backward hook on a Tensor that stop " + "gradient or without gradient.")); + auto py_func = PyObjectCast>(hook.ptr()); + self.GradVarBase()->AddVoidHook( + std::make_shared>(py_func)); + }, + R"DOC( Registers a backward hook for current Tensor. This hook will be called every time the gradient of current Tensor has been fully calculated. 
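The NOTE(liym27) comment inside the detach binding above is worth unpacking: the detached variable must alias the source's inplace version counter, not copy it, so that an in-place write through either handle is seen by the correctness check on the other. A conceptual sketch with hypothetical types:

#include <memory>

struct VersionCounter {
  int version = 0;
};

struct TensorLike {
  std::shared_ptr<VersionCounter> counter = std::make_shared<VersionCounter>();
  void BumpInplaceVersion() { ++counter->version; }
  void ShareInplaceVersionCounterWith(const TensorLike &other) {
    counter = other.counter;  // alias the counter object, never copy it
  }
};

int main() {
  TensorLike src, detached;
  detached.ShareInplaceVersionCounterWith(src);
  src.BumpInplaceVersion();          // an in-place write through the source...
  return detached.counter->version;  // ...is visible via the detached view: 1
}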
@@ -1461,17 +1465,18 @@ void BindImperative(py::module *m_ptr) { Returns: None )DOC") - .def("cpu", - [](const std::shared_ptr &self) { - if (platform::is_cpu_place(self->Place())) { - return self; - } else { - auto new_var = self->NewVarBase(platform::CPUPlace(), true); - new_var->SetOverridedStopGradient(self->OverridedStopGradient()); - return new_var; - } - }, - R"DOC( + .def( + "cpu", + [](const std::shared_ptr &self) { + if (platform::is_cpu_place(self->Place())) { + return self; + } else { + auto new_var = self->NewVarBase(platform::CPUPlace(), true); + new_var->SetOverridedStopGradient(self->OverridedStopGradient()); + return new_var; + } + }, + R"DOC( Returns a copy of this Tensor in CPU memory. If this Tensor is already in CPU memory, then no copy is performed and the original Tensor is returned. @@ -1487,24 +1492,25 @@ void BindImperative(py::module *m_ptr) { print(y.place) # CPUPlace )DOC") - .def("pin_memory", - [](const std::shared_ptr &self) { + .def( + "pin_memory", + [](const std::shared_ptr &self) { #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) - PADDLE_THROW(platform::errors::PermissionDenied( - "Cannot copy this Tensor to pinned memory in CPU version " - "Paddle, " - "Please recompile or reinstall Paddle with CUDA support.")); + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot copy this Tensor to pinned memory in CPU version " + "Paddle, " + "Please recompile or reinstall Paddle with CUDA support.")); #endif - if (platform::is_cuda_pinned_place(self->Place())) { - return self; - } else { - auto new_var = - self->NewVarBase(platform::CUDAPinnedPlace(), true); - new_var->SetOverridedStopGradient(self->OverridedStopGradient()); - return new_var; - } - }, - R"DOC( + if (platform::is_cuda_pinned_place(self->Place())) { + return self; + } else { + auto new_var = + self->NewVarBase(platform::CUDAPinnedPlace(), true); + new_var->SetOverridedStopGradient(self->OverridedStopGradient()); + return new_var; + } + }, + R"DOC( Returns a copy of this Tensor in pin memory. If this Tensor is already in pin memory, then no copy is performed and the original Tensor is returned. @@ -1520,13 +1526,14 @@ void BindImperative(py::module *m_ptr) { print(y.place) # CUDAPinnedPlace )DOC") - .def("cuda", - [](const std::shared_ptr &self, - py::handle &handle, bool blocking) { + .def( + "cuda", + [](const std::shared_ptr &self, + py::handle &handle, bool blocking) { #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) - PADDLE_THROW(platform::errors::PermissionDenied( - "Cannot copy this Tensor to GPU in CPU version Paddle, " - "Please recompile or reinstall Paddle with CUDA support.")); + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot copy this Tensor to GPU in CPU version Paddle, " + "Please recompile or reinstall Paddle with CUDA support.")); #else int device_count = platform::GetGPUDeviceCount(); int device_id = 0; @@ -1563,8 +1570,8 @@ void BindImperative(py::module *m_ptr) { return new_var; } #endif - }, - py::arg("device_id") = py::none(), py::arg("blocking") = true, R"DOC( + }, + py::arg("device_id") = py::none(), py::arg("blocking") = true, R"DOC( Returns a copy of this Tensor in GPU memory. 
If this Tensor is already in GPU memory and device_id is default, @@ -1592,49 +1599,51 @@ void BindImperative(py::module *m_ptr) { y = x.cuda(1) print(y.place) # CUDAPlace(1) )DOC") - .def("_share_memory", - [](const std::shared_ptr &self) { + .def( + "_share_memory", + [](const std::shared_ptr &self) { #ifndef _WIN32 - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(self->Place()), true, - platform::errors::InvalidArgument( - "Sharing memory only support CPU Tensor currently")); - // 1. get LoDTensor - auto *t = self->MutableVar()->GetMutable(); - // 2. allocate shared memory - void *data_ptr = t->data(); - size_t data_size = - t->numel() * framework::SizeOfType( - framework::TransToProtoVarType(t->dtype())); - auto shared_writer_holder = - memory::allocation::AllocateMemoryMapWriterAllocation( - data_size); - // 3. maintain mmap fd set & backup ipc_name - const std::string &ipc_name = shared_writer_holder->ipc_name(); - memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name); - // 4. copy data & reset holder - memory::Copy(platform::CPUPlace(), shared_writer_holder->ptr(), - platform::CPUPlace(), data_ptr, data_size); - t->ResetHolder(shared_writer_holder); - return *t; + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(self->Place()), true, + platform::errors::InvalidArgument( + "Sharing memory only support CPU Tensor currently")); + // 1. get LoDTensor + auto *t = self->MutableVar()->GetMutable(); + // 2. allocate shared memory + void *data_ptr = t->data(); + size_t data_size = + t->numel() * framework::SizeOfType( + framework::TransToProtoVarType(t->dtype())); + auto shared_writer_holder = + memory::allocation::AllocateMemoryMapWriterAllocation( + data_size); + // 3. maintain mmap fd set & backup ipc_name + const std::string &ipc_name = shared_writer_holder->ipc_name(); + memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name); + // 4. copy data & reset holder + memory::Copy(platform::CPUPlace(), shared_writer_holder->ptr(), + platform::CPUPlace(), data_ptr, data_size); + t->ResetHolder(shared_writer_holder); + return *t; #else PADDLE_THROW(platform::errors::PermissionDenied( "Sharing memory in Windows OS is not supported currently")); #endif - }, - py::return_value_policy::reference) + }, + py::return_value_policy::reference) #if defined(PADDLE_WITH_CUDA) - .def("_uva", - [](const std::shared_ptr &self, int device_id) { - PADDLE_ENFORCE_EQ(platform::is_cpu_place(self->Place()), true, - platform::errors::InvalidArgument( - "Unified virtual addressing only support " - "CPU Tensor currently.")); - auto *self_tensor = - self->MutableVar()->GetMutable(); - tensor_uva(self_tensor, device_id); - }, - py::arg("device_id") = 0, py::return_value_policy::reference, R"DOC( + .def( + "_uva", + [](const std::shared_ptr &self, int device_id) { + PADDLE_ENFORCE_EQ(platform::is_cpu_place(self->Place()), true, + platform::errors::InvalidArgument( + "Unified virtual addressing only support " + "CPU Tensor currently.")); + auto *self_tensor = + self->MutableVar()->GetMutable(); + tensor_uva(self_tensor, device_id); + }, + py::arg("device_id") = 0, py::return_value_policy::reference, R"DOC( Returns self tensor with the UVA(unified virtual addressing). 
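For orientation, the _uva binding above maps an existing CPU tensor into the device address space. With the CUDA runtime, this kind of unified-virtual-addressing registration typically looks like the following (an illustration only, not Paddle's tensor_uva helper; error handling elided):

#include <cstddef>

#include <cuda_runtime.h>

// Pin the pages behind `host_ptr` and obtain a device-visible alias for them.
void *RegisterUva(void *host_ptr, size_t bytes) {
  cudaHostRegister(host_ptr, bytes, cudaHostRegisterMapped);  // pin + map
  void *device_ptr = nullptr;
  cudaHostGetDevicePointer(&device_ptr, host_ptr, 0);  // device-side alias
  return device_ptr;
}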
Args: @@ -1651,86 +1660,94 @@ void BindImperative(py::module *m_ptr) { )DOC") #endif .def("copy_", &imperative::VarBase::CopyFrom) - .def("_copy_to", - [](const std::shared_ptr &self, - const platform::CPUPlace &place, bool blocking) { - auto new_var = self->NewVarBase(place, blocking); - // Note(zhiqiu): Since NewVarBase may use GpuCopyAsync to - // copy data from the tensor of self to the tensor of new varbase, - // we need to ensure that the varbase self is not destructed until - // the GpuCopyAsync is completed. Otherwise, the memory may be - // freed - // when varbase self is destructed. - // To do that, we increase the reference count of self by 1 and - // add a cuda event to wait the GpuCopyAsync's completion. - if (!blocking) { - IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); - } - return new_var; - }, - py::return_value_policy::copy) - .def("_copy_to", - [](const std::shared_ptr &self, - const platform::CUDAPinnedPlace &place, bool blocking) { - auto new_var = self->NewVarBase(place, blocking); - if (!blocking) { - IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); - } - return new_var; - }, - py::return_value_policy::copy) - .def("_copy_to", - [](const std::shared_ptr &self, - const platform::XPUPlace &place, bool blocking) { - auto new_var = self->NewVarBase(place, blocking); - if (!blocking) { - IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); - } - return new_var; - }, - py::return_value_policy::copy) - .def("_copy_to", - [](const std::shared_ptr &self, - const platform::CUDAPlace &place, bool blocking) { - auto new_var = self->NewVarBase(place, blocking); - if (!blocking) { - IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); - } - return new_var; - }, - py::return_value_policy::copy) - .def("_copy_to", - [](const std::shared_ptr &self, - const platform::NPUPlace &place, bool blocking) { - auto new_var = self->NewVarBase(place, blocking); - if (!blocking) { - IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); - } - return new_var; - }, - py::return_value_policy::copy) - .def("_copy_to", - [](const std::shared_ptr &self, - const platform::MLUPlace &place, bool blocking) { - auto new_var = self->NewVarBase(place, blocking); - if (!blocking) { - IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); - } - return new_var; - }, - py::return_value_policy::copy) - .def("_copy_to", - [](const std::shared_ptr &self, - const platform::Place &place, bool blocking) { - auto new_var = self->NewVarBase(place, blocking); - if (!blocking) { - IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); - } - return new_var; - }, - py::return_value_policy::copy) - .def("value", [](imperative::VarBase &self) { return self.MutableVar(); }, - py::return_value_policy::reference) + .def( + "_copy_to", + [](const std::shared_ptr &self, + const platform::CPUPlace &place, bool blocking) { + auto new_var = self->NewVarBase(place, blocking); + // Note(zhiqiu): Since NewVarBase may use GpuCopyAsync to + // copy data from the tensor of self to the tensor of new varbase, + // we need to ensure that the varbase self is not destructed until + // the GpuCopyAsync is completed. Otherwise, the memory may be + // freed + // when varbase self is destructed. + // To do that, we increase the reference count of self by 1 and + // add a cuda event to wait the GpuCopyAsync's completion. 
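The Note(zhiqiu) comment kept intact above describes a keep-alive idiom: when the copy is asynchronous, the source variable's reference count is raised until the copy event completes, so its memory cannot be freed mid-transfer. Stripped to its essentials (all names hypothetical):

#include <functional>
#include <memory>

struct VarLike { /* tensor payload elided */ };

// Stand-in for an asynchronous device copy; fires `on_done` when the copy
// event completes (invoked inline here only for brevity).
void AsyncCopy(const VarLike &, std::function<void()> on_done) { on_done(); }

std::shared_ptr<VarLike> CopyToPlace(const std::shared_ptr<VarLike> &self,
                                     bool blocking) {
  auto dst = std::make_shared<VarLike>();
  if (!blocking) {
    auto keep_alive = self;  // +1 reference, exactly the trick the note names
    AsyncCopy(*self, [keep_alive] {
      // `keep_alive` dies with this callback, releasing the extra reference.
    });
  }
  return dst;
}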
+ if (!blocking) { + IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); + } + return new_var; + }, + py::return_value_policy::copy) + .def( + "_copy_to", + [](const std::shared_ptr &self, + const platform::CUDAPinnedPlace &place, bool blocking) { + auto new_var = self->NewVarBase(place, blocking); + if (!blocking) { + IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); + } + return new_var; + }, + py::return_value_policy::copy) + .def( + "_copy_to", + [](const std::shared_ptr &self, + const platform::XPUPlace &place, bool blocking) { + auto new_var = self->NewVarBase(place, blocking); + if (!blocking) { + IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); + } + return new_var; + }, + py::return_value_policy::copy) + .def( + "_copy_to", + [](const std::shared_ptr &self, + const platform::CUDAPlace &place, bool blocking) { + auto new_var = self->NewVarBase(place, blocking); + if (!blocking) { + IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); + } + return new_var; + }, + py::return_value_policy::copy) + .def( + "_copy_to", + [](const std::shared_ptr &self, + const platform::NPUPlace &place, bool blocking) { + auto new_var = self->NewVarBase(place, blocking); + if (!blocking) { + IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); + } + return new_var; + }, + py::return_value_policy::copy) + .def( + "_copy_to", + [](const std::shared_ptr &self, + const platform::MLUPlace &place, bool blocking) { + auto new_var = self->NewVarBase(place, blocking); + if (!blocking) { + IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); + } + return new_var; + }, + py::return_value_policy::copy) + .def( + "_copy_to", + [](const std::shared_ptr &self, + const platform::Place &place, bool blocking) { + auto new_var = self->NewVarBase(place, blocking); + if (!blocking) { + IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); + } + return new_var; + }, + py::return_value_policy::copy) + .def( + "value", [](imperative::VarBase &self) { return self.MutableVar(); }, + py::return_value_policy::reference) .def("_clear", [](const std::shared_ptr &self) { auto *t = self->MutableVar()->GetMutable(); @@ -1842,39 +1859,28 @@ void BindImperative(py::module *m_ptr) { &imperative::VarBase::SetOverridedStopGradient) .def_property("persistable", &imperative::VarBase::Persistable, &imperative::VarBase::SetPersistable) - .def_property_readonly("shape", - [](imperative::VarBase &self) { - if (self.Var().IsType()) { - return phi::vectorize( - self.Var() - .Get() - .dims()); - } else if (self.Var() - .IsType()) { - return phi::vectorize( - self.Var() - .Get() - .value() - .dims()); - } else if (self.Var() - .IsType()) { - return std::vector{static_cast( - self.Var() - .Get() - .size())}; - } else if (self.Var() - .IsType()) { - return std::vector{static_cast( - self.Var() - .Get() - .size())}; - } else { - VLOG(2) << "It is meaningless to get shape of " - "variable type " - << GetTypeName(self); - return std::vector(); - } - }) + .def_property_readonly( + "shape", + [](imperative::VarBase &self) { + if (self.Var().IsType()) { + return phi::vectorize( + self.Var().Get().dims()); + } else if (self.Var().IsType()) { + return phi::vectorize( + self.Var().Get().value().dims()); + } else if (self.Var().IsType()) { + return std::vector{static_cast( + self.Var().Get().size())}; + } else if (self.Var().IsType()) { + return std::vector{ + static_cast(self.Var().Get().size())}; + } else { + VLOG(2) << "It is meaningless to get shape of " + "variable type " + << GetTypeName(self); 
+ return std::vector(); + } + }) .def_property_readonly("is_leaf", &imperative::VarBase::IsLeaf, R"DOC( Whether a Tensor is leaf Tensor. @@ -2157,13 +2163,14 @@ void BindImperative(py::module *m_ptr) { [](imperative::ParallelStrategy &self, int nranks) { self.nranks_ = nranks; }) - .def_property("local_rank", - [](const imperative::ParallelStrategy &self) { - return self.local_rank_; - }, - [](imperative::ParallelStrategy &self, int local_rank) { - self.local_rank_ = local_rank; - }) + .def_property( + "local_rank", + [](const imperative::ParallelStrategy &self) { + return self.local_rank_; + }, + [](imperative::ParallelStrategy &self, int local_rank) { + self.local_rank_ = local_rank; + }) .def_property( "trainer_endpoints", [](const imperative::ParallelStrategy &self) { @@ -2172,12 +2179,14 @@ void BindImperative(py::module *m_ptr) { [](imperative::ParallelStrategy &self, std::vector eps) { self.trainer_endpoints_ = eps; }) - .def_property("current_endpoint", - [](const imperative::ParallelStrategy &self) { - return self.current_endpoint_; - }, - [](imperative::ParallelStrategy &self, - const std::string &ep) { self.current_endpoint_ = ep; }) + .def_property( + "current_endpoint", + [](const imperative::ParallelStrategy &self) { + return self.current_endpoint_; + }, + [](imperative::ParallelStrategy &self, const std::string &ep) { + self.current_endpoint_ = ep; + }) .def_property( "nrings", [](const imperative::ParallelStrategy &self) { return self.nrings_; }, @@ -2359,43 +2368,44 @@ void BindImperative(py::module *m_ptr) { }); #if defined(PADDLE_WITH_CUDA) - m.def("to_uva_tensor", - [](const py::object &obj, int device_id) { - const auto &tracer = imperative::GetCurrentTracer(); - auto new_tensor = std::shared_ptr( - new imperative::VarBase(tracer->GenerateUniqueName())); - auto array = obj.cast(); - if (py::isinstance>(array)) { - SetUVATensorFromPyArray(new_tensor, array, device_id); - } else if (py::isinstance>(array)) { - SetUVATensorFromPyArray(new_tensor, array, device_id); - } else if (py::isinstance>(array)) { - SetUVATensorFromPyArray(new_tensor, array, device_id); - } else if (py::isinstance>(array)) { - SetUVATensorFromPyArray(new_tensor, array, device_id); - } else if (py::isinstance>(array)) { - SetUVATensorFromPyArray(new_tensor, array, device_id); - } else if (py::isinstance>(array)) { - SetUVATensorFromPyArray(new_tensor, array, device_id); - } else if (py::isinstance>( - array)) { - SetUVATensorFromPyArray( - new_tensor, array, device_id); - } else if (py::isinstance>(array)) { - SetUVATensorFromPyArray(new_tensor, array, device_id); - } else { - // obj may be any type, obj.cast() may be failed, - // then the array.dtype will be string of unknown meaning. - PADDLE_THROW(platform::errors::InvalidArgument( - "Input object type error or incompatible array data type. 
" - "tensor.set() supports array with bool, float16, float32, " - "float64, int8, int16, int32, int64," - "please check your input or input array data type.")); - } - return new_tensor; - }, - py::arg("obj"), py::arg("device_id") = 0, - py::return_value_policy::reference, R"DOC( + m.def( + "to_uva_tensor", + [](const py::object &obj, int device_id) { + const auto &tracer = imperative::GetCurrentTracer(); + auto new_tensor = std::shared_ptr( + new imperative::VarBase(tracer->GenerateUniqueName())); + auto array = obj.cast(); + if (py::isinstance>(array)) { + SetUVATensorFromPyArray(new_tensor, array, device_id); + } else if (py::isinstance>(array)) { + SetUVATensorFromPyArray(new_tensor, array, device_id); + } else if (py::isinstance>(array)) { + SetUVATensorFromPyArray(new_tensor, array, device_id); + } else if (py::isinstance>(array)) { + SetUVATensorFromPyArray(new_tensor, array, device_id); + } else if (py::isinstance>(array)) { + SetUVATensorFromPyArray(new_tensor, array, device_id); + } else if (py::isinstance>(array)) { + SetUVATensorFromPyArray(new_tensor, array, device_id); + } else if (py::isinstance>( + array)) { + SetUVATensorFromPyArray(new_tensor, array, + device_id); + } else if (py::isinstance>(array)) { + SetUVATensorFromPyArray(new_tensor, array, device_id); + } else { + // obj may be any type, obj.cast() may be failed, + // then the array.dtype will be string of unknown meaning. + PADDLE_THROW(platform::errors::InvalidArgument( + "Input object type error or incompatible array data type. " + "tensor.set() supports array with bool, float16, float32, " + "float64, int8, int16, int32, int64," + "please check your input or input array data type.")); + } + return new_tensor; + }, + py::arg("obj"), py::arg("device_id") = 0, + py::return_value_policy::reference, R"DOC( Returns tensor with the UVA(unified virtual addressing) created from numpy array. Args: diff --git a/paddle/fluid/pybind/imperative.h b/paddle/fluid/pybind/imperative.h index 0e3e98512d6..91b92944215 100644 --- a/paddle/fluid/pybind/imperative.h +++ b/paddle/fluid/pybind/imperative.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include "pybind11/pybind11.h" #include "pybind11/stl.h" diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index d4c19364d48..d6ffbf01001 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -13,8 +13,10 @@ // limitations under the License. 
#include "paddle/fluid/pybind/inference_api.h" + #include #include + #include #include #include @@ -26,6 +28,7 @@ #include #include #include + #include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_infer_contrib.h" @@ -75,8 +78,8 @@ using paddle::AnalysisPredictor; using paddle::NativeConfig; using paddle::NativePaddlePredictor; using paddle::PaddleBuf; -using paddle::PaddleDType; using paddle::PaddleDataLayout; +using paddle::PaddleDType; using paddle::PaddlePassBuilder; using paddle::PaddlePlace; using paddle::PaddlePredictor; @@ -379,13 +382,13 @@ void BindInferenceApi(py::module *m) { &paddle::CreatePaddlePredictor, py::arg("config")); m->def("create_paddle_predictor", &paddle::CreatePaddlePredictor, py::arg("config")); - m->def("create_predictor", [](const paddle_infer::Config &config) - -> std::unique_ptr { - auto pred = - std::unique_ptr( - new paddle_infer::Predictor(config)); - return pred; - }); + m->def("create_predictor", + [](const paddle_infer::Config &config) + -> std::unique_ptr { + auto pred = std::unique_ptr( + new paddle_infer::Predictor(config)); + return pred; + }); m->def("copy_tensor", &CopyPaddleInferTensor); m->def("paddle_dtype_size", &paddle::PaddleDtypeSize); m->def("paddle_tensor_to_bytes", &SerializePDTensorToBytes); @@ -578,11 +581,11 @@ void BindAnalysisConfig(py::module *m) { .def(py::init()) .def(py::init()) .def("summary", &AnalysisConfig::Summary) - .def("set_model", (void (AnalysisConfig::*)(const std::string &)) & - AnalysisConfig::SetModel) - .def("set_model", (void (AnalysisConfig::*)(const std::string &, - const std::string &)) & + .def("set_model", (void(AnalysisConfig::*)(const std::string &)) & AnalysisConfig::SetModel) + .def("set_model", + (void(AnalysisConfig::*)(const std::string &, const std::string &)) & + AnalysisConfig::SetModel) .def("set_prog_file", &AnalysisConfig::SetProgFile) .def("set_params_file", &AnalysisConfig::SetParamsFile) .def("model_dir", &AnalysisConfig::model_dir) @@ -716,11 +719,12 @@ void BindAnalysisConfig(py::module *m) { [](AnalysisConfig &self, const std::string &pass) { self.pass_builder()->DeletePass(pass); }) - .def("pass_builder", - [](AnalysisConfig &self) { - return dynamic_cast(self.pass_builder()); - }, - py::return_value_policy::reference) + .def( + "pass_builder", + [](AnalysisConfig &self) { + return dynamic_cast(self.pass_builder()); + }, + py::return_value_policy::reference) .def("nnadapter", &AnalysisConfig::NNAdapter) .def("set_dist_config", &AnalysisConfig::SetDistConfig) .def("dist_config", &AnalysisConfig::dist_config); diff --git a/paddle/fluid/pybind/io.cc b/paddle/fluid/pybind/io.cc index a7222abf45c..c8806962421 100644 --- a/paddle/fluid/pybind/io.cc +++ b/paddle/fluid/pybind/io.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/pybind/io.h" + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/pybind/io.h b/paddle/fluid/pybind/io.h index 942c93deccf..7f10306e919 100644 --- a/paddle/fluid/pybind/io.h +++ b/paddle/fluid/pybind/io.h @@ -20,6 +20,7 @@ typedef SSIZE_T ssize_t; #endif #include + #include "paddle/fluid/pybind/pybind_boost_headers.h" namespace paddle { diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc index ecbacd37d56..ef005ee8b10 100644 --- a/paddle/fluid/pybind/ir.cc +++ b/paddle/fluid/pybind/ir.cc @@ -13,12 +13,14 @@ // limitations under the License. #include "paddle/fluid/pybind/ir.h" + #include #include #include #include #include #include + #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -31,18 +33,18 @@ #include "pybind11/stl.h" namespace py = pybind11; -using paddle::framework::ir::Graph; -using paddle::framework::ir::Node; -using paddle::framework::ir::NodeComp; -using paddle::framework::ir::GraphSafeRemoveNodes; -using paddle::framework::ir::HasCircle; -using paddle::framework::ir::GraphNum; -using paddle::framework::ir::TopologySortOperations; -using paddle::framework::ir::BuildOperationAdjList; using paddle::framework::OpDesc; using paddle::framework::ProgramDesc; using paddle::framework::Scope; using paddle::framework::VarDesc; +using paddle::framework::ir::BuildOperationAdjList; +using paddle::framework::ir::Graph; +using paddle::framework::ir::GraphNum; +using paddle::framework::ir::GraphSafeRemoveNodes; +using paddle::framework::ir::HasCircle; +using paddle::framework::ir::Node; +using paddle::framework::ir::NodeComp; +using paddle::framework::ir::TopologySortOperations; using pybind11::return_value_policy; namespace paddle { @@ -104,16 +106,18 @@ void BindGraph(py::module *m) { }) .def("erase", &Graph::Erase) .def("nodes", &Graph::Nodes, return_value_policy::reference) - .def("create_var_node", - [](Graph &self, VarDesc &var_desc) { - return self.CreateVarNode(&var_desc); - }, - return_value_policy::reference) - .def("create_op_node", - [](Graph &self, OpDesc &op_desc) { - return self.CreateOpNode(&op_desc); - }, - return_value_policy::reference) + .def( + "create_var_node", + [](Graph &self, VarDesc &var_desc) { + return self.CreateVarNode(&var_desc); + }, + return_value_policy::reference) + .def( + "create_op_node", + [](Graph &self, OpDesc &op_desc) { + return self.CreateOpNode(&op_desc); + }, + return_value_policy::reference) .def("create_control_dep_var", &Graph::CreateControlDepVar, return_value_policy::reference) .def("create_empty_node", &Graph::CreateEmptyNode, diff --git a/paddle/fluid/pybind/ir.h b/paddle/fluid/pybind/ir.h index 2cc1459bbe0..ad2d6aa11bf 100644 --- a/paddle/fluid/pybind/ir.h +++ b/paddle/fluid/pybind/ir.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/ir/graph.h" namespace paddle { diff --git a/paddle/fluid/pybind/op_function_common.cc b/paddle/fluid/pybind/op_function_common.cc index 8b9b98eba12..a3c6fa14765 100644 --- a/paddle/fluid/pybind/op_function_common.cc +++ b/paddle/fluid/pybind/op_function_common.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "paddle/fluid/pybind/op_function_common.h" + #include #include #include @@ -28,7 +30,6 @@ #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/pybind/imperative.h" -#include "paddle/fluid/pybind/op_function_common.h" namespace py = pybind11; namespace paddle { diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 66bf8c95179..329b3b83337 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -76,11 +76,12 @@ void BindProgramDesc(pybind11::module *m) { platform::errors::InvalidArgument( "Failed to parse ProgramDesc from binary string.")); }) - .def("_set_version", - [](pd::ProgramDesc &self, int64_t version) { - return self.SetVersion(version); - }, - pybind11::arg("version") = pd::kCurProgramVersion) + .def( + "_set_version", + [](pd::ProgramDesc &self, int64_t version) { + return self.SetVersion(version); + }, + pybind11::arg("version") = pd::kCurProgramVersion) .def("_version", [](pd::ProgramDesc &self) -> int64_t { return self.Version(); }) .def("get_op_deps", [](const framework::ProgramDesc &program) { @@ -113,18 +114,20 @@ void BindBlockDesc(pybind11::module *m) { .def("_insert_op", &pd::BlockDesc::InsertOp, pybind11::return_value_policy::reference) .def("_remove_op", &pd::BlockDesc::RemoveOp) - .def("var", - [](pd::BlockDesc &self, pybind11::bytes byte_name) { - std::string name = byte_name; - return self.Var(name); - }, - pybind11::return_value_policy::reference) - .def("has_var", - [](pd::BlockDesc &self, pybind11::bytes byte_name) { - std::string name = byte_name; - return self.HasVar(name); - }, - pybind11::return_value_policy::reference) + .def( + "var", + [](pd::BlockDesc &self, pybind11::bytes byte_name) { + std::string name = byte_name; + return self.Var(name); + }, + pybind11::return_value_policy::reference) + .def( + "has_var", + [](pd::BlockDesc &self, pybind11::bytes byte_name) { + std::string name = byte_name; + return self.HasVar(name); + }, + pybind11::return_value_policy::reference) .def("_rename_var", [](pd::BlockDesc &self, const pybind11::bytes &byte_name, const pybind11::bytes &byte_name_new) { @@ -137,24 +140,27 @@ void BindBlockDesc(pybind11::module *m) { std::string name = byte_name; return self.HasVarRecursive(name); }) - .def("find_var", - [](pd::BlockDesc &self, pybind11::bytes byte_name) { - std::string name = byte_name; - return self.FindVar(name); - }, - pybind11::return_value_policy::reference) - .def("find_var_recursive", - [](pd::BlockDesc &self, pybind11::bytes byte_name) { - std::string name = byte_name; - return self.FindVarRecursive(name); - }, - pybind11::return_value_policy::reference) - .def("_remove_var", - [](pd::BlockDesc &self, pybind11::bytes byte_name) { - std::string name = byte_name; - return self.RemoveVar(name); - }, - pybind11::return_value_policy::reference) + .def( + "find_var", + [](pd::BlockDesc &self, pybind11::bytes byte_name) { + std::string name = byte_name; + return self.FindVar(name); + }, + pybind11::return_value_policy::reference) + .def( + "find_var_recursive", + [](pd::BlockDesc &self, pybind11::bytes byte_name) { + std::string name = byte_name; + return self.FindVarRecursive(name); + }, + pybind11::return_value_policy::reference) + .def( + "_remove_var", + [](pd::BlockDesc &self, pybind11::bytes byte_name) { + std::string name = byte_name; + return self.RemoveVar(name); + }, + pybind11::return_value_policy::reference) .def("all_vars", &pd::BlockDesc::AllVars, 
pybind11::return_value_policy::reference) .def("op_size", &pd::BlockDesc::OpSize) @@ -258,8 +264,9 @@ void BindOpDesc(pybind11::module *m) { pybind11::class_ op_desc(*m, "OpDesc", ""); op_desc - .def("__init__", [](pd::OpDesc &self) { new (&self) pd::OpDesc(); }, - pybind11::return_value_policy::reference) + .def( + "__init__", [](pd::OpDesc &self) { new (&self) pd::OpDesc(); }, + pybind11::return_value_policy::reference) .def("copy_from", &pd::OpDesc::CopyFrom) .def("type", &pd::OpDesc::Type) .def("set_type", &pd::OpDesc::SetType) @@ -304,8 +311,9 @@ void BindOpDesc(pybind11::module *m) { .def("infer_var_type", &pd::OpDesc::InferVarType) .def("set_is_target", &pd::OpDesc::SetIsTarget) .def("serialize_to_string", SerializeMessage) - .def("block", [](pd::OpDesc &self) { return self.Block(); }, - pybind11::return_value_policy::reference) + .def( + "block", [](pd::OpDesc &self) { return self.Block(); }, + pybind11::return_value_policy::reference) .def("id", &pd::OpDesc::Id) .def("original_id", &pd::OpDesc::OriginalId) .def("set_original_id", &pd::OpDesc::SetOriginalId) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index d1c2b28dc80..cba7d036235 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -693,56 +693,56 @@ PYBIND11_MODULE(core_noavx, m) { m.def("_get_use_default_grad_op_desc_maker_ops", [] { return OpInfoMap::Instance().GetUseDefaultGradOpDescMakerOps(); }); - m.def("_get_all_register_op_kernels", - [](const std::string &lib) { - std::unordered_map> - all_kernels_info; - if (lib == "fluid" || lib == "all") { - auto &all_kernels = - paddle::framework::OperatorWithKernel::AllOpKernels(); - - for (auto &kernel_pair : all_kernels) { - auto op_type = kernel_pair.first; - std::vector kernel_types; - for (auto &info_pair : kernel_pair.second) { - paddle::framework::OpKernelType kernel_type = info_pair.first; - kernel_types.emplace_back( - paddle::framework::KernelTypeToString(kernel_type)); - } - all_kernels_info.emplace(op_type, kernel_types); + m.def( + "_get_all_register_op_kernels", + [](const std::string &lib) { + std::unordered_map> + all_kernels_info; + if (lib == "fluid" || lib == "all") { + auto &all_kernels = + paddle::framework::OperatorWithKernel::AllOpKernels(); + + for (auto &kernel_pair : all_kernels) { + auto op_type = kernel_pair.first; + std::vector kernel_types; + for (auto &info_pair : kernel_pair.second) { + paddle::framework::OpKernelType kernel_type = info_pair.first; + kernel_types.emplace_back( + paddle::framework::KernelTypeToString(kernel_type)); } + all_kernels_info.emplace(op_type, kernel_types); } - if (lib == "phi" || lib == "all") { - auto phi_kernels = phi::KernelFactory::Instance().kernels(); - for (auto &kernel_pair : phi_kernels) { - auto op_type = phi::TransToFluidOpName(kernel_pair.first); - std::vector kernel_types; - for (auto &info_pair : kernel_pair.second) { - framework::OpKernelType kernel_type = - framework::TransPhiKernelKeyToOpKernelType(info_pair.first); - auto kernel_type_str = - framework::KernelTypeToString(kernel_type); - if (all_kernels_info.count(op_type)) { - if (std::find(all_kernels_info[op_type].begin(), - all_kernels_info[op_type].end(), - kernel_type_str) == - all_kernels_info[op_type].end()) { - all_kernels_info[op_type].emplace_back(kernel_type_str); - } - } else { - kernel_types.emplace_back(kernel_type_str); + } + if (lib == "phi" || lib == "all") { + auto phi_kernels = phi::KernelFactory::Instance().kernels(); + for (auto &kernel_pair : phi_kernels) { + auto op_type = 
phi::TransToFluidOpName(kernel_pair.first); + std::vector kernel_types; + for (auto &info_pair : kernel_pair.second) { + framework::OpKernelType kernel_type = + framework::TransPhiKernelKeyToOpKernelType(info_pair.first); + auto kernel_type_str = framework::KernelTypeToString(kernel_type); + if (all_kernels_info.count(op_type)) { + if (std::find(all_kernels_info[op_type].begin(), + all_kernels_info[op_type].end(), + kernel_type_str) == + all_kernels_info[op_type].end()) { + all_kernels_info[op_type].emplace_back(kernel_type_str); } - } - if (!kernel_types.empty()) { - all_kernels_info.emplace(op_type, kernel_types); + } else { + kernel_types.emplace_back(kernel_type_str); } } + if (!kernel_types.empty()) { + all_kernels_info.emplace(op_type, kernel_types); + } } + } - return all_kernels_info; - }, - py::arg("lib") = "all", - R"DOC( + return all_kernels_info; + }, + py::arg("lib") = "all", + R"DOC( Return the registered kernels in paddle. Args: @@ -1011,9 +1011,10 @@ PYBIND11_MODULE(core_noavx, m) { t.set(np.ndarray([5, 30]), fluid.CPUPlace()) )DOC") - .def("shape", - [](framework::Tensor &self) { return vectorize(self.dims()); }, - R"DOC( + .def( + "shape", + [](framework::Tensor &self) { return vectorize(self.dims()); }, + R"DOC( Return the shape of Tensor. Returns: @@ -1101,20 +1102,21 @@ PYBIND11_MODULE(core_noavx, m) { // avoid misuse. // The discussion is here: // https://github.com/PaddlePaddle/Paddle/issues/10855 - .def("set_lod", - [](framework::Tensor &self, - const std::vector> &lod) { - // the input lod is offset-based level-of-detail info - LoD new_lod; - new_lod.reserve(lod.size()); - std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); - PADDLE_ENFORCE_EQ( - CheckLoD(new_lod, vectorize(self.dims()).front()), true, - platform::errors::InvalidArgument( - "The provided LoD is invalid, the LoD is %s", new_lod)); - self.set_lod(new_lod); - }, - py::arg("lod"), R"DOC( + .def( + "set_lod", + [](framework::Tensor &self, + const std::vector> &lod) { + // the input lod is offset-based level-of-detail info + LoD new_lod; + new_lod.reserve(lod.size()); + std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); + PADDLE_ENFORCE_EQ( + CheckLoD(new_lod, vectorize(self.dims()).front()), true, + platform::errors::InvalidArgument( + "The provided LoD is invalid, the LoD is %s", new_lod)); + self.set_lod(new_lod); + }, + py::arg("lod"), R"DOC( Set LoD of the Tensor. 
Args: @@ -1134,28 +1136,29 @@ PYBIND11_MODULE(core_noavx, m) { t.set_lod([[0, 2, 5]]) print(t.lod()) # [[0, 2, 5]] )DOC") - .def("set_recursive_sequence_lengths", - [](framework::Tensor &self, const std::vector> - &recursive_sequence_lengths) { - // the input recursive_sequence_lengths is length-based - // level-of-detail info - LoD new_lod; - new_lod.reserve(recursive_sequence_lengths.size()); - std::copy(recursive_sequence_lengths.begin(), - recursive_sequence_lengths.end(), - std::back_inserter(new_lod)); - LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod); - PADDLE_ENFORCE_EQ( - CheckLoD(new_offset_lod, vectorize(self.dims()).front()), true, - platform::errors::InvalidArgument( - "The provided recursive_sequence_lengths info is " - "invalid, " - "the LoD converted by recursive_sequence_lengths is " - "%s", - new_lod)); - self.set_lod(new_offset_lod); - }, - py::arg("recursive_sequence_lengths"), R"DOC( + .def( + "set_recursive_sequence_lengths", + [](framework::Tensor &self, const std::vector> + &recursive_sequence_lengths) { + // the input recursive_sequence_lengths is length-based + // level-of-detail info + LoD new_lod; + new_lod.reserve(recursive_sequence_lengths.size()); + std::copy(recursive_sequence_lengths.begin(), + recursive_sequence_lengths.end(), + std::back_inserter(new_lod)); + LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod); + PADDLE_ENFORCE_EQ( + CheckLoD(new_offset_lod, vectorize(self.dims()).front()), true, + platform::errors::InvalidArgument( + "The provided recursive_sequence_lengths info is " + "invalid, " + "the LoD converted by recursive_sequence_lengths is " + "%s", + new_lod)); + self.set_lod(new_offset_lod); + }, + py::arg("recursive_sequence_lengths"), R"DOC( Set LoD of the Tensor according to recursive sequence lengths. For example, if recursive_sequence_lengths=[[2, 3]], which means @@ -1180,16 +1183,17 @@ PYBIND11_MODULE(core_noavx, m) { print(t.recursive_sequence_lengths()) # [[2, 3]] print(t.lod()) # [[0, 2, 5]] )DOC") - .def("lod", - [](framework::Tensor &self) -> std::vector> { - // output the offset-based lod info - LoD lod = self.lod(); - std::vector> new_lod; - new_lod.reserve(lod.size()); - std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); - return new_lod; - }, - R"DOC( + .def( + "lod", + [](framework::Tensor &self) -> std::vector> { + // output the offset-based lod info + LoD lod = self.lod(); + std::vector> new_lod; + new_lod.reserve(lod.size()); + std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); + return new_lod; + }, + R"DOC( Return the LoD of the Tensor. Returns: @@ -1207,16 +1211,17 @@ PYBIND11_MODULE(core_noavx, m) { print(t.lod()) # [[0, 2, 5]] )DOC") // Set above comments of set_lod. - .def("recursive_sequence_lengths", - [](framework::Tensor &self) -> std::vector> { - // output the length-based lod info - LoD lod = phi::ConvertToLengthBasedLoD(self.lod()); - std::vector> new_lod; - new_lod.reserve(lod.size()); - std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); - return new_lod; - }, - R"DOC( + .def( + "recursive_sequence_lengths", + [](framework::Tensor &self) -> std::vector> { + // output the length-based lod info + LoD lod = phi::ConvertToLengthBasedLoD(self.lod()); + std::vector> new_lod; + new_lod.reserve(lod.size()); + std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); + return new_lod; + }, + R"DOC( Return the recursive sequence lengths corresponding to of the LodD of the Tensor. 
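The two bindings above round-trip between Paddle's LoD encodings: set_recursive_sequence_lengths accepts per-sequence lengths and converts them to offsets via ConvertToOffsetBasedLoD before storing, while recursive_sequence_lengths converts the stored offsets back to lengths. A minimal sketch of the length-to-offset direction, assuming an illustrative helper name rather than Paddle's actual implementation:

// Length-based -> offset-based LoD: each offset level is the running
// prefix sum of the corresponding length level, starting at 0.
#include <cstddef>
#include <vector>

std::vector<std::vector<size_t>> LengthsToOffsets(
    const std::vector<std::vector<size_t>>& lengths) {
  std::vector<std::vector<size_t>> offsets;
  offsets.reserve(lengths.size());
  for (const auto& level : lengths) {
    std::vector<size_t> offset_level(1, 0);  // every level starts at offset 0
    for (size_t len : level) {
      // next offset = previous offset + this sequence's length
      offset_level.push_back(offset_level.back() + len);
    }
    offsets.push_back(std::move(offset_level));
  }
  return offsets;
}

// LengthsToOffsets({{2, 3}}) == {{0, 2, 5}}, matching the docstring example.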
@@ -1234,13 +1239,14 @@ PYBIND11_MODULE(core_noavx, m) { t.set_recursive_sequence_lengths([[2, 3]]) print(t.recursive_sequence_lengths()) # [[2, 3]] )DOC") - .def("has_valid_recursive_sequence_lengths", - [](framework::Tensor &self) -> bool { - // Check that the lod info is valid and match the outermost - // dimension of the Tensor data - return CheckLoD(self.lod(), vectorize(self.dims()).front()); - }, - R"DOC( + .def( + "has_valid_recursive_sequence_lengths", + [](framework::Tensor &self) -> bool { + // Check that the lod info is valid and match the outermost + // dimension of the Tensor data + return CheckLoD(self.lod(), vectorize(self.dims()).front()); + }, + R"DOC( Check whether the LoD of the Tensor is valid. Returns: @@ -1624,9 +1630,10 @@ PYBIND11_MODULE(core_noavx, m) { const int64_t &height) { new (&instance) phi::SelectedRows(rows, height); }) - .def("get_tensor", - [](phi::SelectedRows &self) { return self.mutable_value(); }, - py::return_value_policy::reference) + .def( + "get_tensor", + [](phi::SelectedRows &self) { return self.mutable_value(); }, + py::return_value_policy::reference) .def("numel", [](phi::SelectedRows &self) -> int64_t { return self.value().numel(); @@ -1668,11 +1675,12 @@ All parameter, weight, gradient are variables in Paddle. }) .def("get_float", [](const Variable &var) -> float { return var.Get(); }) - .def("get_tensor", - [](Variable &self) -> LoDTensor * { - return self.GetMutable(); - }, - py::return_value_policy::reference) + .def( + "get_tensor", + [](Variable &self) -> LoDTensor * { + return self.GetMutable(); + }, + py::return_value_policy::reference) .def("get_bytes", [](Variable &self) { return py::bytes(*self.GetMutable()); @@ -1683,53 +1691,60 @@ All parameter, weight, gradient are variables in Paddle. 
}) .def("set_vocab", [](Variable &self, Vocab vocab) { *self.GetMutable() = vocab; }) - .def("get_string_tensor", - [](Variable &self) { return self.GetMutable(); }, - py::return_value_policy::reference) - .def("get_map_tensor", - [](Variable &self) { return self.GetMutable(); }, - py::return_value_policy::reference) - .def("get_lod_rank_table", - [](Variable &self) { return self.GetMutable(); }, - py::return_value_policy::reference) - .def("get_selected_rows", - [](Variable &self) -> phi::SelectedRows * { - return self.GetMutable(); - }, - py::return_value_policy::reference) - .def("get_lod_tensor_array", - [](Variable &self) { return self.GetMutable(); }, - py::return_value_policy::reference) - .def("get_fetch_list", - [](Variable &self) { return self.GetMutable(); }, - py::return_value_policy::reference) + .def( + "get_string_tensor", + [](Variable &self) { return self.GetMutable(); }, + py::return_value_policy::reference) + .def( + "get_map_tensor", + [](Variable &self) { return self.GetMutable(); }, + py::return_value_policy::reference) + .def( + "get_lod_rank_table", + [](Variable &self) { return self.GetMutable(); }, + py::return_value_policy::reference) + .def( + "get_selected_rows", + [](Variable &self) -> phi::SelectedRows * { + return self.GetMutable(); + }, + py::return_value_policy::reference) + .def( + "get_lod_tensor_array", + [](Variable &self) { return self.GetMutable(); }, + py::return_value_policy::reference) + .def( + "get_fetch_list", + [](Variable &self) { return self.GetMutable(); }, + py::return_value_policy::reference) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - .def("get_communicator", - [](Variable &self) -> platform::Communicator * { - return self.GetMutable(); - }, - py::return_value_policy::reference) + .def( + "get_communicator", + [](Variable &self) -> platform::Communicator * { + return self.GetMutable(); + }, + py::return_value_policy::reference) #endif - .def("get_reader", - [](Variable &self) -> framework::ReaderHolder * { - PADDLE_ENFORCE_EQ( - self.IsType(), true, - platform::errors::InvalidArgument( - "The variable is not type of ReaderHolder.")); - return self.GetMutable(); - }, - py::return_value_policy::reference) - .def("get_scope", - [](Variable &self) -> Scope * { - auto scope_vec = - self.GetMutable>(); - PADDLE_ENFORCE_GT( - scope_vec->size(), 0, - platform::errors::InvalidArgument( - "The size of scope_vec should be greater than 0")); - return scope_vec->front(); - }, - py::return_value_policy::reference) + .def( + "get_reader", + [](Variable &self) -> framework::ReaderHolder * { + PADDLE_ENFORCE_EQ(self.IsType(), true, + platform::errors::InvalidArgument( + "The variable is not type of ReaderHolder.")); + return self.GetMutable(); + }, + py::return_value_policy::reference) + .def( + "get_scope", + [](Variable &self) -> Scope * { + auto scope_vec = self.GetMutable>(); + PADDLE_ENFORCE_GT( + scope_vec->size(), 0, + platform::errors::InvalidArgument( + "The size of scope_vec should be greater than 0")); + return scope_vec->front(); + }, + py::return_value_policy::reference) .def("set_scope", [](Variable &self, Scope &scope) { auto scope_vec = self.GetMutable>(); scope_vec->emplace_back(&scope); @@ -1762,12 +1777,13 @@ All parameter, weight, gradient are variables in Paddle. 
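Every Variable getter in the hunk above follows the same pattern: a lambda hands back a raw pointer into state the Variable owns, and py::return_value_policy::reference tells pybind11 that Python only borrows it and must never delete it. The reformatting itself reflects clang-format's rule that once a lambda appears among the arguments, the Python-visible name moves to its own line after .def(. A hedged, self-contained sketch of the pattern; Holder and Inner are illustrative types, not Paddle's:

#include <pybind11/pybind11.h>

namespace py = pybind11;

struct Inner {
  int value = 0;
};

struct Holder {
  Inner inner;  // owned by Holder for its whole lifetime
};

PYBIND11_MODULE(example, m) {
  py::class_<Inner>(m, "Inner").def_readwrite("value", &Inner::value);
  py::class_<Holder>(m, "Holder")
      .def(py::init<>())
      .def(
          "get_inner",
          [](Holder& self) -> Inner* { return &self.inner; },
          // Python borrows the pointer; Holder keeps ownership.
          py::return_value_policy::reference);
}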
_Scope .def("_remove_from_pool", [](Scope &self) { ScopePool::Instance().Remove(&self); }) - .def("var", - [](Scope &self, const std::string &name) -> Variable * { - return self.Var(name); - }, - py::arg("name"), - R"DOC( + .def( + "var", + [](Scope &self, const std::string &name) -> Variable * { + return self.Var(name); + }, + py::arg("name"), + R"DOC( Find or create variable named :code:`name` in the current scope. If the variable named :code:`name` does not exist in the @@ -1780,7 +1796,7 @@ All parameter, weight, gradient are variables in Paddle. Returns: out (core.Variable): the found or created variable. )DOC", - py::return_value_policy::reference) + py::return_value_policy::reference) .def("find_var", &Scope::FindVar, py::arg("name"), R"DOC( Find variable named :code:`name` in the current scope or @@ -1806,33 +1822,35 @@ All parameter, weight, gradient are variables in Paddle. None )DOC", py::return_value_policy::reference) - .def("new_scope", [](Scope &self) -> Scope * { return &self.NewScope(); }, - R"DOC( + .def( + "new_scope", [](Scope &self) -> Scope * { return &self.NewScope(); }, + R"DOC( Create a new sub-scope of the current scope. Returns: out (core._Scope): the created sub-scope. )DOC", - py::return_value_policy::reference) + py::return_value_policy::reference) .def("drop_kids", &Scope::DropKids, R"DOC( Delete all sub-scopes of the current scope. )DOC") .def("_kids", &Scope::kids); - m.def("Scope", - []() -> Scope * { - auto *s = new Scope(); - ScopePool::Instance().Insert(std::unique_ptr(s)); - return s; - }, - R"DOC( + m.def( + "Scope", + []() -> Scope * { + auto *s = new Scope(); + ScopePool::Instance().Insert(std::unique_ptr(s)); + return s; + }, + R"DOC( Create a new scope. Returns: out (core._Scope): the created scope. )DOC", - py::return_value_policy::reference); + py::return_value_policy::reference); //! @note: Be careful! PyBind will return std::string as an unicode, not //! Python str. If you want a str object, you should cast them in Python. @@ -1919,11 +1937,12 @@ All parameter, weight, gradient are variables in Paddle. return std::make_tuple(ProgramDesc(pruned_desc), pruned_origin_block_id_map); }); - m.def("prune_backward", - [](const framework::ProgramDesc &program) { - return PruneBackward(program); - }, - R"DOC( + m.def( + "prune_backward", + [](const framework::ProgramDesc &program) { + return PruneBackward(program); + }, + R"DOC( Prune the backward part of a program, mostly called in program.clone(for_test=True). @@ -2790,8 +2809,8 @@ All parameter, weight, gradient are variables in Paddle. .def("outputs", [](const OperatorBase &op) -> std::map> { - return op.Outputs(); - }) + return op.Outputs(); + }) .def("output_vars", [](const OperatorBase &op) { return op.OutputVars(true); }) .def("inputs", [](const OperatorBase &op) { return op.Inputs(); }) @@ -2806,11 +2825,12 @@ All parameter, weight, gradient are variables in Paddle. py::class_>( m, "TrainerBase") - .def("get_worker_scope", - [](TrainerBase &self, int thread_id) -> Scope * { - return self.GetWorkerScope(thread_id); - }, - py::return_value_policy::reference) + .def( + "get_worker_scope", + [](TrainerBase &self, int thread_id) -> Scope * { + return self.GetWorkerScope(thread_id); + }, + py::return_value_policy::reference) .def("finalize", &TrainerBase::Finalize) .def("ResetDataset", &TrainerBase::ResetDataset); @@ -3010,21 +3030,23 @@ All parameter, weight, gradient are variables in Paddle. 
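The run_cmd and shell_execute_cmd bindings just below pair each parameter with py::arg(...) and a default value, so Python callers get proper keyword arguments. A hedged sketch of the mechanism, with a placeholder body standing in for the real shell helper:

#include <pybind11/pybind11.h>

#include <string>

namespace py = pybind11;

// Placeholder body; the real binding shells out to run the command.
std::string RunCmd(const std::string& cmd, int time_out, int sleep_inter) {
  (void)time_out;
  (void)sleep_inter;
  return cmd;
}

PYBIND11_MODULE(example, m) {
  // Callable from Python as run_cmd("ls") or run_cmd("ls", time_out=5).
  m.def("run_cmd", &RunCmd, py::arg("cmd"), py::arg("time_out") = -1,
        py::arg("sleep_inter") = -1);
}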
m.def("device_memory_stat_current_value", memory::DeviceMemoryStatCurrentValue); m.def("device_memory_stat_peak_value", memory::DeviceMemoryStatPeakValue); - m.def("run_cmd", - [](const std::string &cmd, int time_out = -1, - int sleep_inter = -1) -> const std::string { - return paddle::framework::shell_get_command_output(cmd, time_out, - sleep_inter); - }, - py::arg("cmd"), py::arg("time_out") = -1, py::arg("sleep_inter") = -1); - m.def("shell_execute_cmd", - [](const std::string &cmd, int time_out = 0, int sleep_inter = 0, - bool redirect_stderr = false) -> std::vector { - return paddle::framework::shell_execute_cmd( - cmd, time_out, sleep_inter, redirect_stderr); - }, - py::arg("cmd"), py::arg("time_out") = 0, py::arg("sleep_inter") = 0, - py::arg("redirect_stderr") = false); + m.def( + "run_cmd", + [](const std::string &cmd, int time_out = -1, + int sleep_inter = -1) -> const std::string { + return paddle::framework::shell_get_command_output(cmd, time_out, + sleep_inter); + }, + py::arg("cmd"), py::arg("time_out") = -1, py::arg("sleep_inter") = -1); + m.def( + "shell_execute_cmd", + [](const std::string &cmd, int time_out = 0, int sleep_inter = 0, + bool redirect_stderr = false) -> std::vector { + return paddle::framework::shell_execute_cmd(cmd, time_out, sleep_inter, + redirect_stderr); + }, + py::arg("cmd"), py::arg("time_out") = 0, py::arg("sleep_inter") = 0, + py::arg("redirect_stderr") = false); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool { @@ -3092,9 +3114,10 @@ All parameter, weight, gradient are variables in Paddle. pylodtensorarray .def("__init__", [](LoDTensorArray &instance) { new (&instance) LoDTensorArray(); }) - .def("__getitem__", - [](LoDTensorArray &self, size_t i) { return &self.at(i); }, - py::return_value_policy::reference) + .def( + "__getitem__", + [](LoDTensorArray &self, size_t i) { return &self.at(i); }, + py::return_value_policy::reference) .def("__len__", [](LoDTensorArray &self) { return self.size(); }) .def("__setitem__", [](LoDTensorArray &self, size_t i, const LoDTensor &t) { @@ -3105,13 +3128,14 @@ All parameter, weight, gradient are variables in Paddle. self[i].ShareDataWith(t); self[i].set_lod(t.lod()); }) - .def("append", - [](LoDTensorArray &self, const LoDTensor &t) { - self.emplace_back(); - self.back().ShareDataWith(t); - self.back().set_lod(t.lod()); - }, - py::arg("tensor"), R"DOC( + .def( + "append", + [](LoDTensorArray &self, const LoDTensor &t) { + self.emplace_back(); + self.back().ShareDataWith(t); + self.back().set_lod(t.lod()); + }, + py::arg("tensor"), R"DOC( Append a LoDensor to LoDTensorArray. Args: @@ -3131,89 +3155,94 @@ All parameter, weight, gradient are variables in Paddle. t.set(np.ndarray([5, 30]), fluid.CPUPlace()) arr.append(t) )DOC") - .def("_move_to_list", - [](LoDTensorArray &self) -> py::list { - py::list res(self.size()); - for (size_t i = 0; i < self.size(); ++i) { - res[i] = py::cast(std::move(self[i])); - } - self.clear(); - return res; - }, - py::return_value_policy::take_ownership); + .def( + "_move_to_list", + [](LoDTensorArray &self) -> py::list { + py::list res(self.size()); + for (size_t i = 0; i < self.size(); ++i) { + res[i] = py::cast(std::move(self[i])); + } + self.clear(); + return res; + }, + py::return_value_policy::take_ownership); py::class_(m, "FetchList", R"DOC( FetchList is a vector of boost::variant. 
)DOC") - .def("_move_to_list", - [](FetchList &self) -> py::list { - py::list res(self.size()); - for (size_t i = 0; i < self.size(); ++i) { - if (data_is_lod_tensor(self[i])) { - auto &data = BOOST_GET(LoDTensor, self[i]); - res[i] = py::cast(std::move(data)); - } else { - auto &data = BOOST_GET(LoDTensorArray, self[i]); - py::list tmp(data.size()); - for (size_t j = 0; j < data.size(); ++j) { - tmp[j] = py::cast(std::move(data[j])); - } - res[i] = std::move(tmp); - } - } - self.clear(); - return res; - }, - py::return_value_policy::take_ownership) + .def( + "_move_to_list", + [](FetchList &self) -> py::list { + py::list res(self.size()); + for (size_t i = 0; i < self.size(); ++i) { + if (data_is_lod_tensor(self[i])) { + auto &data = BOOST_GET(LoDTensor, self[i]); + res[i] = py::cast(std::move(data)); + } else { + auto &data = BOOST_GET(LoDTensorArray, self[i]); + py::list tmp(data.size()); + for (size_t j = 0; j < data.size(); ++j) { + tmp[j] = py::cast(std::move(data[j])); + } + res[i] = std::move(tmp); + } + } + self.clear(); + return res; + }, + py::return_value_policy::take_ownership) - .def("append", - [](FetchList &self, const LoDTensor &t) { - self.emplace_back(); - auto &lod_tensor = BOOST_GET(LoDTensor, self.back()); - lod_tensor.ShareDataWith(t); - lod_tensor.set_lod(t.lod()); - }, - py::arg("var")) - - .def("append", - [](FetchList &self, const LoDTensorArray &t) { - self.emplace_back(); - auto &lod_tensor_array = BOOST_GET(LoDTensorArray, self.back()); - for (size_t i = 0; i < t.size(); ++i) { - lod_tensor_array[i].ShareDataWith(t[i]); - lod_tensor_array[i].set_lod(t[i].lod()); - } - }, - py::arg("var")); + .def( + "append", + [](FetchList &self, const LoDTensor &t) { + self.emplace_back(); + auto &lod_tensor = BOOST_GET(LoDTensor, self.back()); + lod_tensor.ShareDataWith(t); + lod_tensor.set_lod(t.lod()); + }, + py::arg("var")) + + .def( + "append", + [](FetchList &self, const LoDTensorArray &t) { + self.emplace_back(); + auto &lod_tensor_array = BOOST_GET(LoDTensorArray, self.back()); + for (size_t i = 0; i < t.size(); ++i) { + lod_tensor_array[i].ShareDataWith(t[i]); + lod_tensor_array[i].set_lod(t[i].lod()); + } + }, + py::arg("var")); py::class_(m, "FetchUnmergedList", R"DOC( FetchUnmergedList is 2-D array of FetchType(boost::variant(LoDTensor, LoDTensorArray)). 
)DOC") - .def("_move_to_list", - [](FetchUnmergedList &self) -> py::list { - py::list res(self.size()); - for (size_t i = 0; i < self.size(); ++i) { - py::list tmp(self[i].size()); - for (size_t j = 0; j < self[i].size(); ++j) { - if (data_is_lod_tensor(self[i][j])) { - auto &var = BOOST_GET(LoDTensor, self[i][j]); - tmp[j] = py::cast(std::move(var)); - } else { - auto &var = BOOST_GET(LoDTensorArray, self[i][j]); - py::list tmp_array(var.size()); - for (size_t k = 0; k < var.size(); ++k) { - tmp_array[k] = std::move(var[k]); - } - tmp[j] = std::move(tmp_array); - } - } - res[i] = std::move(tmp); - self[i].clear(); - } - self.clear(); - return res; - }, - py::return_value_policy::take_ownership); + .def( + "_move_to_list", + [](FetchUnmergedList &self) -> py::list { + py::list res(self.size()); + for (size_t i = 0; i < self.size(); ++i) { + py::list tmp(self[i].size()); + for (size_t j = 0; j < self[i].size(); ++j) { + if (data_is_lod_tensor(self[i][j])) { + auto &var = BOOST_GET(LoDTensor, self[i][j]); + tmp[j] = py::cast(std::move(var)); + } else { + auto &var = BOOST_GET(LoDTensorArray, self[i][j]); + py::list tmp_array(var.size()); + for (size_t k = 0; k < var.size(); ++k) { + tmp_array[k] = std::move(var[k]); + } + tmp[j] = std::move(tmp_array); + } + } + res[i] = std::move(tmp); + self[i].clear(); + } + self.clear(); + return res; + }, + py::return_value_policy::take_ownership); m.def("op_support_gpu", OpSupportGPU); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -3227,11 +3256,12 @@ All parameter, weight, gradient are variables in Paddle. } platform::EmptyCache(); }); - m.def("get_device_properties", - [](int id) -> const gpuDeviceProp & { - return platform::GetDeviceProperties(id); - }, - py::return_value_policy::copy); + m.def( + "get_device_properties", + [](int id) -> const gpuDeviceProp & { + return platform::GetDeviceProperties(id); + }, + py::return_value_policy::copy); py::class_(m, "_gpuDeviceProperties") .def_property_readonly( @@ -3409,15 +3439,16 @@ All parameter, weight, gradient are variables in Paddle. profiler->Prepare(); }) .def("start", &paddle::platform::Profiler::Start) - .def("stop", - [](paddle::platform::Profiler *profiler) { - platform::DisableHostEventRecorder(); - auto result = profiler->Stop(); - framework::StaticGraphExecutorPerfStatistics( - result->GetNodeTrees()); - return result; - }, - py::return_value_policy::automatic_reference); + .def( + "stop", + [](paddle::platform::Profiler *profiler) { + platform::DisableHostEventRecorder(); + auto result = profiler->Stop(); + framework::StaticGraphExecutorPerfStatistics( + result->GetNodeTrees()); + return result; + }, + py::return_value_policy::automatic_reference); py::class_(m, "ProfilerOptions") .def(py::init<>()) @@ -3666,11 +3697,12 @@ All parameter, weight, gradient are variables in Paddle. }, R"DOC(This config that the this is distributed training with parameter server )DOC") - .def_property("_dry_run", - [](const ExecutionStrategy &self) { return self.dry_run_; }, - [](ExecutionStrategy &self, bool dry_run) { - self.dry_run_ = dry_run; - }); + .def_property( + "_dry_run", + [](const ExecutionStrategy &self) { return self.dry_run_; }, + [](ExecutionStrategy &self, bool dry_run) { + self.dry_run_ = dry_run; + }); exec_strategy.def_property( "use_experimental_executor", @@ -3918,11 +3950,12 @@ All parameter, weight, gradient are variables in Paddle. 
const std::vector &trainers_endpoints) { self.trainers_endpoints_ = trainers_endpoints; }) - .def_property("trainer_id", - [](const BuildStrategy &self) { return self.trainer_id_; }, - [](BuildStrategy &self, int trainer_id) { - self.trainer_id_ = trainer_id; - }) + .def_property( + "trainer_id", + [](const BuildStrategy &self) { return self.trainer_id_; }, + [](BuildStrategy &self, int trainer_id) { + self.trainer_id_ = trainer_id; + }) .def_property( "nccl_comm_num", [](const BuildStrategy &self) { return self.nccl_comm_num_; }, @@ -3935,20 +3968,22 @@ All parameter, weight, gradient are variables in Paddle. [](BuildStrategy &self, int bkcl_comm_num) { self.bkcl_comm_num_ = bkcl_comm_num; }) - .def_property("use_hierarchical_allreduce", - [](const BuildStrategy &self) { - return self.use_hierarchical_allreduce_; - }, - [](BuildStrategy &self, bool use) { - self.use_hierarchical_allreduce_ = use; - }) - .def_property("hierarchical_allreduce_inter_nranks", - [](const BuildStrategy &self) { - return self.hierarchical_allreduce_inter_nranks_; - }, - [](BuildStrategy &self, int nranks) { - self.hierarchical_allreduce_inter_nranks_ = nranks; - }) + .def_property( + "use_hierarchical_allreduce", + [](const BuildStrategy &self) { + return self.use_hierarchical_allreduce_; + }, + [](BuildStrategy &self, bool use) { + self.use_hierarchical_allreduce_ = use; + }) + .def_property( + "hierarchical_allreduce_inter_nranks", + [](const BuildStrategy &self) { + return self.hierarchical_allreduce_inter_nranks_; + }, + [](BuildStrategy &self, int nranks) { + self.hierarchical_allreduce_inter_nranks_ = nranks; + }) .def_property( "fuse_elewise_add_act_ops", @@ -4107,19 +4142,20 @@ All parameter, weight, gradient are variables in Paddle. build_strategy = static.BuildStrategy() build_strategy.fuse_relu_depthwise_conv = True )DOC") - .def_property("fuse_broadcast_ops", - [](const BuildStrategy &self) { - return self.fuse_broadcast_ops_ == true || - self.fuse_broadcast_ops_ == paddle::none; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, " - "cannot be configured again.")); - self.fuse_broadcast_ops_ = b; - }, - R"DOC((bool, optional): fuse_broadcast_op indicates whether + .def_property( + "fuse_broadcast_ops", + [](const BuildStrategy &self) { + return self.fuse_broadcast_ops_ == true || + self.fuse_broadcast_ops_ == paddle::none; + }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, " + "cannot be configured again.")); + self.fuse_broadcast_ops_ = b; + }, + R"DOC((bool, optional): fuse_broadcast_op indicates whether to fuse the broadcast ops. Note that, in Reduce mode, fusing broadcast ops may make the program faster. Because fusing broadcast OP equals delaying the execution of all @@ -4137,18 +4173,19 @@ All parameter, weight, gradient are variables in Paddle. 
build_strategy = static.BuildStrategy() build_strategy.fuse_broadcast_ops = True )DOC") - .def_property("fuse_all_optimizer_ops", - [](const BuildStrategy &self) { - return self.fuse_all_optimizer_ops_ == true || - self.fuse_all_optimizer_ops_ == paddle::none; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, " - "cannot be configured again.")); - self.fuse_all_optimizer_ops_ = b; - }) + .def_property( + "fuse_all_optimizer_ops", + [](const BuildStrategy &self) { + return self.fuse_all_optimizer_ops_ == true || + self.fuse_all_optimizer_ops_ == paddle::none; + }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, " + "cannot be configured again.")); + self.fuse_all_optimizer_ops_ = b; + }) .def_property( "sync_batch_norm", [](const BuildStrategy &self) { return self.sync_batch_norm_; }, @@ -4231,9 +4268,10 @@ All parameter, weight, gradient are variables in Paddle. self.is_distribution_ = b; #endif }) - .def_property("async_mode", - [](const BuildStrategy &self) { return self.async_mode_; }, - [](BuildStrategy &self, bool b) { self.async_mode_ = b; }) + .def_property( + "async_mode", + [](const BuildStrategy &self) { return self.async_mode_; }, + [](BuildStrategy &self, bool b) { self.async_mode_ = b; }) .def_property( "enable_inplace", [](const BuildStrategy &self) { return self.enable_inplace_; }, @@ -4249,13 +4287,14 @@ All parameter, weight, gradient are variables in Paddle. self.fuse_all_reduce_ops_ == paddle::none; }, [](BuildStrategy &self, bool b) { self.fuse_all_reduce_ops_ = b; }) - .def_property("enable_backward_optimizer_op_deps", - [](const BuildStrategy &self) { - return self.enable_backward_optimizer_op_deps_; - }, - [](BuildStrategy &self, bool b) { - self.enable_backward_optimizer_op_deps_ = b; - }) + .def_property( + "enable_backward_optimizer_op_deps", + [](const BuildStrategy &self) { + return self.enable_backward_optimizer_op_deps_; + }, + [](BuildStrategy &self, bool b) { + self.enable_backward_optimizer_op_deps_ = b; + }) .def_property( "cache_runtime_context", [](const BuildStrategy &self) { return self.cache_runtime_context_; }, @@ -4275,24 +4314,26 @@ All parameter, weight, gradient are variables in Paddle. [](BuildStrategy &self, bool fix_op_run_order) { self.fix_op_run_order_ = fix_op_run_order; }) - .def_property("allow_cuda_graph_capture", - [](const BuildStrategy &self) { - return self.allow_cuda_graph_capture_; - }, - [](BuildStrategy &self, bool allow_cuda_graph_capture) { - self.allow_cuda_graph_capture_ = allow_cuda_graph_capture; - }) + .def_property( + "allow_cuda_graph_capture", + [](const BuildStrategy &self) { + return self.allow_cuda_graph_capture_; + }, + [](BuildStrategy &self, bool allow_cuda_graph_capture) { + self.allow_cuda_graph_capture_ = allow_cuda_graph_capture; + }) .def("_copy", [](const BuildStrategy &self) { auto new_bs = self; new_bs.ClearFinalized(); return new_bs; }) - .def("_finalize_strategy_and_create_passes", - [](BuildStrategy &self) -> std::shared_ptr { - return self.CreatePassesFromStrategy(true); - }, - R"DOC(Allow user to customized passes. Normally model-specific + .def( + "_finalize_strategy_and_create_passes", + [](BuildStrategy &self) -> std::shared_ptr { + return self.CreatePassesFromStrategy(true); + }, + R"DOC(Allow user to customized passes. 
Normally model-specific optimization passes should be defined in this way. BuildStrategy cannot be updated after being finalized.)DOC"); @@ -4310,11 +4351,12 @@ All parameter, weight, gradient are variables in Paddle. // We still cannot get local_scope from this vector, since the element // of vec will be freed by Python GC. We can only return Scope* // one by one and mark them as reference. - .def("local_scopes", - [](ParallelExecutor &self) -> std::vector * { - return &self.GetLocalScopes(); - }, - py::return_value_policy::reference) + .def( + "local_scopes", + [](ParallelExecutor &self) -> std::vector * { + return &self.GetLocalScopes(); + }, + py::return_value_policy::reference) .def("drop_local_exe_scopes", &ParallelExecutor::DropLocalExeScopes) .def("_need_create_local_exe_scopes", &ParallelExecutor::NeedCreateLocalExeScope) @@ -4346,12 +4388,13 @@ All parameter, weight, gradient are variables in Paddle. std::unique_ptr>( m, "IpuBackend") // manage IpuBackend in C++ - .def("get_instance", - []() { - return std::unique_ptr( - platform::ipu::IpuBackend::GetInstance()); - }, - py::return_value_policy::reference) + .def( + "get_instance", + []() { + return std::unique_ptr( + platform::ipu::IpuBackend::GetInstance()); + }, + py::return_value_policy::reference) .def("weights_to_host", &platform::ipu::IpuBackend::WeightsToHost) .def("detach", &platform::ipu::IpuBackend::Detach) .def("reset", &platform::ipu::IpuBackend::Reset) diff --git a/paddle/fluid/pybind/reader_py.cc b/paddle/fluid/pybind/reader_py.cc index e0aab0dd06e..3e779ba41c0 100644 --- a/paddle/fluid/pybind/reader_py.cc +++ b/paddle/fluid/pybind/reader_py.cc @@ -13,12 +13,14 @@ // limitations under the License. #include "paddle/fluid/pybind/reader_py.h" + #include #include #include #include #include #include + #include "Python.h" #include "boost/optional.hpp" #include "gflags/gflags.h" @@ -337,32 +339,33 @@ void BindMultiDeviceReader(py::module *module, const char *reader_name) { py::call_guard()) .def("read_next_list", &ReaderType::ReadNextList, py::call_guard()) - .def("read_next_var_list", - [](ReaderType &self) { - auto result_list = self.ReadNextList(); - auto &tensor_list = result_list[0]; - std::vector> var_list; - var_list.reserve(tensor_list.size()); - auto func = [](framework::LoDTensor &lod_tensor) { - std::string act_name = - imperative::GetCurrentTracer()->GenerateUniqueName( - "generated_var"); - auto new_var = std::make_shared(act_name); - new_var->SetPersistable(false); - new_var->SetType(framework::proto::VarType::LOD_TENSOR); - new_var->SetDataType( - framework::TransToProtoVarType(lod_tensor.dtype())); - auto *tensor = - new_var->MutableVar()->GetMutable(); - *tensor = std::move(lod_tensor); - return new_var; - }; - for (auto &tensor : tensor_list) { - var_list.emplace_back(func(tensor)); - } - return var_list; - }, - py::call_guard()) + .def( + "read_next_var_list", + [](ReaderType &self) { + auto result_list = self.ReadNextList(); + auto &tensor_list = result_list[0]; + std::vector> var_list; + var_list.reserve(tensor_list.size()); + auto func = [](framework::LoDTensor &lod_tensor) { + std::string act_name = + imperative::GetCurrentTracer()->GenerateUniqueName( + "generated_var"); + auto new_var = std::make_shared(act_name); + new_var->SetPersistable(false); + new_var->SetType(framework::proto::VarType::LOD_TENSOR); + new_var->SetDataType( + framework::TransToProtoVarType(lod_tensor.dtype())); + auto *tensor = + new_var->MutableVar()->GetMutable(); + *tensor = std::move(lod_tensor); + return new_var; + }; + 
for (auto &tensor : tensor_list) { + var_list.emplace_back(func(tensor)); + } + return var_list; + }, + py::call_guard()) .def("reset", &ReaderType::Reset, py::call_guard()) .def("shutdown", &ReaderType::Shutdown, @@ -372,34 +375,35 @@ void BindMultiDeviceReader(py::module *module, const char *reader_name) { void BindReader(py::module *module) { auto &m = *module; - m.def("diff_tensor_shape", [](const framework::LoDTensor &tensor, - const framework::VarDesc &var_desc, - size_t num_places) -> py::object { - auto diff = DiffTensorShapeWithVarDesc(tensor, var_desc, num_places); - if (diff) { - return py::cast(std::move(diff.get())); - } else { - return py::cast(nullptr); - } - }); - - m.def("init_lod_tensor_blocking_queue", - [](framework::Variable &var, size_t capacity, - bool is_ordered) -> py::object { - VLOG(1) << "init_lod_tensor_blocking_queue"; - if (is_ordered) { - auto *holder = var.GetMutable< - reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder>(); - holder->InitOnce(capacity, FLAGS_reader_queue_speed_test_mode); - return py::cast(holder->GetQueue()); - } else { - auto *holder = - var.GetMutable(); - holder->InitOnce(capacity, FLAGS_reader_queue_speed_test_mode); - return py::cast(holder->GetQueue()); - } - }, - py::return_value_policy::copy); + m.def( + "diff_tensor_shape", + [](const framework::LoDTensor &tensor, const framework::VarDesc &var_desc, + size_t num_places) -> py::object { + auto diff = DiffTensorShapeWithVarDesc(tensor, var_desc, num_places); + if (diff) { + return py::cast(std::move(diff.get())); + } else { + return py::cast(nullptr); + } + }); + + m.def( + "init_lod_tensor_blocking_queue", + [](framework::Variable &var, size_t capacity, + bool is_ordered) -> py::object { + VLOG(1) << "init_lod_tensor_blocking_queue"; + if (is_ordered) { + auto *holder = var.GetMutable< + reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder>(); + holder->InitOnce(capacity, FLAGS_reader_queue_speed_test_mode); + return py::cast(holder->GetQueue()); + } else { + auto *holder = var.GetMutable(); + holder->InitOnce(capacity, FLAGS_reader_queue_speed_test_mode); + return py::cast(holder->GetQueue()); + } + }, + py::return_value_policy::copy); py::class_(m, "Reader", "") .def("start", &framework::ReaderHolder::Start) @@ -408,12 +412,13 @@ void BindReader(py::module *module) { py::class_>( m, "LoDTensorBlockingQueue", "") - .def("push", - [](reader::LoDTensorBlockingQueue &self, - const std::vector &lod_tensor_vec) { - return self.Push(lod_tensor_vec); - }, - py::call_guard()) + .def( + "push", + [](reader::LoDTensorBlockingQueue &self, + const std::vector &lod_tensor_vec) { + return self.Push(lod_tensor_vec); + }, + py::call_guard()) .def("size", &reader::LoDTensorBlockingQueue::Size) .def("capacity", &reader::LoDTensorBlockingQueue::Cap) .def("close", &reader::LoDTensorBlockingQueue::Close) @@ -424,12 +429,13 @@ void BindReader(py::module *module) { py::class_>( m, "OrderedMultiDeviceLoDTensorBlockingQueue", "") - .def("push", - [](reader::OrderedMultiDeviceLoDTensorBlockingQueue &self, - const std::vector &lod_tensor_vec) { - return self.Push(lod_tensor_vec); - }, - py::call_guard()) + .def( + "push", + [](reader::OrderedMultiDeviceLoDTensorBlockingQueue &self, + const std::vector &lod_tensor_vec) { + return self.Push(lod_tensor_vec); + }, + py::call_guard()) .def("size", &reader::OrderedMultiDeviceLoDTensorBlockingQueue::Size) .def("capacity", &reader::OrderedMultiDeviceLoDTensorBlockingQueue::Cap) .def("close", &reader::OrderedMultiDeviceLoDTensorBlockingQueue::Close) @@ -444,19 
+450,20 @@ void BindReader(py::module *module) { BindMultiDeviceReader( module, "OrderedMultiDeviceFeedReader"); - m.def("create_py_reader", - [](const std::shared_ptr &queue, - const std::vector &names, - const std::vector> &shapes, - const std::vector &dtypes, - const std::vector &need_check_feed, - const std::vector &dst_places, - bool use_double_buffer, bool drop_last, bool pin_memory) { - return new MultiDeviceFeedReader( - queue, names, shapes, dtypes, need_check_feed, dst_places, - use_double_buffer, drop_last, pin_memory); - }, - py::return_value_policy::take_ownership); + m.def( + "create_py_reader", + [](const std::shared_ptr &queue, + const std::vector &names, + const std::vector> &shapes, + const std::vector &dtypes, + const std::vector &need_check_feed, + const std::vector &dst_places, bool use_double_buffer, + bool drop_last, bool pin_memory) { + return new MultiDeviceFeedReader( + queue, names, shapes, dtypes, need_check_feed, dst_places, + use_double_buffer, drop_last, pin_memory); + }, + py::return_value_policy::take_ownership); m.def( "create_py_reader", diff --git a/paddle/fluid/pybind/slice_utils.h b/paddle/fluid/pybind/slice_utils.h index add332abd30..109f3e5705b 100644 --- a/paddle/fluid/pybind/slice_utils.h +++ b/paddle/fluid/pybind/slice_utils.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/scope_guard.h" #include "paddle/fluid/operators/utils.h" diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 63b36bd9173..ed7ce64032b 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -15,12 +15,14 @@ limitations under the License. */ #pragma once #include + #include #include #include #include #include #include + #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/memory/memcpy.h" diff --git a/paddle/fluid/pybind/uva_utils.h b/paddle/fluid/pybind/uva_utils.h index 94f55769b73..3ea3d7ee1a7 100644 --- a/paddle/fluid/pybind/uva_utils.h +++ b/paddle/fluid/pybind/uva_utils.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/fluid/operators/utils.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/compat/convert_utils.h" diff --git a/paddle/fluid/string/pretty_log.h b/paddle/fluid/string/pretty_log.h index 45fe89e8b5b..d161b2a912f 100644 --- a/paddle/fluid/string/pretty_log.h +++ b/paddle/fluid/string/pretty_log.h @@ -17,6 +17,6 @@ #include #include #include -#include "gflags/gflags.h" +#include "gflags/gflags.h" #include "paddle/utils/string/pretty_log.h" diff --git a/paddle/infrt/api/infrt_api.cc b/paddle/infrt/api/infrt_api.cc index f0bf46567a5..2f4bbd5df35 100644 --- a/paddle/infrt/api/infrt_api.cc +++ b/paddle/infrt/api/infrt_api.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
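The queue push bindings above run under py::call_guard<py::gil_scoped_release>, which drops the Python GIL for the duration of the call so a Push that blocks on a full queue cannot stall other Python threads. A hedged sketch of the idiom; SimpleQueue is illustrative, not Paddle's queue type:

#include <pybind11/pybind11.h>

#include <deque>
#include <mutex>

namespace py = pybind11;

class SimpleQueue {
 public:
  // Stand-in for a call that may block; the GIL must not be held here.
  void Push(int v) {
    std::lock_guard<std::mutex> lock(mu_);
    data_.push_back(v);
  }

 private:
  std::mutex mu_;
  std::deque<int> data_;
};

PYBIND11_MODULE(example, m) {
  py::class_<SimpleQueue>(m, "SimpleQueue")
      .def(py::init<>())
      .def("push", &SimpleQueue::Push,
           // Release the GIL while Push runs, re-acquire on return.
           py::call_guard<py::gil_scoped_release>());
}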
+// clang-format off #include "paddle/infrt/api/infrt_api.h" #include @@ -61,6 +62,7 @@ #include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h" #include "paddle/infrt/dialect/tensorrt/trt_type_convert_pass.h" #endif +// clang-format on using namespace infrt::host_context; // NOLINT using namespace infrt::tensor; // NOLINT diff --git a/paddle/infrt/backends/host/phi_context.h b/paddle/infrt/backends/host/phi_context.h index 2af1fab1008..880d1f03d87 100644 --- a/paddle/infrt/backends/host/phi_context.h +++ b/paddle/infrt/backends/host/phi_context.h @@ -35,12 +35,12 @@ class CpuPhiContext : public ::phi::CPUContext { class GpuPhiContext : public ::phi::GPUContext { public: using Base = ::phi::GPUContext; - using ::phi::GPUContext::SetStream; - using ::phi::GPUContext::SetEigenDevice; using ::phi::GPUContext::SetBlasHandle; using ::phi::GPUContext::SetDnnHandle; + using ::phi::GPUContext::SetEigenDevice; using ::phi::GPUContext::SetSolverHandle; using ::phi::GPUContext::SetSparseHandle; + using ::phi::GPUContext::SetStream; }; } // namespace backends diff --git a/paddle/infrt/backends/tensorrt/plugin/pool_op_plugin.cu b/paddle/infrt/backends/tensorrt/plugin/pool_op_plugin.cu index 5a53777c8e3..f3e2fe35074 100644 --- a/paddle/infrt/backends/tensorrt/plugin/pool_op_plugin.cu +++ b/paddle/infrt/backends/tensorrt/plugin/pool_op_plugin.cu @@ -199,8 +199,8 @@ bool PoolPlugin::isOutputBroadcastAcrossBatch(int32_t outputIndex, return false; } -bool PoolPlugin::canBroadcastInputAcrossBatch(int32_t inputIndex) const - noexcept { +bool PoolPlugin::canBroadcastInputAcrossBatch( + int32_t inputIndex) const noexcept { return false; } diff --git a/paddle/infrt/backends/tensorrt/plugin/pool_op_plugin.h b/paddle/infrt/backends/tensorrt/plugin/pool_op_plugin.h index 0da1d158453..34189f95438 100644 --- a/paddle/infrt/backends/tensorrt/plugin/pool_op_plugin.h +++ b/paddle/infrt/backends/tensorrt/plugin/pool_op_plugin.h @@ -114,10 +114,10 @@ class PoolPlugin : public nvinfer1::IPluginV2IOExt { char const* getPluginNamespace() const noexcept override; // IPluginV2Ext methods - nvinfer1::DataType getOutputDataType(int32_t index, - nvinfer1::DataType const* inputTypes, - int32_t nbInputs) const - noexcept override; + nvinfer1::DataType getOutputDataType( + int32_t index, + nvinfer1::DataType const* inputTypes, + int32_t nbInputs) const noexcept override; bool isOutputBroadcastAcrossBatch(int32_t outputIndex, bool const* inputIsBroadcasted, int32_t nbInputs) const noexcept override; diff --git a/paddle/infrt/backends/tensorrt/test_trt_engine.cc b/paddle/infrt/backends/tensorrt/test_trt_engine.cc index 89dd3b0dc7a..7e081362f9c 100644 --- a/paddle/infrt/backends/tensorrt/test_trt_engine.cc +++ b/paddle/infrt/backends/tensorrt/test_trt_engine.cc @@ -12,13 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include - #include #include #include #include #include +#include + #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" diff --git a/paddle/infrt/backends/tensorrt/trt_engine.cc b/paddle/infrt/backends/tensorrt/trt_engine.cc index a2d49546189..a539078e4af 100644 --- a/paddle/infrt/backends/tensorrt/trt_engine.cc +++ b/paddle/infrt/backends/tensorrt/trt_engine.cc @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/phi/backends/dynload/tensorrt.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/core/ddim.h" diff --git a/paddle/infrt/backends/tensorrt/trt_engine.h b/paddle/infrt/backends/tensorrt/trt_engine.h index 41d11a71117..44f36a84cb5 100644 --- a/paddle/infrt/backends/tensorrt/trt_engine.h +++ b/paddle/infrt/backends/tensorrt/trt_engine.h @@ -17,6 +17,7 @@ #include #include + #include "paddle/infrt/backends/tensorrt/trt_options.h" #include "paddle/infrt/backends/tensorrt/trt_utils.h" #include "paddle/phi/backends/dynload/tensorrt.h" diff --git a/paddle/infrt/backends/tensorrt/trt_options.h b/paddle/infrt/backends/tensorrt/trt_options.h index d5190f5e622..b4e36da2058 100644 --- a/paddle/infrt/backends/tensorrt/trt_options.h +++ b/paddle/infrt/backends/tensorrt/trt_options.h @@ -15,12 +15,12 @@ #pragma once +#include + #include #include #include -#include - namespace infrt { namespace backends { namespace tensorrt { diff --git a/paddle/infrt/common/global.h b/paddle/infrt/common/global.h index e6586cb3a3c..2d7735d5252 100644 --- a/paddle/infrt/common/global.h +++ b/paddle/infrt/common/global.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/infrt/tensor/dense_host_tensor.h" namespace infrt { diff --git a/paddle/infrt/common/memory.h b/paddle/infrt/common/memory.h index 678529b8b78..643b2147761 100644 --- a/paddle/infrt/common/memory.h +++ b/paddle/infrt/common/memory.h @@ -15,9 +15,9 @@ #pragma once #include -#include #include +#include #include "paddle/infrt/common/macros.h" #include "paddle/infrt/common/target.h" diff --git a/paddle/infrt/dialect/dense_tensor.h b/paddle/infrt/dialect/dense_tensor.h index 7fbd1e8a4ef..8dec818a80a 100644 --- a/paddle/infrt/dialect/dense_tensor.h +++ b/paddle/infrt/dialect/dense_tensor.h @@ -13,6 +13,8 @@ // limitations under the License. #pragma once + +// clang-format off #include #include #include @@ -25,3 +27,4 @@ #define GET_OP_CLASSES #include "paddle/infrt/dialect/dense_tensor.hpp.inc" +// clang-format on diff --git a/paddle/infrt/dialect/diagnostic_utils.cc b/paddle/infrt/dialect/diagnostic_utils.cc index 4151001067e..8785ce69b8e 100644 --- a/paddle/infrt/dialect/diagnostic_utils.cc +++ b/paddle/infrt/dialect/diagnostic_utils.cc @@ -15,6 +15,7 @@ #include "paddle/infrt/dialect/diagnostic_utils.h" #include + #include namespace infrt { diff --git a/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc index c4f20cb4d35..0e3a10270cd 100644 --- a/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc +++ b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
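Several infrt files in this patch, infrt_api.cc above and infrt_dialect.cc just below, fence order-sensitive spans with // clang-format off and // clang-format on so the formatter's include sorting cannot reorder headers whose inclusion order matters (typically generated .inc headers that depend on what was included before them). A hedged, compilable illustration with standard headers standing in for the order-sensitive ones:

#include <cstdio>

// clang-format off
#include <cstdint>  // deliberately kept ahead of <cassert>; the guard stops
#include <cassert>  // SortIncludes from rearranging anything in this span
// clang-format on

int main() {
  assert(sizeof(std::int32_t) == 4);
  std::printf("guarded includes left untouched\n");
  return 0;
}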
+// clang-format off #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include @@ -60,6 +61,7 @@ void InfrtDialect::initialize() { #include "paddle/infrt/dialect/infrt/ir/test_kernels.cpp.inc" >(); } +// clang-format on /// Parse a type registered to this dialect. mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const { diff --git a/paddle/infrt/dialect/infrt/ir/infrt_dialect.h b/paddle/infrt/dialect/infrt/ir/infrt_dialect.h index e2e9b9348eb..5a7c45b3205 100644 --- a/paddle/infrt/dialect/infrt/ir/infrt_dialect.h +++ b/paddle/infrt/dialect/infrt/ir/infrt_dialect.h @@ -23,8 +23,8 @@ #include #include #include -#include "paddle/infrt/dialect/infrt/common/types.h" +#include "paddle/infrt/dialect/infrt/common/types.h" #include "paddle/infrt/dialect/infrt/ir/infrt_opsDialect.h.inc" #define GET_TYPEDEF_CLASSES #include "paddle/infrt/dialect/infrt/ir/infrt_opsTypes.h.inc" diff --git a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc index 63be5ca9095..309e0f8b940 100644 --- a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc +++ b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc @@ -15,6 +15,7 @@ #include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h" #include + #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/pd/ir/pd_ops.h" namespace { diff --git a/paddle/infrt/dialect/init_dialects.cc b/paddle/infrt/dialect/init_dialects.cc index 8da34bd404b..c204f9ea626 100644 --- a/paddle/infrt/dialect/init_dialects.cc +++ b/paddle/infrt/dialect/init_dialects.cc @@ -19,12 +19,10 @@ #include "paddle/infrt/dialect/dense_tensor.h" #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" - #include "paddle/infrt/dialect/pd/ir/pd_ops.h" #include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" #include "paddle/infrt/dialect/phi/ir/phi_base.h" #include "paddle/infrt/dialect/phi/ir/phi_kernels.h" - #include "paddle/infrt/dialect/tensor_shape.h" #include "paddle/infrt/dialect/tensorrt/trt_ops.h" diff --git a/paddle/infrt/dialect/mlir_loader.cc b/paddle/infrt/dialect/mlir_loader.cc index 19b8cba12df..ab533a25c41 100644 --- a/paddle/infrt/dialect/mlir_loader.cc +++ b/paddle/infrt/dialect/mlir_loader.cc @@ -20,10 +20,10 @@ #include #include #include -#include #include #include +#include #include #include diff --git a/paddle/infrt/dialect/mlir_loader.h b/paddle/infrt/dialect/mlir_loader.h index 5e50ad9e5a2..b4faba8068e 100644 --- a/paddle/infrt/dialect/mlir_loader.h +++ b/paddle/infrt/dialect/mlir_loader.h @@ -16,9 +16,9 @@ #include #include -#include #include +#include namespace infrt { namespace dialect { diff --git a/paddle/infrt/dialect/opt.cc b/paddle/infrt/dialect/opt.cc index 2006530958f..e57666ffca0 100644 --- a/paddle/infrt/dialect/opt.cc +++ b/paddle/infrt/dialect/opt.cc @@ -14,6 +14,7 @@ #include #include + #include "paddle/infrt/dialect/init_dialects.h" int main(int argc, char **argv) { diff --git a/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.cc b/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.cc index 8bdf957db27..c9247abe695 100644 --- a/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.cc +++ b/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.cc @@ -14,6 +14,7 @@ #include "paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.h" // NOLINT #include + #include "paddle/infrt/dialect/pd/ir/pd_ops.h" namespace { diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h index 
9a92558daab..f7358db5bf3 100644 --- a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h @@ -14,6 +14,7 @@ #pragma once +// clang-format off #include #include #include @@ -37,3 +38,4 @@ // NOLINT #define GET_OP_CLASSES #include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h.inc" +// clang-format on diff --git a/paddle/infrt/dialect/phi/ir/phi_base.cc b/paddle/infrt/dialect/phi/ir/phi_base.cc index 1bd6068d3fb..39a23529ac3 100644 --- a/paddle/infrt/dialect/phi/ir/phi_base.cc +++ b/paddle/infrt/dialect/phi/ir/phi_base.cc @@ -21,6 +21,7 @@ #include #include #include + #include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/phi/ir/infrt_phi_base.cpp.inc" #include "paddle/infrt/dialect/phi/ir/infrt_phi_baseDialect.cpp.inc" diff --git a/paddle/infrt/dialect/phi/ir/phi_base.h b/paddle/infrt/dialect/phi/ir/phi_base.h index 64cd08cc05e..2cbdef5af90 100644 --- a/paddle/infrt/dialect/phi/ir/phi_base.h +++ b/paddle/infrt/dialect/phi/ir/phi_base.h @@ -18,8 +18,8 @@ #include #include -#include "paddle/infrt/dialect/infrt/common/types.h" +#include "paddle/infrt/dialect/infrt/common/types.h" #include "paddle/infrt/dialect/phi/ir/infrt_phi_baseDialect.h.inc" #define GET_OP_CLASSES diff --git a/paddle/infrt/dialect/phi/ir/phi_kernels.cc b/paddle/infrt/dialect/phi/ir/phi_kernels.cc index c7a837b83fc..69c3f963391 100644 --- a/paddle/infrt/dialect/phi/ir/phi_kernels.cc +++ b/paddle/infrt/dialect/phi/ir/phi_kernels.cc @@ -13,12 +13,12 @@ // limitations under the License. #include "paddle/infrt/dialect/phi/ir/phi_kernels.h" + #include #include "paddle/infrt/dialect/phi/ir/phi_gpu_kernelsDialect.cpp.inc" #define GET_OP_CLASSES #include "paddle/infrt/dialect/phi/ir/phi_cpu_kernels.cpp.inc" // NOLINT - #include "paddle/infrt/dialect/phi/ir/phi_cpu_kernelsDialect.cpp.inc" #define GET_OP_CLASSES #include "paddle/infrt/dialect/phi/ir/phi_gpu_kernels.cpp.inc" // NOLINT diff --git a/paddle/infrt/dialect/phi/ir/phi_kernels.h b/paddle/infrt/dialect/phi/ir/phi_kernels.h index 4f8b41852cc..9321ebb148f 100644 --- a/paddle/infrt/dialect/phi/ir/phi_kernels.h +++ b/paddle/infrt/dialect/phi/ir/phi_kernels.h @@ -32,11 +32,9 @@ #include "paddle/infrt/dialect/dense_tensor.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/phi/ir/phi_base.h" - #include "paddle/infrt/dialect/phi/ir/phi_cpu_kernelsDialect.h.inc" #define GET_OP_CLASSES #include "paddle/infrt/dialect/phi/ir/phi_cpu_kernels.h.inc" - #include "paddle/infrt/dialect/phi/ir/phi_gpu_kernelsDialect.h.inc" #define GET_OP_CLASSES #include "paddle/infrt/dialect/phi/ir/phi_gpu_kernels.h.inc" diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc index 9425a290142..ff870a06752 100644 --- a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc @@ -13,7 +13,9 @@ // limitations under the License. 
#include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h" + #include + #include "paddle/infrt/dialect/phi/data_type.h" #include "paddle/phi/core/type_defs.h" #include "paddle/phi/kernels/declarations.h" diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.h b/paddle/infrt/dialect/phi/pass/kernel_op_desc.h index cdc8f7cbff5..4385d3c9417 100644 --- a/paddle/infrt/dialect/phi/pass/kernel_op_desc.h +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.h @@ -16,6 +16,7 @@ #include #include + #include "paddle/infrt/dialect/infrt/common/types.h" namespace infrt { diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc b/paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc index bd5f0799a60..24af0ea4378 100644 --- a/paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. +// clang-format off #include #include #include #include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h" #include "paddle/phi/kernels/declarations.h" +// clang-format on namespace infrt { diff --git a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc index 862c9ae4ee5..f4de56b42a6 100644 --- a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc +++ b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc @@ -20,6 +20,7 @@ #include #include #include + #include #include #include diff --git a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h index a0e74426a40..9748e1679d3 100644 --- a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h +++ b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h @@ -14,6 +14,7 @@ #pragma once #include + #include "paddle/infrt/dialect/infrt/common/types.h" namespace infrt { diff --git a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h index 7cb2651ccf6..30bde83cd81 100644 --- a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h +++ b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h @@ -15,7 +15,9 @@ limitations under the License. 
*/ #pragma once #include + #include + #include "paddle/infrt/dialect/pd/common/pd_ops_info.h" #include "paddle/phi/core/compat/arg_map_context.h" diff --git a/paddle/infrt/dialect/phi/phi_exec.cc b/paddle/infrt/dialect/phi/phi_exec.cc index a2808a00cb6..0aae8cc9337 100644 --- a/paddle/infrt/dialect/phi/phi_exec.cc +++ b/paddle/infrt/dialect/phi/phi_exec.cc @@ -41,7 +41,9 @@ bool parse_inputs(int argc, *params_file_name = argv[2]; return true; } - default: { return false; } + default: { + return false; + } } } diff --git a/paddle/infrt/dialect/print_ir.cc b/paddle/infrt/dialect/print_ir.cc index b118a5f7a9c..a240cebe736 100644 --- a/paddle/infrt/dialect/print_ir.cc +++ b/paddle/infrt/dialect/print_ir.cc @@ -28,6 +28,7 @@ #include #include #include + #include #include "paddle/infrt/common/global.h" @@ -74,8 +75,8 @@ void printOperation(mlir::Operation *op, int indent) { if (!op->getAttrs().empty()) { printIndent(indent) << op->getAttrs().size() << " attributes:\n"; for (mlir::NamedAttribute attr : op->getAttrs()) { - printIndent(indent + 1) << "- {" << attr.getName() << " : " - << attr.getValue() << "}\n"; + printIndent(indent + 1) + << "- {" << attr.getName() << " : " << attr.getValue() << "}\n"; } } diff --git a/paddle/infrt/dialect/tensor_shape.cc b/paddle/infrt/dialect/tensor_shape.cc index 92c03818264..9a825224f1d 100644 --- a/paddle/infrt/dialect/tensor_shape.cc +++ b/paddle/infrt/dialect/tensor_shape.cc @@ -66,5 +66,4 @@ void TensorShapeDialect::printType(mlir::Type type, #define GET_OP_CLASSES #include "paddle/infrt/dialect/tensor_shape.cpp.inc" // NOLINT - #include "paddle/infrt/dialect/tensor_shape_dialect.cpp.inc" diff --git a/paddle/infrt/dialect/tensorrt/convert.h b/paddle/infrt/dialect/tensorrt/convert.h index 2a242ca285b..2dcd86486f5 100644 --- a/paddle/infrt/dialect/tensorrt/convert.h +++ b/paddle/infrt/dialect/tensorrt/convert.h @@ -20,6 +20,7 @@ #include #include #include + #include "paddle/infrt/dialect/infrt/common/types.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/pd/ir/pd_ops.h" diff --git a/paddle/infrt/dialect/tensorrt/trt_exec.cc b/paddle/infrt/dialect/tensorrt/trt_exec.cc index dcb84ceb50e..899e71f1c99 100644 --- a/paddle/infrt/dialect/tensorrt/trt_exec.cc +++ b/paddle/infrt/dialect/tensorrt/trt_exec.cc @@ -11,10 +11,14 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
+ +// clang-format off #include #include + #include #include + #include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/infrt/pass/infrt_weights_unfold_pass.h" #include "paddle/infrt/dialect/mlir_loader.h" @@ -44,6 +48,7 @@ #endif #include +// clang-format on int main(int argc, char** argv) { static llvm::cl::opt input_file( diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc index bbe9a76e87b..7109fc772ec 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc @@ -18,6 +18,7 @@ #include #include #include + #include #include #include diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc index d5ce871edd1..d74fe3e5e9c 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc @@ -15,6 +15,7 @@ #include "paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h" #include + #include "paddle/infrt/dialect/pd/ir/pd_ops.h" namespace infrt { diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc index d7b917385cf..35b869fb307 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc @@ -16,6 +16,7 @@ #include #include + #include "paddle/infrt/dialect/dense_tensor.h" #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.cc b/paddle/infrt/dialect/tensorrt/trt_ops.cc index 415a78a6967..161fbbbcc65 100644 --- a/paddle/infrt/dialect/tensorrt/trt_ops.cc +++ b/paddle/infrt/dialect/tensorrt/trt_ops.cc @@ -11,6 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
+ +// clang-format off #include "paddle/infrt/dialect/tensorrt/trt_ops.h" #include #include @@ -24,6 +26,7 @@ #include "paddle/infrt/dialect/dense_tensor.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/phi/ir/phi_base.h" +// clang-format on namespace infrt { namespace trt { diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.h b/paddle/infrt/dialect/tensorrt/trt_ops.h index 76768037dbd..e851c26c43c 100644 --- a/paddle/infrt/dialect/tensorrt/trt_ops.h +++ b/paddle/infrt/dialect/tensorrt/trt_ops.h @@ -28,6 +28,7 @@ #include #include #include + #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/pd/ir/pd_ops.h" diff --git a/paddle/infrt/dialect/tensorrt/trt_type_convert_pass.cc b/paddle/infrt/dialect/tensorrt/trt_type_convert_pass.cc index 35c81d02301..1cb7c4155b9 100644 --- a/paddle/infrt/dialect/tensorrt/trt_type_convert_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_type_convert_pass.cc @@ -15,6 +15,7 @@ #include "paddle/infrt/dialect/tensorrt/trt_type_convert_pass.h" #include + #include #include "llvm/ADT/StringRef.h" diff --git a/paddle/infrt/host_context/core_runtime.cc b/paddle/infrt/host_context/core_runtime.cc index e3917bd07d2..3dbb0b41c9f 100644 --- a/paddle/infrt/host_context/core_runtime.cc +++ b/paddle/infrt/host_context/core_runtime.cc @@ -14,9 +14,8 @@ #include "paddle/infrt/host_context/core_runtime.h" -#include - #include +#include #include #include "paddle/infrt/host_context/kernel_registry.h" diff --git a/paddle/infrt/host_context/core_runtime.h b/paddle/infrt/host_context/core_runtime.h index acb6a66cac6..585369e249b 100644 --- a/paddle/infrt/host_context/core_runtime.h +++ b/paddle/infrt/host_context/core_runtime.h @@ -46,7 +46,7 @@ class CoreRuntime : public std::enable_shared_from_this { //! Get the results of the execution. 
llvm::SmallVector // - GetResults(llvm::ArrayRef arg_names); + GetResults(llvm::ArrayRef arg_names); std::shared_ptr getptr() { return std::shared_ptr(this); diff --git a/paddle/infrt/host_context/kernel_registry.cc b/paddle/infrt/host_context/kernel_registry.cc index 5693e973a3f..2518056ba9d 100644 --- a/paddle/infrt/host_context/kernel_registry.cc +++ b/paddle/infrt/host_context/kernel_registry.cc @@ -39,8 +39,8 @@ const std::vector &KernelRegistry::GetAttrNameList( void KernelRegistry::AddKernel(const std::string &key, KernelImplementation fn, const std::vector &attr_order) { - CHECK(!impl_->data.count(key)) << "kernel [" << key - << "] is registered twice"; + CHECK(!impl_->data.count(key)) + << "kernel [" << key << "] is registered twice"; impl_->data.emplace( key, std::make_pair([fn]() { return fn; }, std::move(attr_order))); } @@ -48,8 +48,8 @@ void KernelRegistry::AddKernel(const std::string &key, void KernelRegistry::AddKernel(const std::string &key, KernelLauncher fn, const std::vector &attr_order) { - CHECK(!impl_->data.count(key)) << "kernel [" << key - << "] is registered twice"; + CHECK(!impl_->data.count(key)) + << "kernel [" << key << "] is registered twice"; impl_->data.emplace(key, std::make_pair(std::move(fn), std::move(attr_order))); } diff --git a/paddle/infrt/host_context/mlir_exec.cc b/paddle/infrt/host_context/mlir_exec.cc index 6ad51a02bda..1ae7cdc742a 100644 --- a/paddle/infrt/host_context/mlir_exec.cc +++ b/paddle/infrt/host_context/mlir_exec.cc @@ -14,6 +14,7 @@ #include #include + #include #include diff --git a/paddle/infrt/host_context/mlir_program_executor.h b/paddle/infrt/host_context/mlir_program_executor.h index c2ccb90640b..7808c460457 100644 --- a/paddle/infrt/host_context/mlir_program_executor.h +++ b/paddle/infrt/host_context/mlir_program_executor.h @@ -19,10 +19,10 @@ #include #include #include -#include #include #include +#include #include "paddle/infrt/host_context/core_runtime.h" #include "paddle/infrt/host_context/kernel_registry.h" diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.cc b/paddle/infrt/host_context/mlir_to_runtime_translate.cc index 05bb28b7c56..9292e593a70 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.cc +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.cc @@ -14,6 +14,7 @@ #include "paddle/infrt/host_context/mlir_to_runtime_translate.h" +#include #include #include #include @@ -23,7 +24,6 @@ #include #include -#include #include #include #include @@ -591,8 +591,8 @@ bool MlirToRuntimeTranslator::EmitCallOp(mlir::Operation* op, { // lookup the callee function auto it = table.find(callee_name.getValue().str()); - CHECK(it != table.end()) << "can't find function [" - << callee_name.getValue().str() << "]"; + CHECK(it != table.end()) + << "can't find function [" << callee_name.getValue().str() << "]"; auto* function = impl_->cur_op->CreateFunctionExecutable(it->second, &impl_->func_defs); impl_->cur_op->AppendAttribute(new Value(function)); diff --git a/paddle/infrt/host_context/op_executable.cc b/paddle/infrt/host_context/op_executable.cc index 4d588a9c2b5..b53dc0545c7 100644 --- a/paddle/infrt/host_context/op_executable.cc +++ b/paddle/infrt/host_context/op_executable.cc @@ -15,6 +15,7 @@ #include "paddle/infrt/host_context/op_executable.h" #include + #include #include diff --git a/paddle/infrt/host_context/op_executable.h b/paddle/infrt/host_context/op_executable.h index 550f6ab6349..b80b99fd414 100644 --- a/paddle/infrt/host_context/op_executable.h +++ 
b/paddle/infrt/host_context/op_executable.h @@ -16,6 +16,7 @@ #include #include #include + #include #include #include diff --git a/paddle/infrt/host_context/paddle_mlir.h b/paddle/infrt/host_context/paddle_mlir.h index 57bdc1b4857..629181cca3d 100644 --- a/paddle/infrt/host_context/paddle_mlir.h +++ b/paddle/infrt/host_context/paddle_mlir.h @@ -20,6 +20,7 @@ #include #include #include + #include #include #include diff --git a/paddle/infrt/host_context/paddle_mlir_converter.cc b/paddle/infrt/host_context/paddle_mlir_converter.cc index a2808a00cb6..0aae8cc9337 100644 --- a/paddle/infrt/host_context/paddle_mlir_converter.cc +++ b/paddle/infrt/host_context/paddle_mlir_converter.cc @@ -41,7 +41,9 @@ bool parse_inputs(int argc, *params_file_name = argv[2]; return true; } - default: { return false; } + default: { + return false; + } } } diff --git a/paddle/infrt/host_context/symbol_table.h b/paddle/infrt/host_context/symbol_table.h index 805215a78ce..8c79c78c690 100644 --- a/paddle/infrt/host_context/symbol_table.h +++ b/paddle/infrt/host_context/symbol_table.h @@ -14,9 +14,8 @@ #pragma once -#include - #include +#include #include "paddle/infrt/host_context/value.h" diff --git a/paddle/infrt/host_context/value.h b/paddle/infrt/host_context/value.h index 1834cb4c0db..af785c13349 100644 --- a/paddle/infrt/host_context/value.h +++ b/paddle/infrt/host_context/value.h @@ -159,15 +159,15 @@ class Value : public common::Object { template const T& get() const { - CHECK(data.template is()) << "typeid: " << data.index() - << " != " << ValueVariantType::IndexOf; + CHECK(data.template is()) + << "typeid: " << data.index() << " != " << ValueVariantType::IndexOf; return data.get(); } template T& get() { - CHECK(data.template is()) << "typeid: " << data.index() - << " != " << ValueVariantType::IndexOf; + CHECK(data.template is()) + << "typeid: " << data.index() << " != " << ValueVariantType::IndexOf; return data.get(); } diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc index 95e25b243f3..8c49f47e7d8 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/infrt/kernel/phi/dense_tensor_kernels.h" + #include + #include "llvm/Support/ErrorHandling.h" #include "paddle/infrt/backends/host/phi_allocator.h" #include "paddle/infrt/common/string.h" diff --git a/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc b/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc index 2e40261f273..cb9640451f9 100644 --- a/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc +++ b/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc @@ -13,6 +13,7 @@ // limitations under the License. 
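The kernel_registry.cc, EmitCallOp, and Value::get hunks above all show the other recurring rewrite: when a CHECK(...) << ... chain overflows the column limit, the formatter now breaks once, before the first <<, instead of splitting the message mid-chain. A sketch of that shape with a minimal CHECK stand-in (glog itself is not assumed, so the snippet compiles alone):

#include <cstdlib>
#include <iostream>

// Minimal stand-in for glog's CHECK, just for this illustration.
struct FailStream {
  template <class T>
  FailStream& operator<<(const T& v) {
    std::cerr << v;
    return *this;
  }
  ~FailStream() {
    std::cerr << "\n";
    std::abort();
  }
};

#define CHECK(cond) \
  if (!(cond)) FailStream{}

int main() {
  const bool duplicate = false;
  // The whole message chain moves to the next line, broken before `<<`:
  CHECK(!duplicate)
      << "kernel [" << "demo_kernel" << "] is registered twice";
  return 0;
}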
#include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.h" + #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/meta_tensor.h" diff --git a/paddle/infrt/kernel/phi/infershaped/infershaped_utils.h b/paddle/infrt/kernel/phi/infershaped/infershaped_utils.h index 277c4ad6b7a..531d77ba952 100644 --- a/paddle/infrt/kernel/phi/infershaped/infershaped_utils.h +++ b/paddle/infrt/kernel/phi/infershaped/infershaped_utils.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/infrt/tensor/dense_host_tensor.h" #include "paddle/phi/core/dense_tensor.h" diff --git a/paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h b/paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h index d8702784720..bac25e0f437 100644 --- a/paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h +++ b/paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h @@ -14,6 +14,7 @@ #pragma once #include + #include #include "paddle/infrt/backends/host/phi_context.h" diff --git a/paddle/infrt/kernel/tensorrt/trt_kernels.cc b/paddle/infrt/kernel/tensorrt/trt_kernels.cc index c0f5ebb4a76..0ea68f2e835 100644 --- a/paddle/infrt/kernel/tensorrt/trt_kernels.cc +++ b/paddle/infrt/kernel/tensorrt/trt_kernels.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +// clang-format off #include "paddle/infrt/kernel/tensorrt/trt_kernels.h" #include #include @@ -36,6 +37,7 @@ #include "paddle/infrt/host_context/symbol_table.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" +// clang-format on namespace infrt { namespace kernel { diff --git a/paddle/infrt/kernel/tensorrt/trt_kernels.h b/paddle/infrt/kernel/tensorrt/trt_kernels.h index bf23bd45c13..bf41c124a29 100644 --- a/paddle/infrt/kernel/tensorrt/trt_kernels.h +++ b/paddle/infrt/kernel/tensorrt/trt_kernels.h @@ -19,7 +19,6 @@ #include #include "mlir/IR/Operation.h" - #include "paddle/infrt/backends/tensorrt/trt_engine.h" #include "paddle/phi/backends/gpu/gpu_context.h" diff --git a/paddle/infrt/kernel/test_kernels.cc b/paddle/infrt/kernel/test_kernels.cc index bcf475d1bc0..e00afa4b790 100644 --- a/paddle/infrt/kernel/test_kernels.cc +++ b/paddle/infrt/kernel/test_kernels.cc @@ -92,11 +92,11 @@ class BenchmarkStats { std::sort(run_times_walltime_.begin(), run_times_walltime_.end()); std::sort(run_times_cpu_.begin(), run_times_cpu_.end()); - auto percentile = []( - double p, const std::vector &run_times) { - assert(p >= 0.0 && p <= 1.0); - return run_times[run_times.size() * p]; - }; + auto percentile = + [](double p, const std::vector &run_times) { + assert(p >= 0.0 && p <= 1.0); + return run_times[run_times.size() * p]; + }; // BM: prefix is added to make grepping results from lit output easier. std::string prefix; diff --git a/paddle/infrt/paddle/scope.h b/paddle/infrt/paddle/scope.h index 4ebf846374c..1f81d0914df 100644 --- a/paddle/infrt/paddle/scope.h +++ b/paddle/infrt/paddle/scope.h @@ -13,10 +13,9 @@ // limitations under the License. #pragma once -#include - #include #include +#include #include #include "paddle/infrt/common/macros.h" diff --git a/paddle/infrt/support/type_traits.h b/paddle/infrt/support/type_traits.h index 341dabb7c1c..33a42fe37ea 100644 --- a/paddle/infrt/support/type_traits.h +++ b/paddle/infrt/support/type_traits.h @@ -115,7 +115,8 @@ struct nonesuch { template class Op, + template + class Op, class... 
Args> struct detector : std::false_type { using value_t = std::false_type; diff --git a/paddle/infrt/tests/models/test_abs.cc b/paddle/infrt/tests/models/test_abs.cc index 89bbe78ffe2..aa5a2c6945b 100644 --- a/paddle/infrt/tests/models/test_abs.cc +++ b/paddle/infrt/tests/models/test_abs.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +// clang-format off #include #include #include @@ -49,6 +50,7 @@ #include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" #include "paddle/infrt/dialect/phi/ir/phi_base.h" #include "paddle/infrt/dialect/phi/ir/phi_kernels.h" +// clang-format on static llvm::cl::list cl_shared_libs( // NOLINT "shared_libs", diff --git a/paddle/phi/api/ext/op_meta_info.h b/paddle/phi/api/ext/op_meta_info.h index a9475db8008..fa19714dde7 100644 --- a/paddle/phi/api/ext/op_meta_info.h +++ b/paddle/phi/api/ext/op_meta_info.h @@ -317,25 +317,24 @@ using InferShapeFunc = std::vector> (*)( const std::vector>>& vec_input_shapes, const std::vector& attrs); -#define PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPE(input_type) \ - template \ - struct InferShapeCallHelper { \ - template \ - static Return InferShape( \ - const std::vector>& input_shapes, \ - const std::vector>>& \ - vec_input_shapes, \ - const std::vector& attrs, \ - const PreviousArgs&... pargs) { \ - input_type arg = input_shapes[in_idx]; \ - return InferShapeCallHelper::template InferShape( \ - input_shapes, vec_input_shapes, attrs, pargs..., arg); \ - } \ +#define PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPE(input_type) \ + template \ + struct InferShapeCallHelper { \ + template \ + static Return InferShape( \ + const std::vector>& input_shapes, \ + const std::vector>>& \ + vec_input_shapes, \ + const std::vector& attrs, \ + const PreviousArgs&... pargs) { \ + input_type arg = input_shapes[in_idx]; \ + return InferShapeCallHelper:: \ + template InferShape( \ + input_shapes, vec_input_shapes, attrs, pargs..., arg); \ + } \ } #define PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPES(input_type) \ @@ -397,10 +396,8 @@ struct InferShapeFuncImpl { const std::vector>& input_shapes, const std::vector>>& vec_input_shapes, const std::vector& attrs) { - return InferShapeCallHelper>::template InferShape<0, - 0, - 0>( - input_shapes, vec_input_shapes, attrs); + return InferShapeCallHelper>:: + template InferShape<0, 0, 0>(input_shapes, vec_input_shapes, attrs); } private: @@ -482,20 +479,19 @@ using InferDtypeFunc = std::vector (*)( } \ } -#define PD_SPECIALIZE_InferDtypeCallHelper_FOR_DTYPES(input_type) \ - template \ - struct InferDtypeCallHelper { \ - template \ - static Return InferDtype( \ - const std::vector& input_dtypes, \ - const std::vector>& vec_input_dtypes, \ - const PreviousArgs&... pargs) { \ - input_type arg = vec_input_dtypes[vec_in_idx]; \ - return InferDtypeCallHelper::template InferDtype( \ - input_dtypes, vec_input_dtypes, pargs..., arg); \ - } \ +#define PD_SPECIALIZE_InferDtypeCallHelper_FOR_DTYPES(input_type) \ + template \ + struct InferDtypeCallHelper { \ + template \ + static Return InferDtype( \ + const std::vector& input_dtypes, \ + const std::vector>& vec_input_dtypes, \ + const PreviousArgs&... 
pargs) { \ + input_type arg = vec_input_dtypes[vec_in_idx]; \ + return InferDtypeCallHelper:: \ + template InferDtype( \ + input_dtypes, vec_input_dtypes, pargs..., arg); \ + } \ } template diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index 3ef7763d57e..5ca7f2b51ed 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/phi/api/lib/api_custom_impl.h" +#include "glog/logging.h" #include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" @@ -28,8 +29,6 @@ limitations under the License. */ #include "paddle/phi/infermeta/nullary.h" #include "paddle/phi/infermeta/unary.h" -#include "glog/logging.h" - namespace paddle { namespace experimental { diff --git a/paddle/phi/api/lib/backend_set.h b/paddle/phi/api/lib/backend_set.h index 2aa4f969221..93f8f05b74b 100644 --- a/paddle/phi/api/lib/backend_set.h +++ b/paddle/phi/api/lib/backend_set.h @@ -32,8 +32,9 @@ class BackendSet final { public: constexpr BackendSet() : bitset_(0) {} explicit constexpr BackendSet(Backend b) - : bitset_(b == Backend::UNDEFINED ? 0 : 1ULL << (static_cast(b) - - 1)) {} + : bitset_(b == Backend::UNDEFINED + ? 0 + : 1ULL << (static_cast(b) - 1)) {} inline uint64_t bitset() const { return bitset_; } diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index 12f7b8bba58..4803616812c 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +// clang-format off #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" @@ -23,6 +24,7 @@ limitations under the License. */ #include "paddle/phi/kernels/transfer_layout_kernel.h" #include "paddle/fluid/framework/tensor_util.h" +// clang-format on namespace paddle { namespace experimental { diff --git a/paddle/phi/api/lib/sparse_api_custom_impl.cc b/paddle/phi/api/lib/sparse_api_custom_impl.cc index 71ba8eaae2d..0b93c96e7f8 100644 --- a/paddle/phi/api/lib/sparse_api_custom_impl.cc +++ b/paddle/phi/api/lib/sparse_api_custom_impl.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/phi/api/lib/sparse_api_custom_impl.h" #include + #include "glog/logging.h" #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index a340c0fed10..74364d5ab03 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +// clang-format off #include "paddle/phi/api/include/tensor.h" #include @@ -34,6 +35,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/tensor_utils.h" #include "paddle/fluid/platform/stream/cuda_stream.h" +// clang-format off namespace paddle { namespace experimental { diff --git a/paddle/phi/api/lib/tensor_copy.cc b/paddle/phi/api/lib/tensor_copy.cc index 85de3601fd9..5f8c2ed71e9 100644 --- a/paddle/phi/api/lib/tensor_copy.cc +++ b/paddle/phi/api/lib/tensor_copy.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/api/lib/tensor_copy.h" + #include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/core/compat/convert_utils.h" diff --git a/paddle/phi/api/lib/tensor_method.cc b/paddle/phi/api/lib/tensor_method.cc index 5285392b4a6..fbeeb3332ea 100644 --- a/paddle/phi/api/lib/tensor_method.cc +++ b/paddle/phi/api/lib/tensor_method.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +// clang-format off #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/common/int_array.h" @@ -22,6 +23,7 @@ limitations under the License. */ #include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/infermeta/unary.h" +// clang-format off namespace paddle { namespace experimental { diff --git a/paddle/phi/api/lib/utils/tensor_utils.h b/paddle/phi/api/lib/utils/tensor_utils.h index 36a0901bbe9..f930f5b11f6 100644 --- a/paddle/phi/api/lib/utils/tensor_utils.h +++ b/paddle/phi/api/lib/utils/tensor_utils.h @@ -18,7 +18,6 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/variable.h" - #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" diff --git a/paddle/phi/backends/callback_manager.cc b/paddle/phi/backends/callback_manager.cc index 4a958ef73bf..295f70fc65c 100644 --- a/paddle/phi/backends/callback_manager.cc +++ b/paddle/phi/backends/callback_manager.cc @@ -13,11 +13,12 @@ // limitations under the License. #include "paddle/phi/backends/callback_manager.h" -#include "paddle/fluid/platform/device/device_wrapper.h" -#include "paddle/fluid/platform/enforce.h" #include +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/enforce.h" + namespace phi { CallbackManager::CallbackManager(stream::Stream *stream) diff --git a/paddle/phi/backends/custom/custom_context.h b/paddle/phi/backends/custom/custom_context.h index 37b0ee21219..57be8534fa9 100644 --- a/paddle/phi/backends/custom/custom_context.h +++ b/paddle/phi/backends/custom/custom_context.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/phi/common/place.h" #include "paddle/phi/core/device_context.h" diff --git a/paddle/phi/backends/custom/custom_device_test.cc b/paddle/phi/backends/custom/custom_device_test.cc index 53b88f9b4ac..51fa74b4dc5 100644 --- a/paddle/phi/backends/custom/custom_device_test.cc +++ b/paddle/phi/backends/custom/custom_device_test.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include + #include #include "paddle/fluid/framework/tensor.h" diff --git a/paddle/phi/backends/device_base.cc b/paddle/phi/backends/device_base.cc index b72c6efd51f..e57653702c5 100644 --- a/paddle/phi/backends/device_base.cc +++ b/paddle/phi/backends/device_base.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/backends/device_base.h" + #include "gflags/gflags.h" #include "glog/logging.h" #include "paddle/phi/core/enforce.h" @@ -214,8 +215,9 @@ size_t DeviceInterface::AllocSize(size_t dev_id, bool realloc) { size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb : FLAGS_initial_gpu_memory_in_mb; size_t alloc_bytes = - (flag_mb > 0ul ? flag_mb << 20 : available_to_alloc * - FLAGS_fraction_of_gpu_memory_to_use); + (flag_mb > 0ul + ? flag_mb << 20 + : available_to_alloc * FLAGS_fraction_of_gpu_memory_to_use); PADDLE_ENFORCE_GE(available_to_alloc, alloc_bytes, phi::errors::ResourceExhausted( diff --git a/paddle/phi/backends/device_ext.h b/paddle/phi/backends/device_ext.h index 749d8d323b6..ff58f4f35fd 100644 --- a/paddle/phi/backends/device_ext.h +++ b/paddle/phi/backends/device_ext.h @@ -34,7 +34,9 @@ typedef enum { C_INTERNAL_ERROR // plugin error } C_Status; -typedef struct C_Device_st { int id; } * C_Device; +typedef struct C_Device_st { + int id; +} * C_Device; typedef struct C_Stream_st* C_Stream; diff --git a/paddle/phi/backends/device_manager.h b/paddle/phi/backends/device_manager.h index 18d51687ef1..56d99ba43bd 100644 --- a/paddle/phi/backends/device_manager.h +++ b/paddle/phi/backends/device_manager.h @@ -19,11 +19,10 @@ #include "paddle/phi/backends/device_base.h" #include "paddle/phi/backends/device_ext.h" +#include "paddle/phi/backends/dynload/port.h" #include "paddle/phi/backends/event.h" #include "paddle/phi/backends/stream.h" #include "paddle/phi/common/place.h" - -#include "paddle/phi/backends/dynload/port.h" #include "paddle/phi/core/utils/rw_lock.h" namespace phi { diff --git a/paddle/phi/backends/dynload/cublas.h b/paddle/phi/backends/dynload/cublas.h index ee0696fb4b2..308ae2accef 100644 --- a/paddle/phi/backends/dynload/cublas.h +++ b/paddle/phi/backends/dynload/cublas.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include // NOLINT #include diff --git a/paddle/phi/backends/dynload/cublasLt.h b/paddle/phi/backends/dynload/cublasLt.h index 4c7ac9c3f21..1e2a20ebdf4 100644 --- a/paddle/phi/backends/dynload/cublasLt.h +++ b/paddle/phi/backends/dynload/cublasLt.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include + #include // NOLINT #include diff --git a/paddle/phi/backends/dynload/cuda_driver.h b/paddle/phi/backends/dynload/cuda_driver.h index f4ea70a81b9..f743a33a186 100644 --- a/paddle/phi/backends/dynload/cuda_driver.h +++ b/paddle/phi/backends/dynload/cuda_driver.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include // NOLINT #include "paddle/phi/backends/dynload/dynamic_loader.h" diff --git a/paddle/phi/backends/dynload/cudnn.cc b/paddle/phi/backends/dynload/cudnn.cc index 02d626d5f98..8aa3b623273 100644 --- a/paddle/phi/backends/dynload/cudnn.cc +++ b/paddle/phi/backends/dynload/cudnn.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/phi/backends/dynload/cudnn.h" + #include "paddle/fluid/platform/enforce.h" namespace phi { diff --git a/paddle/phi/backends/dynload/cudnn.h b/paddle/phi/backends/dynload/cudnn.h index a3afb98e3e6..7b9004308e9 100644 --- a/paddle/phi/backends/dynload/cudnn.h +++ b/paddle/phi/backends/dynload/cudnn.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #ifdef PADDLE_WITH_CUDA #include + #include // NOLINT #include "paddle/phi/backends/dynload/dynamic_loader.h" diff --git a/paddle/phi/backends/dynload/cufft.cc b/paddle/phi/backends/dynload/cufft.cc index 596a68c1ed6..5a7080032d2 100644 --- a/paddle/phi/backends/dynload/cufft.cc +++ b/paddle/phi/backends/dynload/cufft.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/backends/dynload/cufft.h" + #include "paddle/fluid/platform/enforce.h" namespace phi { diff --git a/paddle/phi/backends/dynload/cufft.h b/paddle/phi/backends/dynload/cufft.h index 4697e335477..a27d7c3ab1e 100644 --- a/paddle/phi/backends/dynload/cufft.h +++ b/paddle/phi/backends/dynload/cufft.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include // NOLINT #include "paddle/phi/backends/dynload/dynamic_loader.h" diff --git a/paddle/phi/backends/dynload/cupti.h b/paddle/phi/backends/dynload/cupti.h index a526fbfd926..22e21b78f4f 100644 --- a/paddle/phi/backends/dynload/cupti.h +++ b/paddle/phi/backends/dynload/cupti.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include + #include // NOLINT #include "paddle/phi/backends/dynload/dynamic_loader.h" diff --git a/paddle/phi/backends/dynload/curand.h b/paddle/phi/backends/dynload/curand.h index 875403b03bb..f3c4496dc4d 100644 --- a/paddle/phi/backends/dynload/curand.h +++ b/paddle/phi/backends/dynload/curand.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include // NOLINT #include "paddle/phi/backends/dynload/dynamic_loader.h" diff --git a/paddle/phi/backends/dynload/cusolver.h b/paddle/phi/backends/dynload/cusolver.h index 40e5f183dc0..1354e310554 100644 --- a/paddle/phi/backends/dynload/cusolver.h +++ b/paddle/phi/backends/dynload/cusolver.h @@ -15,6 +15,7 @@ limitations under the License. */ #include #include + #include // NOLINT #include "paddle/phi/backends/dynload/dynamic_loader.h" diff --git a/paddle/phi/backends/dynload/cusparse.h b/paddle/phi/backends/dynload/cusparse.h index 8f7d54d55db..a7e305f98d4 100644 --- a/paddle/phi/backends/dynload/cusparse.h +++ b/paddle/phi/backends/dynload/cusparse.h @@ -15,6 +15,7 @@ limitations under the License. */ #include #include + #include // NOLINT #include "paddle/phi/backends/dynload/dynamic_loader.h" diff --git a/paddle/phi/backends/dynload/hiprand.h b/paddle/phi/backends/dynload/hiprand.h index ccaf02d9304..3e9502dd94d 100644 --- a/paddle/phi/backends/dynload/hiprand.h +++ b/paddle/phi/backends/dynload/hiprand.h @@ -16,9 +16,9 @@ limitations under the License. */ #include #include // NOLINT -#include "paddle/phi/backends/dynload/port.h" #include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/backends/dynload/port.h" namespace phi { namespace dynload { diff --git a/paddle/phi/backends/dynload/hiprtc.h b/paddle/phi/backends/dynload/hiprtc.h index 0404aad5593..75dd88f87bd 100644 --- a/paddle/phi/backends/dynload/hiprtc.h +++ b/paddle/phi/backends/dynload/hiprtc.h @@ -15,7 +15,9 @@ limitations under the License. 
*/ #pragma once #include + #include // NOLINT + #include "paddle/phi/backends/dynload/dynamic_loader.h" #include "paddle/phi/backends/dynload/port.h" diff --git a/paddle/phi/backends/dynload/lapack.cc b/paddle/phi/backends/dynload/lapack.cc index bb03beabd4f..9719da97751 100644 --- a/paddle/phi/backends/dynload/lapack.cc +++ b/paddle/phi/backends/dynload/lapack.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/backends/dynload/lapack.h" + #include namespace phi { diff --git a/paddle/phi/backends/dynload/lapack.h b/paddle/phi/backends/dynload/lapack.h index c81c66c6928..f0e1e9ad7a4 100644 --- a/paddle/phi/backends/dynload/lapack.h +++ b/paddle/phi/backends/dynload/lapack.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/platform/complex.h" #include "paddle/phi/backends/dynload/dynamic_loader.h" #include "paddle/phi/backends/dynload/port.h" diff --git a/paddle/phi/backends/dynload/miopen.cc b/paddle/phi/backends/dynload/miopen.cc index e7916873ccf..9c58da1d6ff 100644 --- a/paddle/phi/backends/dynload/miopen.cc +++ b/paddle/phi/backends/dynload/miopen.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/backends/dynload/miopen.h" + #include "paddle/fluid/platform/enforce.h" namespace phi { diff --git a/paddle/phi/backends/dynload/miopen.h b/paddle/phi/backends/dynload/miopen.h index eb14bfe8ec5..eeaf8028ec3 100644 --- a/paddle/phi/backends/dynload/miopen.h +++ b/paddle/phi/backends/dynload/miopen.h @@ -14,10 +14,11 @@ limitations under the License. */ #pragma once #include - #include #include + #include // NOLINT + #include "paddle/phi/backends/dynload/dynamic_loader.h" #include "paddle/phi/backends/dynload/port.h" diff --git a/paddle/phi/backends/dynload/mklml.h b/paddle/phi/backends/dynload/mklml.h index 5f5520a831e..0f0c31f8064 100644 --- a/paddle/phi/backends/dynload/mklml.h +++ b/paddle/phi/backends/dynload/mklml.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include // NOLINT #include "paddle/phi/backends/dynload/dynamic_loader.h" diff --git a/paddle/phi/backends/dynload/mklrt.h b/paddle/phi/backends/dynload/mklrt.h index 8638d83d025..0267fb69a59 100644 --- a/paddle/phi/backends/dynload/mklrt.h +++ b/paddle/phi/backends/dynload/mklrt.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include // NOLINT #include "paddle/phi/backends/dynload/dynamic_loader.h" diff --git a/paddle/phi/backends/dynload/nccl.h b/paddle/phi/backends/dynload/nccl.h index b04ef0f0651..6c73c562caa 100644 --- a/paddle/phi/backends/dynload/nccl.h +++ b/paddle/phi/backends/dynload/nccl.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include + #include // NOLINT #include "paddle/phi/backends/dynload/dynamic_loader.h" diff --git a/paddle/phi/backends/dynload/nvjpeg.h b/paddle/phi/backends/dynload/nvjpeg.h index 13bb8a5698f..6e71e6b582c 100644 --- a/paddle/phi/backends/dynload/nvjpeg.h +++ b/paddle/phi/backends/dynload/nvjpeg.h @@ -12,6 +12,7 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_CUDA #include + #include // NOLINT #include "paddle/phi/backends/dynload/dynamic_loader.h" diff --git a/paddle/phi/backends/dynload/nvrtc.h b/paddle/phi/backends/dynload/nvrtc.h index 516ca7686d2..9244e9487b2 100644 --- a/paddle/phi/backends/dynload/nvrtc.h +++ b/paddle/phi/backends/dynload/nvrtc.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include // NOLINT #include "paddle/phi/backends/dynload/dynamic_loader.h" diff --git a/paddle/phi/backends/dynload/nvtx.h b/paddle/phi/backends/dynload/nvtx.h index e9fd32668dc..a9a166b289e 100644 --- a/paddle/phi/backends/dynload/nvtx.h +++ b/paddle/phi/backends/dynload/nvtx.h @@ -15,6 +15,7 @@ limitations under the License. */ #ifndef _WIN32 #include #include + #include // NOLINT #include "paddle/phi/backends/dynload/dynamic_loader.h" diff --git a/paddle/phi/backends/dynload/port.h b/paddle/phi/backends/dynload/port.h index 981e5f5af64..d380993c9b6 100644 --- a/paddle/phi/backends/dynload/port.h +++ b/paddle/phi/backends/dynload/port.h @@ -28,6 +28,7 @@ #include // dladdr #include #include + #include // std::accumulate #else #ifndef NOMINMAX @@ -40,6 +41,7 @@ #include #include #include + #include // std::accumulate in msvc #ifndef S_ISDIR // windows port for sys/stat.h #define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR) diff --git a/paddle/phi/backends/dynload/rccl.h b/paddle/phi/backends/dynload/rccl.h index 44726849628..2da35dc2df2 100644 --- a/paddle/phi/backends/dynload/rccl.h +++ b/paddle/phi/backends/dynload/rccl.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include // NOLINT + #include "paddle/phi/backends/dynload/dynamic_loader.h" #include "paddle/phi/backends/dynload/port.h" diff --git a/paddle/phi/backends/dynload/rocblas.h b/paddle/phi/backends/dynload/rocblas.h index 18061b192e4..a9804b3d82a 100644 --- a/paddle/phi/backends/dynload/rocblas.h +++ b/paddle/phi/backends/dynload/rocblas.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include // NOLINT #include diff --git a/paddle/phi/backends/dynload/rocm_driver.h b/paddle/phi/backends/dynload/rocm_driver.h index 59e35b787a5..4e456db44c9 100644 --- a/paddle/phi/backends/dynload/rocm_driver.h +++ b/paddle/phi/backends/dynload/rocm_driver.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include // NOLINT #include "paddle/phi/backends/dynload/dynamic_loader.h" diff --git a/paddle/phi/backends/dynload/tensorrt.cc b/paddle/phi/backends/dynload/tensorrt.cc index cc3b4e01460..45525701020 100644 --- a/paddle/phi/backends/dynload/tensorrt.cc +++ b/paddle/phi/backends/dynload/tensorrt.cc @@ -13,6 +13,7 @@ limitations under the License. */ #include "paddle/phi/backends/dynload/tensorrt.h" + #include namespace phi { diff --git a/paddle/phi/backends/event.cc b/paddle/phi/backends/event.cc index a474536f865..43077d280f3 100644 --- a/paddle/phi/backends/event.cc +++ b/paddle/phi/backends/event.cc @@ -13,6 +13,7 @@ // limitations under the License. 
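The long run of dynload hunks above is one rule applied everywhere: each include block is separated by a blank line (the lone `+` lines), keeping a file's own header, system headers, and other project headers in distinct groups; the stripped angle-bracket header names in this rendering are left as-is. The other pattern in this stretch, from device_base.cc's AllocSize and backend_set.h earlier, is an over-long conditional wrapped before ? and :, so the condition and both arms read as three aligned lines. A compilable sketch of that wrap, with invented names and a /2 fallback standing in for the memory-fraction flag:

#include <cstdint>
#include <cstdio>

int main() {
  const std::uint64_t flag_mb = 0;             // hypothetical flag
  const std::uint64_t available = 1ull << 30;  // hypothetical budget
  // Long conditional broken before `?` and `:`, mirroring AllocSize:
  const std::uint64_t alloc_bytes =
      (flag_mb > 0ull
           ? flag_mb << 20
           : available / 2);  // stand-in for the fraction-of-memory flag
  std::printf("%llu\n", static_cast<unsigned long long>(alloc_bytes));
  return 0;
}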
#include "paddle/phi/backends/event.h" + #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/phi/backends/device_guard.h" #include "paddle/phi/backends/stream.h" diff --git a/paddle/phi/backends/gpu/cuda/cuda_helper.h b/paddle/phi/backends/gpu/cuda/cuda_helper.h index 08670832c77..c62addfd257 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_helper.h +++ b/paddle/phi/backends/gpu/cuda/cuda_helper.h @@ -60,7 +60,7 @@ namespace gpu { * } * } * -*/ + */ #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x; \ diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index e8c264b884f..f51f287ee4a 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -23,7 +23,6 @@ limitations under the License. */ #include #include "glog/logging.h" - #include "paddle/phi/api/ext/exception.h" #include "paddle/phi/backends/gpu/gpu_decls.h" #include "paddle/phi/backends/gpu/gpu_info.h" diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index db9f287041d..5246155131d 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include + #include "paddle/phi/backends/gpu/forwards.h" #include "paddle/phi/backends/gpu/gpu_decls.h" #include "paddle/phi/backends/gpu/gpu_helper.h" diff --git a/paddle/phi/backends/gpu/gpu_info.h b/paddle/phi/backends/gpu/gpu_info.h index 443830acf47..323565c000a 100644 --- a/paddle/phi/backends/gpu/gpu_info.h +++ b/paddle/phi/backends/gpu/gpu_info.h @@ -14,6 +14,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include + #include #include #include diff --git a/paddle/phi/backends/gpu/gpu_launch_config.h b/paddle/phi/backends/gpu/gpu_launch_config.h index 888b44632ea..2dd1431ff58 100644 --- a/paddle/phi/backends/gpu/gpu_launch_config.h +++ b/paddle/phi/backends/gpu/gpu_launch_config.h @@ -25,9 +25,11 @@ #endif #include + #include #include #include + #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/enforce.h" @@ -95,9 +97,9 @@ struct GpuLaunchConfig { }; /* According to NVIDIA, if number of threads per block is 64/128/256/512, - * cuda performs better. And number of blocks should be greater (at least - * 2x~4x) than number of SMs. Hence, SM count is took into account within - * this function to determine the right number of threads per block. */ + * cuda performs better. And number of blocks should be greater (at least + * 2x~4x) than number of SMs. Hence, SM count is took into account within + * this function to determine the right number of threads per block. 
*/ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context, int64_t numel, int vec_size = 1) { diff --git a/paddle/phi/backends/gpu/gpu_resources.h b/paddle/phi/backends/gpu/gpu_resources.h index 07ccb621540..7bec5eebf58 100644 --- a/paddle/phi/backends/gpu/gpu_resources.h +++ b/paddle/phi/backends/gpu/gpu_resources.h @@ -14,6 +14,7 @@ #pragma once #include + #include "paddle/phi/backends/gpu/gpu_decls.h" #include "paddle/phi/common/place.h" diff --git a/paddle/phi/backends/gpu/rocm/rocm_helper.h b/paddle/phi/backends/gpu/rocm/rocm_helper.h index 2d75b6ea4cb..14e9ca660bd 100644 --- a/paddle/phi/backends/gpu/rocm/rocm_helper.h +++ b/paddle/phi/backends/gpu/rocm/rocm_helper.h @@ -60,7 +60,7 @@ namespace gpu { * } * } * -*/ + */ #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ int64_t __index__ = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; \ diff --git a/paddle/phi/backends/gpu/rocm/rocm_info.cc b/paddle/phi/backends/gpu/rocm/rocm_info.cc index 23e58d34b25..b89d5a3c162 100644 --- a/paddle/phi/backends/gpu/rocm/rocm_info.cc +++ b/paddle/phi/backends/gpu/rocm/rocm_info.cc @@ -13,6 +13,7 @@ // limitations under the License. #include + #include "paddle/phi/backends/gpu/gpu_info.h" // TODO(phi): remove fluid headers. diff --git a/paddle/phi/backends/stream.cc b/paddle/phi/backends/stream.cc index 30939f31fcc..f8b15bdbd9e 100644 --- a/paddle/phi/backends/stream.cc +++ b/paddle/phi/backends/stream.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/backends/stream.h" + #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/phi/backends/device_guard.h" #include "paddle/phi/backends/event.h" diff --git a/paddle/phi/backends/xpu/enforce_xpu.h b/paddle/phi/backends/xpu/enforce_xpu.h index 29b048ead85..30095e3a007 100644 --- a/paddle/phi/backends/xpu/enforce_xpu.h +++ b/paddle/phi/backends/xpu/enforce_xpu.h @@ -14,11 +14,10 @@ limitations under the License. */ #pragma once +#include "paddle/fluid/platform/enforce.h" #include "paddle/phi/backends/xpu/xpu_header.h" #include "xpu/bkcl.h" -#include "paddle/fluid/platform/enforce.h" - namespace phi { namespace backends { namespace xpu { diff --git a/paddle/phi/backends/xpu/xpu_context.cc b/paddle/phi/backends/xpu/xpu_context.cc index 7cc9eb44bc4..dbff88c0a27 100644 --- a/paddle/phi/backends/xpu/xpu_context.cc +++ b/paddle/phi/backends/xpu/xpu_context.cc @@ -18,7 +18,6 @@ #include "paddle/phi/api/ext/exception.h" #include "paddle/phi/common/place.h" - #include "xpu/runtime.h" #include "xpu/runtime_ex.h" #include "xpu/xdnn.h" @@ -86,8 +85,8 @@ struct XPUContext::Impl { void Init() { owned_ = true; backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId()); - LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: " - << static_cast(place_.device); + LOG_FIRST_N(WARNING, 1) + << "Please NOTE: xpu device: " << static_cast(place_.device); context_ = xpu::create_context(); xpu_version_ = backends::xpu::get_xpu_version(place_.device); SetL3Cache(); diff --git a/paddle/phi/backends/xpu/xpu_context.h b/paddle/phi/backends/xpu/xpu_context.h index b87489c567c..d39b3c9cc1f 100644 --- a/paddle/phi/backends/xpu/xpu_context.h +++ b/paddle/phi/backends/xpu/xpu_context.h @@ -15,12 +15,12 @@ limitations under the License. 
*/ #pragma once #include -#include "paddle/phi/backends/xpu/forwards.h" -#include "paddle/phi/common/place.h" -#include "paddle/phi/core/device_context.h" +#include "paddle/phi/backends/xpu/forwards.h" #include "paddle/phi/backends/xpu/xpu_header.h" #include "paddle/phi/backends/xpu/xpu_info.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/device_context.h" namespace xpu = baidu::xpu::api; diff --git a/paddle/phi/backends/xpu/xpu_header.h b/paddle/phi/backends/xpu/xpu_header.h index 5337f78c642..1fe6f6d0779 100644 --- a/paddle/phi/backends/xpu/xpu_header.h +++ b/paddle/phi/backends/xpu/xpu_header.h @@ -22,7 +22,6 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" - #include "xpu/runtime.h" #include "xpu/runtime_ex.h" #include "xpu/xdnn.h" diff --git a/paddle/phi/backends/xpu/xpu_info.h b/paddle/phi/backends/xpu/xpu_info.h index b1056cdc4b1..9d5f073eaa8 100644 --- a/paddle/phi/backends/xpu/xpu_info.h +++ b/paddle/phi/backends/xpu/xpu_info.h @@ -12,6 +12,7 @@ limitations under the License. */ #include #include + #include "paddle/phi/common/place.h" namespace phi { diff --git a/paddle/phi/common/data_type.h b/paddle/phi/common/data_type.h index 1792cb93706..ef9b4250482 100644 --- a/paddle/phi/common/data_type.h +++ b/paddle/phi/common/data_type.h @@ -14,11 +14,10 @@ limitations under the License. */ #pragma once +#include "paddle/phi/api/ext/exception.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/common/float16.h" - -#include "paddle/phi/api/ext/exception.h" #include "paddle/phi/common/pstring.h" namespace paddle { diff --git a/paddle/phi/common/int_array.cc b/paddle/phi/common/int_array.cc index daed2b6625a..81701ee010c 100644 --- a/paddle/phi/common/int_array.cc +++ b/paddle/phi/common/int_array.cc @@ -14,9 +14,8 @@ limitations under the License. */ #include "paddle/phi/common/int_array.h" -#include "paddle/phi/common/place.h" - #include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/common/place.h" namespace paddle { namespace experimental { diff --git a/paddle/phi/common/place.cc b/paddle/phi/common/place.cc index 667d0a32b93..c15a17651b1 100644 --- a/paddle/phi/common/place.cc +++ b/paddle/phi/common/place.cc @@ -19,7 +19,6 @@ limitations under the License. */ #include #include "glog/logging.h" - #include "paddle/phi/api/ext/exception.h" namespace phi { diff --git a/paddle/phi/common/scalar.cc b/paddle/phi/common/scalar.cc index 41f1c954182..2954af086ac 100644 --- a/paddle/phi/common/scalar.cc +++ b/paddle/phi/common/scalar.cc @@ -14,11 +14,10 @@ limitations under the License. */ #include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/place.h" -#include "paddle/phi/core/enforce.h" - #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/enforce.h" namespace paddle { namespace experimental { diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h index 8eb6524e79c..ae3b8924ece 100644 --- a/paddle/phi/core/compat/op_utils.h +++ b/paddle/phi/core/compat/op_utils.h @@ -18,7 +18,6 @@ limitations under the License. 
*/ #include #include "glog/logging.h" - #include "paddle/phi/core/compat/arg_map_context.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/phi/core/ddim.h b/paddle/phi/core/ddim.h index dd13081ddaf..794d7051aee 100644 --- a/paddle/phi/core/ddim.h +++ b/paddle/phi/core/ddim.h @@ -238,10 +238,10 @@ int arity(const DDim& ddim); std::ostream& operator<<(std::ostream&, const DDim&); /** -* \brief Flatten dim to 3d -* e.g., DDim d = mak_ddim({1, 2, 3, 4, 5, 6}) -* flatten_to_3d(d, 2, 4); ===> {1*2, 3*4, 5*6} ===> {2, 12, 30} -*/ + * \brief Flatten dim to 3d + * e.g., DDim d = mak_ddim({1, 2, 3, 4, 5, 6}) + * flatten_to_3d(d, 2, 4); ===> {1*2, 3*4, 5*6} ===> {2, 12, 30} + */ DDim flatten_to_3d(const DDim& src, int num_row_dims, int num_col_dims); // Reshape a tensor to a matrix. The matrix's first dimension(column length) diff --git a/paddle/phi/core/dense_tensor.h b/paddle/phi/core/dense_tensor.h index 06d3e435bc1..09098705b11 100644 --- a/paddle/phi/core/dense_tensor.h +++ b/paddle/phi/core/dense_tensor.h @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/phi/core/tensor_meta.h" /* @jim19930609: Move to MKLDNN_Tensor in the future - */ + */ #ifdef PADDLE_WITH_MKLDNN #include "dnnl.hpp" #endif diff --git a/paddle/phi/core/dense_tensor_impl.cc b/paddle/phi/core/dense_tensor_impl.cc index 8c97b6bf223..a59b910b7e0 100644 --- a/paddle/phi/core/dense_tensor_impl.cc +++ b/paddle/phi/core/dense_tensor_impl.cc @@ -12,15 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/core/dense_tensor.h" - +#include "paddle/fluid/memory/malloc.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/common/float16.h" - #include "paddle/phi/core/compat/convert_utils.h" - -#include "paddle/fluid/memory/malloc.h" +#include "paddle/phi/core/dense_tensor.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_utils.h" diff --git a/paddle/phi/core/device_context.cc b/paddle/phi/core/device_context.cc index 0f5f22b5bd1..ce57f4f627b 100644 --- a/paddle/phi/core/device_context.cc +++ b/paddle/phi/core/device_context.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/core/device_context.h" + #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/selected_rows.h" diff --git a/paddle/phi/core/device_context.h b/paddle/phi/core/device_context.h index d7c2c777ca6..45e4fbf64dc 100644 --- a/paddle/phi/core/device_context.h +++ b/paddle/phi/core/device_context.h @@ -75,17 +75,17 @@ class PADDLE_API DeviceContext { void SetHostAllocator(const Allocator*); /** - * @brief Set the zero-size Allocator object. - * - * @param allocator - */ + * @brief Set the zero-size Allocator object. + * + * @param allocator + */ void SetZeroAllocator(const Allocator*); /** - * @brief Set the zero-size Allocator object. - * - * @param allocator - */ + * @brief Set the zero-size Allocator object. + * + * @param allocator + */ void SetPinnedAllocator(const Allocator*); /** @@ -135,10 +135,10 @@ class PADDLE_API DeviceContext { virtual void Wait() const {} /** - * @brief Set the generator for special op. - * - * @param Generator - */ + * @brief Set the generator for special op. + * + * @param Generator + */ void SetGenerator(Generator*); /** * @brief Get the generator object. 
@@ -148,10 +148,10 @@ class PADDLE_API DeviceContext { Generator* GetGenerator() const; /** - * @brief Set the host generator for special op. - * - * @param Generator - */ + * @brief Set the host generator for special op. + * + * @param Generator + */ void SetHostGenerator(Generator*); /** * @brief Get the host generator object. diff --git a/paddle/phi/core/enforce.cc b/paddle/phi/core/enforce.cc index ae6b0135b32..91e0316ff75 100644 --- a/paddle/phi/core/enforce.cc +++ b/paddle/phi/core/enforce.cc @@ -14,13 +14,12 @@ limitations under the License. */ #include "paddle/phi/core/enforce.h" +#include #include #include #include #include -#include - // is not suitable to be placed in the header file, // it will introduce a large number of unnecessary includes, and these type // declarations that depend on boost are also not suitable for the phi header diff --git a/paddle/phi/core/hostdevice.h b/paddle/phi/core/hostdevice.h index 0869df14323..decebbe66a5 100644 --- a/paddle/phi/core/hostdevice.h +++ b/paddle/phi/core/hostdevice.h @@ -20,6 +20,7 @@ #if defined(__xpu__) #include + #include "xpu/kernel/cluster_header.h" #include "xpu/kernel/debug.h" #include "xpu/kernel/math.h" diff --git a/paddle/phi/core/kernel_factory.cc b/paddle/phi/core/kernel_factory.cc index d479147f06b..d864544e10d 100644 --- a/paddle/phi/core/kernel_factory.cc +++ b/paddle/phi/core/kernel_factory.cc @@ -15,7 +15,6 @@ #include "paddle/phi/core/kernel_factory.h" #include "glog/logging.h" - #include "paddle/phi/core/enforce.h" namespace phi { diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index 41e1e2b53a9..65f655d5037 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -22,13 +22,12 @@ #include #include "paddle/phi/core/custom_kernel.h" +#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_factory.h" #include "paddle/phi/core/kernel_utils.h" #include "paddle/phi/core/macros.h" #include "paddle/phi/core/type_defs.h" -#include "paddle/phi/core/enforce.h" - namespace phi { #define BACKEND(arg__) phi::Backend::arg__ @@ -58,16 +57,13 @@ struct KernelArgsParseFunctor { for (auto arg_type : args_type) { if (arg_type == std::type_index(typeid(const CPUContext&)) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - || - arg_type == std::type_index(typeid(const GPUContext&))) { + || arg_type == std::type_index(typeid(const GPUContext&))) { #elif defined(PADDLE_WITH_XPU) - || - arg_type == std::type_index(typeid(const XPUContext&))) { + || arg_type == std::type_index(typeid(const XPUContext&))) { #elif defined(PADDLE_WITH_CUSTOM_DEVICE) - || - arg_type == std::type_index(typeid(const CustomContext&))) { + || arg_type == std::type_index(typeid(const CustomContext&))) { #else - ) { + ) { #endif // do nothing, skip context arg now } else if (arg_type == std::type_index(typeid(const DenseTensor&))) { @@ -420,93 +416,93 @@ struct KernelRegistrar { PD_CONCATENATE(_PD_KERNEL_INSTANTIATION_, N) \ (meta_kernel_fn, backend, context, __VA_ARGS__) -#define _PD_KERNEL_INSTANTIATION_1( \ - meta_kernel_fn, backend, context, cpp_dtype) \ - template decltype( \ - meta_kernel_fn) meta_kernel_fn -#define _PD_KERNEL_INSTANTIATION_2( \ - meta_kernel_fn, backend, context, cpp_dtype, ...) 
\ - template decltype( \ - meta_kernel_fn) meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_1( \ +#define _PD_KERNEL_INSTANTIATION_1( \ + meta_kernel_fn, backend, context, cpp_dtype) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn +#define _PD_KERNEL_INSTANTIATION_2( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_1( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_3( \ - meta_kernel_fn, backend, context, cpp_dtype, ...) \ - template decltype( \ - meta_kernel_fn) meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_2( \ +#define _PD_KERNEL_INSTANTIATION_3( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_2( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_4( \ - meta_kernel_fn, backend, context, cpp_dtype, ...) \ - template decltype( \ - meta_kernel_fn) meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_3( \ +#define _PD_KERNEL_INSTANTIATION_4( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_3( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_5( \ - meta_kernel_fn, backend, context, cpp_dtype, ...) \ - template decltype( \ - meta_kernel_fn) meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_4( \ +#define _PD_KERNEL_INSTANTIATION_5( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_4( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_6( \ - meta_kernel_fn, backend, context, cpp_dtype, ...) \ - template decltype( \ - meta_kernel_fn) meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_5( \ +#define _PD_KERNEL_INSTANTIATION_6( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_5( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_7( \ - meta_kernel_fn, backend, context, cpp_dtype, ...) \ - template decltype( \ - meta_kernel_fn) meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_6( \ +#define _PD_KERNEL_INSTANTIATION_7( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_6( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_8( \ - meta_kernel_fn, backend, context, cpp_dtype, ...) \ - template decltype( \ - meta_kernel_fn) meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_7( \ +#define _PD_KERNEL_INSTANTIATION_8( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_7( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_9( \ - meta_kernel_fn, backend, context, cpp_dtype, ...) \ - template decltype( \ - meta_kernel_fn) meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_8( \ +#define _PD_KERNEL_INSTANTIATION_9( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) 
\ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_8( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_10( \ - meta_kernel_fn, backend, context, cpp_dtype, ...) \ - template decltype( \ - meta_kernel_fn) meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_9( \ +#define _PD_KERNEL_INSTANTIATION_10( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_9( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_11( \ - meta_kernel_fn, backend, context, cpp_dtype, ...) \ - template decltype( \ - meta_kernel_fn) meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_10( \ +#define _PD_KERNEL_INSTANTIATION_11( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_10( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_12( \ - meta_kernel_fn, backend, context, cpp_dtype, ...) \ - template decltype( \ - meta_kernel_fn) meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_11( \ +#define _PD_KERNEL_INSTANTIATION_12( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_11( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_13( \ - meta_kernel_fn, backend, context, cpp_dtype, ...) \ - template decltype( \ - meta_kernel_fn) meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_12( \ +#define _PD_KERNEL_INSTANTIATION_13( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_12( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_14( \ - meta_kernel_fn, backend, context, cpp_dtype, ...) \ - template decltype( \ - meta_kernel_fn) meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_13( \ +#define _PD_KERNEL_INSTANTIATION_14( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_13( \ meta_kernel_fn, backend, context, __VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_15( \ - meta_kernel_fn, backend, context, cpp_dtype, ...) \ - template decltype( \ - meta_kernel_fn) meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_14( \ +#define _PD_KERNEL_INSTANTIATION_15( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) 
\ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PD_EXPAND(_PD_KERNEL_INSTANTIATION_14( \ meta_kernel_fn, backend, context, __VA_ARGS__)) #define PD_KERNEL_REGISTRAR_INIT(reg_type, \ @@ -569,8 +565,8 @@ struct KernelRegistrar { #backend, \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ + ::phi::KernelArgsParseFunctor< \ + decltype(&meta_kernel_fn)>::Parse, \ args_def_fn, \ PHI_KERNEL(meta_kernel_fn), \ PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ @@ -592,8 +588,8 @@ struct KernelRegistrar { #backend, \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ + ::phi::KernelArgsParseFunctor< \ + decltype(&meta_kernel_fn)>::Parse, \ args_def_fn, \ PHI_KERNEL(meta_kernel_fn), \ PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ @@ -623,8 +619,8 @@ struct KernelRegistrar { #backend, \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ + ::phi::KernelArgsParseFunctor< \ + decltype(&meta_kernel_fn)>::Parse, \ args_def_fn, \ PHI_KERNEL(meta_kernel_fn), \ PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ @@ -654,8 +650,8 @@ struct KernelRegistrar { #backend, \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ + ::phi::KernelArgsParseFunctor< \ + decltype(&meta_kernel_fn)>::Parse, \ args_def_fn, \ PHI_KERNEL(meta_kernel_fn), \ PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ @@ -685,8 +681,8 @@ struct KernelRegistrar { #backend, \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ + ::phi::KernelArgsParseFunctor< \ + decltype(&meta_kernel_fn)>::Parse, \ args_def_fn, \ PHI_KERNEL(meta_kernel_fn), \ PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ @@ -716,8 +712,8 @@ struct KernelRegistrar { #backend, \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ + ::phi::KernelArgsParseFunctor< \ + decltype(&meta_kernel_fn)>::Parse, \ args_def_fn, \ PHI_KERNEL(meta_kernel_fn), \ PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ @@ -747,8 +743,8 @@ struct KernelRegistrar { #backend, \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ + ::phi::KernelArgsParseFunctor< \ + decltype(&meta_kernel_fn)>::Parse, \ args_def_fn, \ PHI_KERNEL(meta_kernel_fn), \ PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ @@ -778,8 +774,8 @@ struct KernelRegistrar { #backend, \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ + ::phi::KernelArgsParseFunctor< \ + decltype(&meta_kernel_fn)>::Parse, \ args_def_fn, \ PHI_KERNEL(meta_kernel_fn), \ PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ @@ -809,8 +805,8 @@ struct KernelRegistrar { #backend, \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ + ::phi::KernelArgsParseFunctor< \ + decltype(&meta_kernel_fn)>::Parse, \ args_def_fn, \ PHI_KERNEL(meta_kernel_fn), \ PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ @@ -840,8 +836,8 @@ struct KernelRegistrar { #backend, \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ + ::phi::KernelArgsParseFunctor< \ + decltype(&meta_kernel_fn)>::Parse, \ args_def_fn, \ PHI_KERNEL(meta_kernel_fn), \ PHI_VARIADIC_KERNEL(meta_kernel_fn)); \ @@ 
@@ -871,8 +867,8 @@ struct KernelRegistrar {
           #backend,                                                     \
           DATALAYOUT(layout),                                           \
           ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
-          ::phi::KernelArgsParseFunctor<decltype(                       \
-              &meta_kernel_fn<cpp_dtype, context>)>::Parse,             \
+          ::phi::KernelArgsParseFunctor<                                \
+              decltype(&meta_kernel_fn<cpp_dtype, context>)>::Parse,    \
           args_def_fn,                                                  \
           PHI_KERNEL(meta_kernel_fn<cpp_dtype, context>),               \
           PHI_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));     \
@@ -902,8 +898,8 @@ struct KernelRegistrar {
           #backend,                                                     \
           DATALAYOUT(layout),                                           \
           ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
-          ::phi::KernelArgsParseFunctor<decltype(                       \
-              &meta_kernel_fn<cpp_dtype, context>)>::Parse,             \
+          ::phi::KernelArgsParseFunctor<                                \
+              decltype(&meta_kernel_fn<cpp_dtype, context>)>::Parse,    \
           args_def_fn,                                                  \
           PHI_KERNEL(meta_kernel_fn<cpp_dtype, context>),               \
           PHI_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));     \
@@ -933,8 +929,8 @@ struct KernelRegistrar {
           #backend,                                                     \
           DATALAYOUT(layout),                                           \
           ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
-          ::phi::KernelArgsParseFunctor<decltype(                       \
-              &meta_kernel_fn<cpp_dtype, context>)>::Parse,             \
+          ::phi::KernelArgsParseFunctor<                                \
+              decltype(&meta_kernel_fn<cpp_dtype, context>)>::Parse,    \
           args_def_fn,                                                  \
           PHI_KERNEL(meta_kernel_fn<cpp_dtype, context>),               \
           PHI_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));     \
@@ -964,8 +960,8 @@ struct KernelRegistrar {
           #backend,                                                     \
           DATALAYOUT(layout),                                           \
           ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
-          ::phi::KernelArgsParseFunctor<decltype(                       \
-              &meta_kernel_fn<cpp_dtype, context>)>::Parse,             \
+          ::phi::KernelArgsParseFunctor<                                \
+              decltype(&meta_kernel_fn<cpp_dtype, context>)>::Parse,    \
           args_def_fn,                                                  \
           PHI_KERNEL(meta_kernel_fn<cpp_dtype, context>),               \
           PHI_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));     \
@@ -995,8 +991,8 @@ struct KernelRegistrar {
           #backend,                                                     \
           DATALAYOUT(layout),                                           \
           ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
-          ::phi::KernelArgsParseFunctor<decltype(                       \
-              &meta_kernel_fn<cpp_dtype, context>)>::Parse,             \
+          ::phi::KernelArgsParseFunctor<                                \
+              decltype(&meta_kernel_fn<cpp_dtype, context>)>::Parse,    \
           args_def_fn,                                                  \
           PHI_KERNEL(meta_kernel_fn<cpp_dtype, context>),               \
           PHI_VARIADIC_KERNEL(meta_kernel_fn<cpp_dtype, context>));     \
diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h
index d4765d1c4c3..3b5fd0247a4 100644
--- a/paddle/phi/core/kernel_utils.h
+++ b/paddle/phi/core/kernel_utils.h
@@ -233,9 +233,8 @@ template <typename Return,
           Return (*kernel_fn)(DevCtx, Args...)>
 struct KernelImpl<Return (*)(DevCtx, Args...), kernel_fn> {
   static void Compute(KernelContext* ctx) {
-    KernelCallHelper<DevCtx,
-                     Args...,
-                     TypeTag<int>>::template Compute<0, 0, 0, 0>(ctx);
+    KernelCallHelper<DevCtx, Args..., TypeTag<int>>::
+        template Compute<0, 0, 0, 0>(ctx);
   }
 
   static void VariadicCompute(const DeviceContext& dev_ctx, Args... args) {
diff --git a/paddle/phi/core/meta_tensor.h b/paddle/phi/core/meta_tensor.h
index d277f32d8ea..27175916186 100644
--- a/paddle/phi/core/meta_tensor.h
+++ b/paddle/phi/core/meta_tensor.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include "glog/logging.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/common/layout.h"
 #include "paddle/phi/core/ddim.h"
@@ -21,8 +22,6 @@ limitations under the License. */
 #include "paddle/phi/core/tensor_base.h"
 #include "paddle/phi/core/tensor_meta.h"
 
-#include "glog/logging.h"
-
 namespace phi {
 
 // TODO(chenweihang): add other flags if needed
diff --git a/paddle/phi/core/string_tensor.cc b/paddle/phi/core/string_tensor.cc
index 0a4e0d61915..20cbf3dffcb 100644
--- a/paddle/phi/core/string_tensor.cc
+++ b/paddle/phi/core/string_tensor.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/core/string_tensor.h"
+
 #include "paddle/fluid/memory/malloc.h"
 
 namespace phi {
diff --git a/paddle/phi/core/tensor_base.cc b/paddle/phi/core/tensor_base.cc
index 1b3628906af..718bf09ff7e 100644
--- a/paddle/phi/core/tensor_base.cc
+++ b/paddle/phi/core/tensor_base.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
*/ #include "paddle/phi/core/tensor_base.h" + #include "paddle/phi/core/utils/type_registry.h" namespace phi {} diff --git a/paddle/phi/core/utils/intrusive_ptr.h b/paddle/phi/core/utils/intrusive_ptr.h index 2b758019253..e2e6cb7060d 100644 --- a/paddle/phi/core/utils/intrusive_ptr.h +++ b/paddle/phi/core/utils/intrusive_ptr.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "glog/logging.h" #include "paddle/phi/core/enforce.h" diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index a8d5ad564fe..f10fc54795d 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include + #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/ddim.h" diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 63f0d0c1eeb..61c57981f94 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/infermeta/multiary.h" + #include + #include "paddle/phi/common/layout.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index 3c2888cee58..d84cc9e6d75 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/infermeta/ternary.h" + #include "paddle/phi/core/ddim.h" #include "paddle/phi/kernels/funcs/common_shape.h" diff --git a/paddle/phi/kernels/assign_kernel.cc b/paddle/phi/kernels/assign_kernel.cc index 2349bf990ac..3d8e4db08bb 100644 --- a/paddle/phi/kernels/assign_kernel.cc +++ b/paddle/phi/kernels/assign_kernel.cc @@ -14,12 +14,11 @@ #include "paddle/phi/kernels/assign_kernel.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/utils/optional.h" -#include "paddle/fluid/framework/tensor_util.h" - namespace phi { template diff --git a/paddle/phi/kernels/auc_kernel.h b/paddle/phi/kernels/auc_kernel.h index acbd17c7801..f58c3ce112b 100644 --- a/paddle/phi/kernels/auc_kernel.h +++ b/paddle/phi/kernels/auc_kernel.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include + #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" diff --git a/paddle/phi/kernels/autotune/auto_tune_base.h b/paddle/phi/kernels/autotune/auto_tune_base.h index eaf325dad75..e18b854cf34 100644 --- a/paddle/phi/kernels/autotune/auto_tune_base.h +++ b/paddle/phi/kernels/autotune/auto_tune_base.h @@ -15,6 +15,7 @@ #pragma once #include + #include "glog/logging.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/autotune/gpu_timer.h" diff --git a/paddle/phi/kernels/autotune/auto_tune_test.cu b/paddle/phi/kernels/autotune/auto_tune_test.cu index f477cd12193..c3918b8ebe5 100644 --- a/paddle/phi/kernels/autotune/auto_tune_test.cu +++ b/paddle/phi/kernels/autotune/auto_tune_test.cu @@ -13,6 +13,7 @@ // limitations under the License. 
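The header churn in this stretch follows one rule set: the file's own header stays first with a blank line after it, and every remaining include is merged into sorted groups, which is why stray trailing includes such as "glog/logging.h" and "paddle/fluid/framework/tensor_util.h" move up into alphabetical position. A sketch of the layout the tool settles on, for a hypothetical foo_kernel.cc rather than any file in this patch:

    // Hypothetical foo_kernel.cc, illustrating the include order only.
    #include "paddle/phi/kernels/foo_kernel.h"  // matching header, always first

    #include <algorithm>  // C and C++ standard headers form their own group

    #include "glog/logging.h"                     // third-party and project
    #include "paddle/phi/core/kernel_registry.h"  // headers, alphabetized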
#include + #include "glog/logging.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/backends/all_context.h" @@ -66,8 +67,8 @@ float Algo(const phi::GPUContext& ctx, N); #else VLOG(3) << "Vecsize is " << Vecsize; - VecSumTest<<>>( - d_in_data, d_out_data, N); + VecSumTest + <<>>(d_in_data, d_out_data, N); #endif return Vecsize; } diff --git a/paddle/phi/kernels/autotune/cache.cc b/paddle/phi/kernels/autotune/cache.cc index ef2cbe633d4..5e2c9e1c742 100644 --- a/paddle/phi/kernels/autotune/cache.cc +++ b/paddle/phi/kernels/autotune/cache.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/phi/kernels/autotune/cache.h" + #include + #include "glog/logging.h" namespace phi { diff --git a/paddle/phi/kernels/autotune/cache.h b/paddle/phi/kernels/autotune/cache.h index 37c5d134e8a..9d7f57e96e3 100644 --- a/paddle/phi/kernels/autotune/cache.h +++ b/paddle/phi/kernels/autotune/cache.h @@ -19,6 +19,7 @@ #include #include #include + #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/errors.h" diff --git a/paddle/phi/kernels/autotune/cache_test.cc b/paddle/phi/kernels/autotune/cache_test.cc index f99f8bfc8b8..53574c3d0c9 100644 --- a/paddle/phi/kernels/autotune/cache_test.cc +++ b/paddle/phi/kernels/autotune/cache_test.cc @@ -13,9 +13,12 @@ // limitations under the License. #include "paddle/phi/kernels/autotune/cache.h" + #include + #include #include + #include "glog/logging.h" enum ConvAlgos { GEMMKernel = 0, CuDNNKernel_1 = 1, CuDNNKernel_2 = 2 }; diff --git a/paddle/phi/kernels/autotune/gpu_timer_test.cu b/paddle/phi/kernels/autotune/gpu_timer_test.cu index b6eb345885f..d24508dfa20 100644 --- a/paddle/phi/kernels/autotune/gpu_timer_test.cu +++ b/paddle/phi/kernels/autotune/gpu_timer_test.cu @@ -13,7 +13,9 @@ // limitations under the License. 
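In the auto_tune_test.cu hunk above, the break now falls before the triple-chevron launch configuration instead of inside the argument list (the template arguments and launch bounds themselves were lost when this excerpt was extracted). A sketch of the two wrappings, with a made-up kernel name and launch bounds:

    __global__ void VecSumSketch(const float* in, float* out, int n) {
      // body irrelevant here; only the launch formatting matters
    }

    void LaunchSketch(const float* d_in, float* d_out, int n,
                      cudaStream_t stream) {
      // Old wrapping broke inside the argument list:
      //   VecSumSketch<<<grids, blocks, 0, stream>>>(
      //       d_in, d_out, n);
      // New wrapping keeps the call together and breaks before the config:
      VecSumSketch
          <<<(n + 255) / 256, 256, 0, stream>>>(d_in, d_out, n);
    }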
#include + #include + #include "glog/logging.h" #include "paddle/phi/kernels/autotune/gpu_timer.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" diff --git a/paddle/phi/kernels/autotune/switch_autotune.h b/paddle/phi/kernels/autotune/switch_autotune.h index 1793940542d..de638ac4eda 100644 --- a/paddle/phi/kernels/autotune/switch_autotune.h +++ b/paddle/phi/kernels/autotune/switch_autotune.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/phi/kernels/autotune/cache.h" namespace phi { diff --git a/paddle/phi/kernels/batch_norm_grad_kernel.h b/paddle/phi/kernels/batch_norm_grad_kernel.h index 3de2f69f452..afbb0c78ca9 100644 --- a/paddle/phi/kernels/batch_norm_grad_kernel.h +++ b/paddle/phi/kernels/batch_norm_grad_kernel.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/phi/core/dense_tensor.h" namespace phi { diff --git a/paddle/phi/kernels/broadcast_tensors_grad_kernel.h b/paddle/phi/kernels/broadcast_tensors_grad_kernel.h index 5d24f6684a4..79d5b8a445b 100644 --- a/paddle/phi/kernels/broadcast_tensors_grad_kernel.h +++ b/paddle/phi/kernels/broadcast_tensors_grad_kernel.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/phi/core/dense_tensor.h" namespace phi { diff --git a/paddle/phi/kernels/broadcast_tensors_kernel.h b/paddle/phi/kernels/broadcast_tensors_kernel.h index 22b5201b690..dccaebcf41f 100644 --- a/paddle/phi/kernels/broadcast_tensors_kernel.h +++ b/paddle/phi/kernels/broadcast_tensors_kernel.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/phi/core/dense_tensor.h" namespace phi { diff --git a/paddle/phi/kernels/channel_shuffle_grad_kernel.h b/paddle/phi/kernels/channel_shuffle_grad_kernel.h index ac89f3336bc..d75d887d0fc 100644 --- a/paddle/phi/kernels/channel_shuffle_grad_kernel.h +++ b/paddle/phi/kernels/channel_shuffle_grad_kernel.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/phi/core/dense_tensor.h" namespace phi { diff --git a/paddle/phi/kernels/channel_shuffle_kernel.h b/paddle/phi/kernels/channel_shuffle_kernel.h index 12de25606dd..c15e06fb552 100644 --- a/paddle/phi/kernels/channel_shuffle_kernel.h +++ b/paddle/phi/kernels/channel_shuffle_kernel.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/phi/core/dense_tensor.h" namespace phi { diff --git a/paddle/phi/kernels/conv_kernel.cc b/paddle/phi/kernels/conv_kernel.cc index 7268384f401..542a4ec8a61 100644 --- a/paddle/phi/kernels/conv_kernel.cc +++ b/paddle/phi/kernels/conv_kernel.cc @@ -14,9 +14,8 @@ #include "paddle/phi/kernels/conv_kernel.h" -#include "paddle/phi/core/kernel_registry.h" - #include "paddle/fluid/platform/cudnn_workspace_helper.h" +#include "paddle/phi/core/kernel_registry.h" namespace phi { @@ -41,8 +40,8 @@ void ConvInferKernel(const Context& dev_ctx, dilations, data_format, /*use_addto=*/false, - /*workspace_size_MB=*/paddle::platform:: - GetDefaultConvWorkspaceSizeLimitMB(), + /*workspace_size_MB=*/ + paddle::platform::GetDefaultConvWorkspaceSizeLimitMB(), /*exhaustive_search=*/false, out); } diff --git a/paddle/phi/kernels/conv_transpose_grad_kernel.h b/paddle/phi/kernels/conv_transpose_grad_kernel.h index 2b1c0c1a934..00d5fb51f01 100644 --- a/paddle/phi/kernels/conv_transpose_grad_kernel.h +++ b/paddle/phi/kernels/conv_transpose_grad_kernel.h @@ -16,6 +16,7 @@ #include #include + #include "paddle/phi/core/dense_tensor.h" namespace phi { diff --git a/paddle/phi/kernels/conv_transpose_kernel.h b/paddle/phi/kernels/conv_transpose_kernel.h index de56f13ddf7..e39617e0e7c 100644 --- a/paddle/phi/kernels/conv_transpose_kernel.h +++ 
b/paddle/phi/kernels/conv_transpose_kernel.h @@ -16,6 +16,7 @@ #include #include + #include "paddle/phi/core/dense_tensor.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/abs_kernel.cc b/paddle/phi/kernels/cpu/abs_kernel.cc index 9f89fc27a71..a10e0eed64a 100644 --- a/paddle/phi/kernels/cpu/abs_kernel.cc +++ b/paddle/phi/kernels/cpu/abs_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/abs_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/accuracy_kernel.cc b/paddle/phi/kernels/cpu/accuracy_kernel.cc index 6ff8a1f7558..17246de35db 100644 --- a/paddle/phi/kernels/cpu/accuracy_kernel.cc +++ b/paddle/phi/kernels/cpu/accuracy_kernel.cc @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/accuracy_kernel.h" #include + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/activation_kernel.cc b/paddle/phi/kernels/cpu/activation_kernel.cc index 165627839a3..bd3e16d54dc 100644 --- a/paddle/phi/kernels/cpu/activation_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_kernel.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/kernels/activation_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/activation_functor.h" diff --git a/paddle/phi/kernels/cpu/adagrad_kernel.cc b/paddle/phi/kernels/cpu/adagrad_kernel.cc index fcd89caf7fa..d6867deff4c 100644 --- a/paddle/phi/kernels/cpu/adagrad_kernel.cc +++ b/paddle/phi/kernels/cpu/adagrad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/adagrad_kernel.h" + #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/allclose_kernel.cc b/paddle/phi/kernels/cpu/allclose_kernel.cc index f95ddc5621e..c6a512aa95c 100644 --- a/paddle/phi/kernels/cpu/allclose_kernel.cc +++ b/paddle/phi/kernels/cpu/allclose_kernel.cc @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/allclose_kernel.h" #include + #include "glog/logging.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/arange_kernel.cc b/paddle/phi/kernels/cpu/arange_kernel.cc index 478251b0d3b..7f7e5554231 100644 --- a/paddle/phi/kernels/cpu/arange_kernel.cc +++ b/paddle/phi/kernels/cpu/arange_kernel.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/kernels/arange_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/range_function.h" diff --git a/paddle/phi/kernels/cpu/atan2_grad_kernel.cc b/paddle/phi/kernels/cpu/atan2_grad_kernel.cc index 7a519aab0ad..3bc8c853a7b 100644 --- a/paddle/phi/kernels/cpu/atan2_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/atan2_grad_kernel.cc @@ -12,10 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h" - #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h" PD_REGISTER_KERNEL(atan2_grad, CPU, diff --git a/paddle/phi/kernels/cpu/atan2_kernel.cc b/paddle/phi/kernels/cpu/atan2_kernel.cc index df6f5f59ac0..4cb96ad8b6c 100644 --- a/paddle/phi/kernels/cpu/atan2_kernel.cc +++ b/paddle/phi/kernels/cpu/atan2_kernel.cc @@ -12,10 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/impl/atan2_kernel_impl.h" - #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/atan2_kernel_impl.h" PD_REGISTER_KERNEL(atan2, CPU, diff --git a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc index 366a08e59fe..beda276c8ef 100644 --- a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc @@ -12,12 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/batch_norm_kernel.h" #include "paddle/phi/kernels/funcs/eigen/common.h" - -#include "paddle/fluid/framework/tensor_util.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/gpu/batch_norm_utils.h" diff --git a/paddle/phi/kernels/cpu/batch_norm_kernel.cc b/paddle/phi/kernels/cpu/batch_norm_kernel.cc index 743128e8dea..cb8af06b540 100644 --- a/paddle/phi/kernels/cpu/batch_norm_kernel.cc +++ b/paddle/phi/kernels/cpu/batch_norm_kernel.cc @@ -13,12 +13,12 @@ // limitations under the License. #include "paddle/phi/kernels/batch_norm_kernel.h" + +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/eigen/common.h" -#include "paddle/fluid/framework/tensor_util.h" - namespace phi { template diff --git a/paddle/phi/kernels/cpu/bce_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/bce_loss_grad_kernel.cc index 6859451e8be..fc91af3ff71 100644 --- a/paddle/phi/kernels/cpu/bce_loss_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/bce_loss_grad_kernel.cc @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/bce_loss_grad_kernel.h" #include // for max + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/bce_loss_kernel.cc b/paddle/phi/kernels/cpu/bce_loss_kernel.cc index 76b97936514..9d62fabcbe7 100644 --- a/paddle/phi/kernels/cpu/bce_loss_kernel.cc +++ b/paddle/phi/kernels/cpu/bce_loss_kernel.cc @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/bce_loss_kernel.h" #include // for max + #include "paddle/fluid/operators/math.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/bernoulli_kernel.cc b/paddle/phi/kernels/cpu/bernoulli_kernel.cc index 09c07d9ec9d..6bf548154a4 100644 --- a/paddle/phi/kernels/cpu/bernoulli_kernel.cc +++ b/paddle/phi/kernels/cpu/bernoulli_kernel.cc @@ -13,7 +13,9 @@ // limitations under the License. 
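The atan2 hunks above only move the impl header into the sorted block; the registration underneath each include block is untouched. For orientation, PD_REGISTER_KERNEL takes the op name, backend, layout, kernel symbol, and the dtype list to instantiate, with a trailing brace block (empty for most kernels) for extra argument setup. The atan2 dtype list is cut off in this excerpt, so the sketch below abbreviates it:

    // Shape of a phi CPU kernel registration; dtype list illustrative only,
    // atan2's real list is truncated in the hunk above.
    PD_REGISTER_KERNEL(
        atan2, CPU, ALL_LAYOUT, phi::Atan2Kernel, float, double) {}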
#include "paddle/phi/kernels/bernoulli_kernel.h" + #include + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/bilinear_tensor_product_grad_kernel.cc b/paddle/phi/kernels/cpu/bilinear_tensor_product_grad_kernel.cc index 2268212316a..ef7e8a981c5 100644 --- a/paddle/phi/kernels/cpu/bilinear_tensor_product_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/bilinear_tensor_product_grad_kernel.cc @@ -13,9 +13,9 @@ // limitations under the License. #include "paddle/phi/kernels/bilinear_tensor_product_grad_kernel.h" -#include "paddle/phi/kernels/impl/bilinear_tensor_product_grad_kernel_impl.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/bilinear_tensor_product_grad_kernel_impl.h" PD_REGISTER_KERNEL(bilinear_tensor_product_grad, CPU, diff --git a/paddle/phi/kernels/cpu/bilinear_tensor_product_kernel.cc b/paddle/phi/kernels/cpu/bilinear_tensor_product_kernel.cc index 25bc5913865..d8226564182 100644 --- a/paddle/phi/kernels/cpu/bilinear_tensor_product_kernel.cc +++ b/paddle/phi/kernels/cpu/bilinear_tensor_product_kernel.cc @@ -13,9 +13,9 @@ // limitations under the License. #include "paddle/phi/kernels/bilinear_tensor_product_kernel.h" -#include "paddle/phi/kernels/impl/bilinear_tensor_product_kernel_impl.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/bilinear_tensor_product_kernel_impl.h" PD_REGISTER_KERNEL(bilinear_tensor_product, CPU, diff --git a/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc index 0869cd62024..413638e1772 100644 --- a/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/broadcast_tensors_grad_kernel.h" #include + #include "paddle/fluid/framework/tensor_util.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/dense_tensor.h" diff --git a/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc b/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc index 4cb6db87692..3ad26164d7d 100644 --- a/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc +++ b/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/broadcast_tensors_kernel.h" -#include "paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h" PD_REGISTER_KERNEL(broadcast_tensors, CPU, diff --git a/paddle/phi/kernels/cpu/cast_kernel.cc b/paddle/phi/kernels/cpu/cast_kernel.cc index 2132f0d5ae8..8abfa173fd0 100644 --- a/paddle/phi/kernels/cpu/cast_kernel.cc +++ b/paddle/phi/kernels/cpu/cast_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/cast_kernel.h" -#include "paddle/phi/kernels/cpu/cast_impl.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/visit_type.h" +#include "paddle/phi/kernels/cpu/cast_impl.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/channel_shuffle_grad_kernel.cc b/paddle/phi/kernels/cpu/channel_shuffle_grad_kernel.cc index fcc91b21916..e95b454dbf9 100644 --- a/paddle/phi/kernels/cpu/channel_shuffle_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/channel_shuffle_grad_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. 
#include "paddle/phi/kernels/channel_shuffle_grad_kernel.h" -#include "paddle/phi/kernels/impl/channel_shuffle_grad_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/channel_shuffle_grad_kernel_impl.h" PD_REGISTER_KERNEL(channel_shuffle_grad, CPU, diff --git a/paddle/phi/kernels/cpu/channel_shuffle_kernel.cc b/paddle/phi/kernels/cpu/channel_shuffle_kernel.cc index 95d19ec6a77..0bac82e779c 100644 --- a/paddle/phi/kernels/cpu/channel_shuffle_kernel.cc +++ b/paddle/phi/kernels/cpu/channel_shuffle_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/channel_shuffle_kernel.h" -#include "paddle/phi/kernels/impl/channel_shuffle_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/channel_shuffle_kernel_impl.h" PD_REGISTER_KERNEL(channel_shuffle, CPU, diff --git a/paddle/phi/kernels/cpu/cholesky_solve_grad_kernel.cc b/paddle/phi/kernels/cpu/cholesky_solve_grad_kernel.cc index b6f5dd29ba2..612d10994cb 100644 --- a/paddle/phi/kernels/cpu/cholesky_solve_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/cholesky_solve_grad_kernel.cc @@ -12,10 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h" - #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h" PD_REGISTER_KERNEL(cholesky_solve_grad, CPU, diff --git a/paddle/phi/kernels/cpu/cholesky_solve_kernel.cc b/paddle/phi/kernels/cpu/cholesky_solve_kernel.cc index 02597560a7f..11cb66f88c1 100644 --- a/paddle/phi/kernels/cpu/cholesky_solve_kernel.cc +++ b/paddle/phi/kernels/cpu/cholesky_solve_kernel.cc @@ -12,11 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/impl/cholesky_solve_kernel_impl.h" - #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" +#include "paddle/phi/kernels/impl/cholesky_solve_kernel_impl.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/clip_grad_kernel.cc b/paddle/phi/kernels/cpu/clip_grad_kernel.cc index bccdc0746d5..89a14af10d1 100644 --- a/paddle/phi/kernels/cpu/clip_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/clip_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/clip_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/clip_grad_kernel_impl.h" diff --git a/paddle/phi/kernels/cpu/clip_kernel.cc b/paddle/phi/kernels/cpu/clip_kernel.cc index 5fd9aea966f..bcbb8527927 100644 --- a/paddle/phi/kernels/cpu/clip_kernel.cc +++ b/paddle/phi/kernels/cpu/clip_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/phi/kernels/clip_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/clip_kernel_impl.h" diff --git a/paddle/phi/kernels/cpu/compare_kernel.cc b/paddle/phi/kernels/cpu/compare_kernel.cc index 9006325a521..694b44c16d8 100644 --- a/paddle/phi/kernels/cpu/compare_kernel.cc +++ b/paddle/phi/kernels/cpu/compare_kernel.cc @@ -13,11 +13,11 @@ // limitations under the License. #include "paddle/phi/kernels/compare_kernel.h" -#include "paddle/phi/kernels/impl/compare_kernel_impl.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/impl/compare_kernel_impl.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/complex_grad_kernel.cc b/paddle/phi/kernels/cpu/complex_grad_kernel.cc index 5c1d50f5bf2..11b7a058346 100644 --- a/paddle/phi/kernels/cpu/complex_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/complex_grad_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/complex_grad_kernel.h" -#include "paddle/phi/kernels/impl/complex_grad_kernel_impl.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/complex_grad_kernel_impl.h" PD_REGISTER_KERNEL(real_grad, CPU, diff --git a/paddle/phi/kernels/cpu/complex_kernel.cc b/paddle/phi/kernels/cpu/complex_kernel.cc index 859d5a84527..bef0b7b747a 100644 --- a/paddle/phi/kernels/cpu/complex_kernel.cc +++ b/paddle/phi/kernels/cpu/complex_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/complex_kernel.h" -#include "paddle/phi/kernels/impl/complex_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/complex_kernel_impl.h" // See Note [ Why still include the fluid headers? ] #include "paddle/phi/common/complex.h" diff --git a/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc b/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc index 4538ccf9433..3289c8f5c84 100644 --- a/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/conv_grad_grad_kernel.h" -#include "paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h" namespace phi { template diff --git a/paddle/phi/kernels/cpu/conv_grad_kernel.cc b/paddle/phi/kernels/cpu/conv_grad_kernel.cc index 2d8a9bf1de7..880837dd7cd 100644 --- a/paddle/phi/kernels/cpu/conv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/conv_grad_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/conv_grad_kernel.h" -#include "paddle/phi/kernels/impl/conv_grad_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/conv_grad_kernel_impl.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/conv_kernel.cc b/paddle/phi/kernels/cpu/conv_kernel.cc index e0b4ee7d577..ec325319493 100644 --- a/paddle/phi/kernels/cpu/conv_kernel.cc +++ b/paddle/phi/kernels/cpu/conv_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. 
#include "paddle/phi/kernels/conv_kernel.h" -#include "paddle/phi/kernels/impl/conv_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/conv_kernel_impl.h" namespace phi { template diff --git a/paddle/phi/kernels/cpu/conv_transpose_grad_kernel.cc b/paddle/phi/kernels/cpu/conv_transpose_grad_kernel.cc index 8d074950069..17fe44dea3f 100644 --- a/paddle/phi/kernels/cpu/conv_transpose_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/conv_transpose_grad_kernel.cc @@ -13,9 +13,9 @@ // limitations under the License. #include "paddle/phi/kernels/conv_transpose_grad_kernel.h" -#include "paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/conv_transpose_kernel.cc b/paddle/phi/kernels/cpu/conv_transpose_kernel.cc index b4cacc85093..ad9a5933f28 100644 --- a/paddle/phi/kernels/cpu/conv_transpose_kernel.cc +++ b/paddle/phi/kernels/cpu/conv_transpose_kernel.cc @@ -13,9 +13,9 @@ // limitations under the License. #include "paddle/phi/kernels/conv_transpose_kernel.h" -#include "paddle/phi/kernels/impl/conv_transpose_kernel_impl.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/conv_transpose_kernel_impl.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/cross_entropy_kernel.cc b/paddle/phi/kernels/cpu/cross_entropy_kernel.cc index c684fb416ea..bd3eb3eb754 100644 --- a/paddle/phi/kernels/cpu/cross_entropy_kernel.cc +++ b/paddle/phi/kernels/cpu/cross_entropy_kernel.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/phi/kernels/cross_entropy_kernel.h" +#include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/copy_kernel.h" @@ -21,8 +22,6 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/softmax_kernel.h" -#include "paddle/fluid/operators/math/cross_entropy.h" - namespace phi { template diff --git a/paddle/phi/kernels/cpu/cross_grad_kernel.cc b/paddle/phi/kernels/cpu/cross_grad_kernel.cc index 390420008e6..8dddc6f6e4e 100644 --- a/paddle/phi/kernels/cpu/cross_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/cross_grad_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/cross_grad_kernel.h" -#include "paddle/phi/kernels/impl/cross_grad_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/cross_grad_kernel_impl.h" PD_REGISTER_KERNEL(cross_grad, CPU, diff --git a/paddle/phi/kernels/cpu/cross_kernel.cc b/paddle/phi/kernels/cpu/cross_kernel.cc index a63f33174ea..1f3a8fe5a38 100644 --- a/paddle/phi/kernels/cpu/cross_kernel.cc +++ b/paddle/phi/kernels/cpu/cross_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. 
#include "paddle/phi/kernels/cross_kernel.h" -#include "paddle/phi/kernels/impl/cross_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/cross_kernel_impl.h" PD_REGISTER_KERNEL( cross, CPU, ALL_LAYOUT, phi::CrossKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/cpu/cumprod_kernel.cc b/paddle/phi/kernels/cpu/cumprod_kernel.cc index aea338027f5..4ecf0929184 100644 --- a/paddle/phi/kernels/cpu/cumprod_kernel.cc +++ b/paddle/phi/kernels/cpu/cumprod_kernel.cc @@ -16,6 +16,7 @@ #include #include + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/complex_functors.h" diff --git a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc index f64b1d3291f..a4d43ef8fbe 100644 --- a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc @@ -58,10 +58,9 @@ inline void ModulatedDeformableCol2imCPUKernel( int w_in = w_out * stride_w - pad_w; int h_in = h_out * stride_h - pad_h; - const T* data_offset_ptr = data_offset + - (b * deformable_group + deformable_group_index) * - 2 * kernel_h * kernel_w * height_col * - width_col; + const T* data_offset_ptr = + data_offset + (b * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; const int data_offset_w_ptr = @@ -75,9 +74,9 @@ inline void ModulatedDeformableCol2imCPUKernel( T cur_top_grad = data_col[thread]; if (data_mask) { - const T* data_mask_ptr = data_mask + - (b * deformable_group + deformable_group_index) * - kernel_h * kernel_w * height_col * width_col; + const T* data_mask_ptr = + data_mask + (b * deformable_group + deformable_group_index) * + kernel_h * kernel_w * height_col * width_col; const T mask = data_mask_ptr[data_mask_hw_ptr]; cur_top_grad *= mask; } @@ -180,23 +179,20 @@ void ModulatedDeformableCol2imCoordCPUKernel( const int deformable_group_index = c / (2 * kernel_h * kernel_w); const int col_step = kernel_h * kernel_w; int cnt = 0; - const T* data_col_ptr = data_col + - deformable_group_index * - channel_per_deformable_group * batch_size * - width_col * height_col; - const T* data_im_ptr = data_im + - (b * deformable_group + deformable_group_index) * - channel_per_deformable_group / kernel_h / - kernel_w * height * width; - const T* data_offset_ptr = data_offset + - (b * deformable_group + deformable_group_index) * - 2 * kernel_h * kernel_w * height_col * - width_col; + const T* data_col_ptr = data_col + deformable_group_index * + channel_per_deformable_group * + batch_size * width_col * height_col; + const T* data_im_ptr = + data_im + (b * deformable_group + deformable_group_index) * + channel_per_deformable_group / kernel_h / kernel_w * + height * width; + const T* data_offset_ptr = + data_offset + (b * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; const T* data_mask_ptr = data_mask - ? data_mask + - (b * deformable_group + deformable_group_index) * kernel_h * - kernel_w * height_col * width_col + ? 
data_mask + (b * deformable_group + deformable_group_index) * + kernel_h * kernel_w * height_col * width_col : nullptr; const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; diff --git a/paddle/phi/kernels/cpu/diag_grad_kernel.cc b/paddle/phi/kernels/cpu/diag_grad_kernel.cc index c56b225e2a7..616ea753ef1 100644 --- a/paddle/phi/kernels/cpu/diag_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/diag_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/diag_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/diag_functor.h" diff --git a/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc b/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc index c3c290b4fe9..5671e70c96e 100644 --- a/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/diagonal_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/diagonal.h" diff --git a/paddle/phi/kernels/cpu/diagonal_kernel.cc b/paddle/phi/kernels/cpu/diagonal_kernel.cc index df17b458e11..8ea5826ba25 100644 --- a/paddle/phi/kernels/cpu/diagonal_kernel.cc +++ b/paddle/phi/kernels/cpu/diagonal_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/diagonal_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/diagonal.h" diff --git a/paddle/phi/kernels/cpu/digamma_grad_kernel.cc b/paddle/phi/kernels/cpu/digamma_grad_kernel.cc index da1b5ae5566..dc7fcaf6f92 100644 --- a/paddle/phi/kernels/cpu/digamma_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/digamma_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/digamma_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" diff --git a/paddle/phi/kernels/cpu/digamma_kernel.cc b/paddle/phi/kernels/cpu/digamma_kernel.cc index ee120a29b60..80cbda4b7a9 100644 --- a/paddle/phi/kernels/cpu/digamma_kernel.cc +++ b/paddle/phi/kernels/cpu/digamma_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/digamma_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" diff --git a/paddle/phi/kernels/cpu/dist_grad_kernel.cc b/paddle/phi/kernels/cpu/dist_grad_kernel.cc index 2b7f8f98f94..c1aaa2adf75 100644 --- a/paddle/phi/kernels/cpu/dist_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/dist_grad_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/dist_grad_kernel.h" -#include "paddle/phi/kernels/impl/dist_grad_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/dist_grad_kernel_impl.h" PD_REGISTER_KERNEL( dist_grad, CPU, ALL_LAYOUT, phi::DistGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/dist_kernel.cc b/paddle/phi/kernels/cpu/dist_kernel.cc index ccf3d4be832..0c7b5db64b3 100644 --- a/paddle/phi/kernels/cpu/dist_kernel.cc +++ b/paddle/phi/kernels/cpu/dist_kernel.cc @@ -13,9 +13,9 @@ // limitations under the License. 
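The deformable_conv hunk above is the one spot in this file set where the reflow touches non-trivial arithmetic: each pointer is advanced by a row-major flattened index, and the new grouping only re-breaks a chain of multiplications whose evaluation is strictly left to right. A small self-contained check of that offset computation, with made-up sizes:

    #include <cassert>
    #include <vector>

    int main() {
      // Offset into a row-major [batch, group, 2, kh, kw, h_col, w_col]
      // buffer, grouped the same way clang-format re-wrapped it above.
      const int b = 1, deformable_group = 2, group_index = 1;
      const int kernel_h = 3, kernel_w = 3, height_col = 4, width_col = 5;

      std::vector<float> data_offset(2 * deformable_group * 2 * kernel_h *
                                     kernel_w * height_col * width_col);
      const float* data_offset_ptr =
          data_offset.data() + (b * deformable_group + group_index) * 2 *
                                   kernel_h * kernel_w * height_col * width_col;
      // The parenthesized group index is the only grouping that matters; the
      // rest is one left-to-right product, so the re-wrap is behavior-preserving.
      assert(data_offset_ptr - data_offset.data() == 3 * 2 * 3 * 3 * 4 * 5);
      return 0;
    }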
#include "paddle/phi/kernels/dist_kernel.h" -#include "paddle/phi/kernels/impl/dist_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/dist_kernel_impl.h" PD_REGISTER_KERNEL(dist, CPU, ALL_LAYOUT, phi::DistKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/dot_grad_kernel.cc b/paddle/phi/kernels/cpu/dot_grad_kernel.cc index a2abdb7c009..883b7780221 100644 --- a/paddle/phi/kernels/cpu/dot_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/dot_grad_kernel.cc @@ -13,12 +13,11 @@ // limitations under the License. #include "paddle/phi/kernels/dot_grad_kernel.h" -#include "paddle/phi/kernels/impl/dot_grad_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/kernel_registry.h" - #include "paddle/phi/common/complex.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/dot_grad_kernel_impl.h" PD_REGISTER_KERNEL(dot_grad, CPU, diff --git a/paddle/phi/kernels/cpu/dropout_grad_kernel.cc b/paddle/phi/kernels/cpu/dropout_grad_kernel.cc index b77a6c55b14..db956564218 100644 --- a/paddle/phi/kernels/cpu/dropout_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/dropout_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/dropout_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/eigen/common.h" diff --git a/paddle/phi/kernels/cpu/dropout_kernel.cc b/paddle/phi/kernels/cpu/dropout_kernel.cc index fa12e505e42..d9c02eff010 100644 --- a/paddle/phi/kernels/cpu/dropout_kernel.cc +++ b/paddle/phi/kernels/cpu/dropout_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/dropout_kernel.h" + #include "paddle/fluid/framework/generator.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/eigh_grad_kernel.cc b/paddle/phi/kernels/cpu/eigh_grad_kernel.cc index 5135778db56..db533416d27 100644 --- a/paddle/phi/kernels/cpu/eigh_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/eigh_grad_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/eigh_grad_kernel.h" -#include "paddle/phi/kernels/impl/eigh_grad_kernel_impl.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/eigh_grad_kernel_impl.h" PD_REGISTER_KERNEL(eigh_grad, CPU, diff --git a/paddle/phi/kernels/cpu/eigh_kernel.cc b/paddle/phi/kernels/cpu/eigh_kernel.cc index 92fd20ca9b8..0f0a10c8377 100644 --- a/paddle/phi/kernels/cpu/eigh_kernel.cc +++ b/paddle/phi/kernels/cpu/eigh_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/eigh_kernel.h" -#include "paddle/phi/kernels/funcs/values_vectors_functor.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/values_vectors_functor.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/einsum_kernel.cc b/paddle/phi/kernels/cpu/einsum_kernel.cc index 8968542b3e0..401d2fd158a 100644 --- a/paddle/phi/kernels/cpu/einsum_kernel.cc +++ b/paddle/phi/kernels/cpu/einsum_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/phi/kernels/einsum_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/einsum_impl.h" diff --git a/paddle/phi/kernels/cpu/elementwise.h b/paddle/phi/kernels/cpu/elementwise.h index 0f67df66113..255dae7da01 100644 --- a/paddle/phi/kernels/cpu/elementwise.h +++ b/paddle/phi/kernels/cpu/elementwise.h @@ -16,10 +16,9 @@ limitations under the License. */ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/common_shape.h" - -#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/elementwise_add_kernel.cc b/paddle/phi/kernels/cpu/elementwise_add_kernel.cc index 5019b9f5706..b5e28ab39e5 100644 --- a/paddle/phi/kernels/cpu/elementwise_add_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_add_kernel.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/elementwise_divide_kernel.cc b/paddle/phi/kernels/cpu/elementwise_divide_kernel.cc index d380621818b..15fe92c9291 100644 --- a/paddle/phi/kernels/cpu/elementwise_divide_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_divide_kernel.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/elementwise_kernel.cc b/paddle/phi/kernels/cpu/elementwise_kernel.cc index 286b0d0ffaa..f090ddd5bbe 100644 --- a/paddle/phi/kernels/cpu/elementwise_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_kernel.cc @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/elementwise_multiply_kernel.cc b/paddle/phi/kernels/cpu/elementwise_multiply_kernel.cc index 2424a533010..34915037384 100644 --- a/paddle/phi/kernels/cpu/elementwise_multiply_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_multiply_kernel.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/elementwise_subtract_kernel.cc b/paddle/phi/kernels/cpu/elementwise_subtract_kernel.cc index 0e97852ac33..a013309233d 100644 --- a/paddle/phi/kernels/cpu/elementwise_subtract_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_subtract_kernel.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/embedding_grad_kernel.cc b/paddle/phi/kernels/cpu/embedding_grad_kernel.cc index 21b3e6da8d9..fabb4e83d52 100644 --- a/paddle/phi/kernels/cpu/embedding_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/embedding_grad_kernel.cc @@ -13,11 +13,11 @@ // limitations under the License. #include "paddle/phi/kernels/embedding_grad_kernel.h" -#include "paddle/phi/kernels/funcs/embedding_util.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/embedding_util.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/embedding_kernel.cc b/paddle/phi/kernels/cpu/embedding_kernel.cc index 76cc3814b05..0430f7a0052 100644 --- a/paddle/phi/kernels/cpu/embedding_kernel.cc +++ b/paddle/phi/kernels/cpu/embedding_kernel.cc @@ -13,12 +13,12 @@ // limitations under the License. #include "paddle/phi/kernels/embedding_kernel.h" -#include "paddle/phi/kernels/funcs/embedding_util.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/funcs/embedding_util.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/erf_grad_kernel.cc b/paddle/phi/kernels/cpu/erf_grad_kernel.cc index 3c1cd0df153..ae0b218bc0b 100644 --- a/paddle/phi/kernels/cpu/erf_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/erf_grad_kernel.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/kernels/erf_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/erf_kernel.cc b/paddle/phi/kernels/cpu/erf_kernel.cc index 05ce4cab7fc..ace9775c0b8 100644 --- a/paddle/phi/kernels/cpu/erf_kernel.cc +++ b/paddle/phi/kernels/cpu/erf_kernel.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/phi/kernels/erf_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/erfinv_grad_kernel.cc b/paddle/phi/kernels/cpu/erfinv_grad_kernel.cc index b1fe4f026ab..2d363189936 100644 --- a/paddle/phi/kernels/cpu/erfinv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/erfinv_grad_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/erfinv_grad_kernel.h" -#include "paddle/phi/kernels/impl/erfinv_grad_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/erfinv_grad_kernel_impl.h" PD_REGISTER_KERNEL( erfinv_grad, CPU, ALL_LAYOUT, phi::ErfinvGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/erfinv_kernel.cc b/paddle/phi/kernels/cpu/erfinv_kernel.cc index 4f3a740f9d9..f298cc358d6 100644 --- a/paddle/phi/kernels/cpu/erfinv_kernel.cc +++ b/paddle/phi/kernels/cpu/erfinv_kernel.cc @@ -13,9 +13,9 @@ // limitations under the License. #include "paddle/phi/kernels/erfinv_kernel.h" -#include "paddle/phi/kernels/impl/erfinv_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/erfinv_kernel_impl.h" PD_REGISTER_KERNEL(erfinv, CPU, ALL_LAYOUT, phi::ErfinvKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/expand_as_grad_kernel.cc b/paddle/phi/kernels/cpu/expand_as_grad_kernel.cc index 6eafe9aa49d..c57e3a87281 100644 --- a/paddle/phi/kernels/cpu/expand_as_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/expand_as_grad_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/expand_as_grad_kernel.h" -#include "paddle/phi/kernels/impl/expand_as_grad_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/expand_as_grad_kernel_impl.h" PD_REGISTER_KERNEL(expand_as_grad, CPU, diff --git a/paddle/phi/kernels/cpu/expand_as_kernel.cc b/paddle/phi/kernels/cpu/expand_as_kernel.cc index 697ea138097..4ec28ef8413 100644 --- a/paddle/phi/kernels/cpu/expand_as_kernel.cc +++ b/paddle/phi/kernels/cpu/expand_as_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/expand_as_kernel.h" -#include "paddle/phi/kernels/impl/expand_as_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/expand_as_kernel_impl.h" PD_REGISTER_KERNEL(expand_as, CPU, diff --git a/paddle/phi/kernels/cpu/expand_grad_kernel.cc b/paddle/phi/kernels/cpu/expand_grad_kernel.cc index 4799a6aa7af..5cbbf253b74 100644 --- a/paddle/phi/kernels/cpu/expand_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/expand_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/expand_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" diff --git a/paddle/phi/kernels/cpu/expand_kernel.cc b/paddle/phi/kernels/cpu/expand_kernel.cc index 07704897672..2df833d0f9c 100644 --- a/paddle/phi/kernels/cpu/expand_kernel.cc +++ b/paddle/phi/kernels/cpu/expand_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/phi/kernels/expand_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" diff --git a/paddle/phi/kernels/cpu/eye_kernel.cc b/paddle/phi/kernels/cpu/eye_kernel.cc index a0d0f2c4390..ef3489d3fae 100644 --- a/paddle/phi/kernels/cpu/eye_kernel.cc +++ b/paddle/phi/kernels/cpu/eye_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/eye_kernel.h" -#include "paddle/phi/kernels/impl/eye_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/eye_kernel_impl.h" PD_REGISTER_KERNEL(eye, CPU, diff --git a/paddle/phi/kernels/cpu/frobenius_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/frobenius_norm_grad_kernel.cc index 338be9e252d..5434296be4d 100644 --- a/paddle/phi/kernels/cpu/frobenius_norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/frobenius_norm_grad_kernel.cc @@ -13,9 +13,9 @@ // limitations under the License. #include "paddle/phi/kernels/frobenius_norm_grad_kernel.h" -#include "paddle/phi/kernels/impl/frobenius_norm_grad_kernel_impl.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/frobenius_norm_grad_kernel_impl.h" PD_REGISTER_KERNEL(frobenius_norm_grad, CPU, diff --git a/paddle/phi/kernels/cpu/frobenius_norm_kernel.cc b/paddle/phi/kernels/cpu/frobenius_norm_kernel.cc index 77509b953bf..56444ddad8d 100644 --- a/paddle/phi/kernels/cpu/frobenius_norm_kernel.cc +++ b/paddle/phi/kernels/cpu/frobenius_norm_kernel.cc @@ -13,9 +13,9 @@ // limitations under the License. #include "paddle/phi/kernels/frobenius_norm_kernel.h" -#include "paddle/phi/kernels/impl/frobenius_norm_kernel_impl.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/frobenius_norm_kernel_impl.h" PD_REGISTER_KERNEL( frobenius_norm, CPU, ALL_LAYOUT, phi::FrobeniusNormKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/full_kernel.cc b/paddle/phi/kernels/cpu/full_kernel.cc index 0b76425a659..ceb2312b53a 100644 --- a/paddle/phi/kernels/cpu/full_kernel.cc +++ b/paddle/phi/kernels/cpu/full_kernel.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" - #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" diff --git a/paddle/phi/kernels/cpu/gather_nd_grad_kernel.cc b/paddle/phi/kernels/cpu/gather_nd_grad_kernel.cc index b375a7ec469..88a288afd31 100644 --- a/paddle/phi/kernels/cpu/gather_nd_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/gather_nd_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/gather_nd_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" diff --git a/paddle/phi/kernels/cpu/gather_nd_kernel.cc b/paddle/phi/kernels/cpu/gather_nd_kernel.cc index aa32d036934..8ae866a1c8a 100644 --- a/paddle/phi/kernels/cpu/gather_nd_kernel.cc +++ b/paddle/phi/kernels/cpu/gather_nd_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/phi/kernels/gather_nd_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/gather.h" diff --git a/paddle/phi/kernels/cpu/gather_tree_kernel.cc b/paddle/phi/kernels/cpu/gather_tree_kernel.cc index 25fb870d851..6f3cac6c4aa 100644 --- a/paddle/phi/kernels/cpu/gather_tree_kernel.cc +++ b/paddle/phi/kernels/cpu/gather_tree_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/gather_tree_kernel.h" + #include "paddle/phi/core/kernel_registry.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/gaussian_random_kernel.cc b/paddle/phi/kernels/cpu/gaussian_random_kernel.cc index 348d24b534e..c600149cbba 100644 --- a/paddle/phi/kernels/cpu/gaussian_random_kernel.cc +++ b/paddle/phi/kernels/cpu/gaussian_random_kernel.cc @@ -14,11 +14,10 @@ #include "paddle/phi/kernels/gaussian_random_kernel.h" +#include "paddle/fluid/framework/generator.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/fluid/framework/generator.h" - namespace phi { template diff --git a/paddle/phi/kernels/cpu/gelu_kernel.cc b/paddle/phi/kernels/cpu/gelu_kernel.cc index d7af2205745..4d23470aa4e 100644 --- a/paddle/phi/kernels/cpu/gelu_kernel.cc +++ b/paddle/phi/kernels/cpu/gelu_kernel.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/gelu_kernel.h" + #include #include + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/phi/kernels/cpu/graph_reindex_kernel.cc b/paddle/phi/kernels/cpu/graph_reindex_kernel.cc index 92f2dc41e65..428bcb03170 100644 --- a/paddle/phi/kernels/cpu/graph_reindex_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_reindex_kernel.cc @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/phi/kernels/graph_reindex_kernel.h" + #include #include -#include "paddle/phi/kernels/graph_reindex_kernel.h" - #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/graph_sample_neighbors_kernel.cc b/paddle/phi/kernels/cpu/graph_sample_neighbors_kernel.cc index 70aac053417..1ef5373d631 100644 --- a/paddle/phi/kernels/cpu/graph_sample_neighbors_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_sample_neighbors_kernel.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include - #include "paddle/phi/kernels/graph_sample_neighbors_kernel.h" +#include + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc b/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc index 6ea65d005c1..ad04bd258e1 100644 --- a/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc @@ -13,12 +13,12 @@ // limitations under the License. 
#include "paddle/phi/kernels/graph_send_recv_grad_kernel.h" -#include "paddle/phi/kernels/cpu/graph_send_recv_funcs.h" #include #include #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/graph_send_recv_funcs.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc b/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc index 8f71ba12cc4..e4034230c78 100644 --- a/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "paddle/phi/kernels/graph_send_recv_kernel.h" -#include "paddle/phi/kernels/cpu/graph_send_recv_funcs.h" #include #include @@ -22,6 +21,7 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/graph_send_recv_funcs.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/grid_sample_grad_kernel.cc b/paddle/phi/kernels/cpu/grid_sample_grad_kernel.cc index 923cb842411..32fa0d5aafe 100644 --- a/paddle/phi/kernels/cpu/grid_sample_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/grid_sample_grad_kernel.cc @@ -73,8 +73,9 @@ static inline void ClipWithMask(const CPUContext& ctx, .cwiseMin(static_cast(max_val)); auto in_bound = (clipped == reflected).template cast(); grid_scale_t.device(place) = - grid_scale_t * ((is_neg == one_more_flip).template cast() - - (is_neg != one_more_flip).template cast()) * + grid_scale_t * + ((is_neg == one_more_flip).template cast() - + (is_neg != one_more_flip).template cast()) * in_bound; grid_slice_t.device(place) = clipped; } diff --git a/paddle/phi/kernels/cpu/gumbel_softmax_grad_kernel.cc b/paddle/phi/kernels/cpu/gumbel_softmax_grad_kernel.cc index a4c131e72b5..832df98e0f3 100644 --- a/paddle/phi/kernels/cpu/gumbel_softmax_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/gumbel_softmax_grad_kernel.cc @@ -13,9 +13,9 @@ // limitations under the License. #include "paddle/phi/kernels/gumbel_softmax_grad_kernel.h" -#include "paddle/phi/kernels/impl/gumbel_softmax_grad_kernel_impl.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/gumbel_softmax_grad_kernel_impl.h" PD_REGISTER_KERNEL(gumbel_softmax_grad, CPU, diff --git a/paddle/phi/kernels/cpu/gumbel_softmax_kernel.cc b/paddle/phi/kernels/cpu/gumbel_softmax_kernel.cc index eb406665c5f..7638ca3aa7e 100644 --- a/paddle/phi/kernels/cpu/gumbel_softmax_kernel.cc +++ b/paddle/phi/kernels/cpu/gumbel_softmax_kernel.cc @@ -13,11 +13,11 @@ // limitations under the License. #include "paddle/phi/kernels/gumbel_softmax_kernel.h" -#include "paddle/phi/kernels/impl/gumbel_softmax_kernel_impl.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/axis_utils.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/impl/gumbel_softmax_kernel_impl.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/histogram_kernel.cc b/paddle/phi/kernels/cpu/histogram_kernel.cc index 82b88f868d8..d9c41508efd 100644 --- a/paddle/phi/kernels/cpu/histogram_kernel.cc +++ b/paddle/phi/kernels/cpu/histogram_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/phi/kernels/histogram_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/phi/kernels/cpu/huber_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/huber_loss_grad_kernel.cc index 654f2c9400a..b52a587070a 100644 --- a/paddle/phi/kernels/cpu/huber_loss_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/huber_loss_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/huber_loss_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/huber_loss_grad_kernel_impl.h" diff --git a/paddle/phi/kernels/cpu/huber_loss_kernel.cc b/paddle/phi/kernels/cpu/huber_loss_kernel.cc index 702c0589057..2c4d8941ab8 100644 --- a/paddle/phi/kernels/cpu/huber_loss_kernel.cc +++ b/paddle/phi/kernels/cpu/huber_loss_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/huber_loss_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/huber_loss_kernel_impl.h" diff --git a/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc b/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc index d060e8c9b28..fe8ca4e432e 100644 --- a/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/index_sample_grad_kernel.h" + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/phi/backends/cpu/cpu_context.h" diff --git a/paddle/phi/kernels/cpu/index_sample_kernel.cc b/paddle/phi/kernels/cpu/index_sample_kernel.cc index b895e4aa7c0..faa6953704e 100644 --- a/paddle/phi/kernels/cpu/index_sample_kernel.cc +++ b/paddle/phi/kernels/cpu/index_sample_kernel.cc @@ -13,12 +13,14 @@ // limitations under the License. 
#include "paddle/phi/kernels/index_sample_kernel.h" + #include #include #include #include #include #include + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/phi/backends/cpu/cpu_context.h" diff --git a/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc index 867d43fd833..45ef0034109 100644 --- a/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" @@ -142,12 +143,11 @@ void InstanceNormGradKernel(const Context& dev_ctx, dx_arr.device(*place) = scale_arr.broadcast(bcast_param) * inv_var_arr.broadcast(bcast) * (dy_arr - dy_mean - - tmp * - (dy_arr * tmp) - .mean(mean_rdims) - .reshape(NxC_shape) - .eval() - .broadcast(bcast)); + tmp * (dy_arr * tmp) + .mean(mean_rdims) + .reshape(NxC_shape) + .eval() + .broadcast(bcast)); } template diff --git a/paddle/phi/kernels/cpu/instance_norm_kernel.cc b/paddle/phi/kernels/cpu/instance_norm_kernel.cc index 5eac473effa..4deced5499e 100644 --- a/paddle/phi/kernels/cpu/instance_norm_kernel.cc +++ b/paddle/phi/kernels/cpu/instance_norm_kernel.cc @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc b/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc index d4e13aa3b24..edd41b2c7a3 100644 --- a/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/interpolate_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/isclose_kernel.cc b/paddle/phi/kernels/cpu/isclose_kernel.cc index 633c6ba093e..dca21494b3e 100644 --- a/paddle/phi/kernels/cpu/isclose_kernel.cc +++ b/paddle/phi/kernels/cpu/isclose_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/isclose_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/isclose_kernel_impl.h" diff --git a/paddle/phi/kernels/cpu/kldiv_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/kldiv_loss_grad_kernel.cc index f9399d38d71..9f6e2573e33 100644 --- a/paddle/phi/kernels/cpu/kldiv_loss_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/kldiv_loss_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/kldiv_loss_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/kldiv_loss_grad_kernel_impl.h" diff --git a/paddle/phi/kernels/cpu/kldiv_loss_kernel.cc b/paddle/phi/kernels/cpu/kldiv_loss_kernel.cc index c462b8ec32c..ecb1915cf42 100644 --- a/paddle/phi/kernels/cpu/kldiv_loss_kernel.cc +++ b/paddle/phi/kernels/cpu/kldiv_loss_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/phi/kernels/kldiv_loss_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/kldiv_loss_kernel_impl.h" diff --git a/paddle/phi/kernels/cpu/label_smooth_grad_kernel.cc b/paddle/phi/kernels/cpu/label_smooth_grad_kernel.cc index 74664fb270b..1a900b4bc2a 100644 --- a/paddle/phi/kernels/cpu/label_smooth_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/label_smooth_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/label_smooth_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/eigen/common.h" diff --git a/paddle/phi/kernels/cpu/label_smooth_kernel.cc b/paddle/phi/kernels/cpu/label_smooth_kernel.cc index af9548e8186..cdeed73310d 100644 --- a/paddle/phi/kernels/cpu/label_smooth_kernel.cc +++ b/paddle/phi/kernels/cpu/label_smooth_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/label_smooth_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/eigen/common.h" diff --git a/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc index a30f54fd4b6..081a32b4f24 100644 --- a/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/layer_norm_grad_kernel.h" + #include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/kernels/funcs/layer_norm_util.h" #if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \ diff --git a/paddle/phi/kernels/cpu/layer_norm_kernel.cc b/paddle/phi/kernels/cpu/layer_norm_kernel.cc index 52722468e16..dbc3da0ca15 100644 --- a/paddle/phi/kernels/cpu/layer_norm_kernel.cc +++ b/paddle/phi/kernels/cpu/layer_norm_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/layer_norm_kernel.h" + #include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/kernels/funcs/layer_norm_util.h" #if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \ diff --git a/paddle/phi/kernels/cpu/lerp_grad_kernel.cc b/paddle/phi/kernels/cpu/lerp_grad_kernel.cc index d74919011ec..ae98cb9d03a 100644 --- a/paddle/phi/kernels/cpu/lerp_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/lerp_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/lerp_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/lerp_grad_kernel_impl.h" diff --git a/paddle/phi/kernels/cpu/lerp_kernel.cc b/paddle/phi/kernels/cpu/lerp_kernel.cc index 7adfc35bfa3..d02e706d8d6 100644 --- a/paddle/phi/kernels/cpu/lerp_kernel.cc +++ b/paddle/phi/kernels/cpu/lerp_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/phi/kernels/lerp_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/lerp_kernel_impl.h" diff --git a/paddle/phi/kernels/cpu/lgamma_grad_kernel.cc b/paddle/phi/kernels/cpu/lgamma_grad_kernel.cc index 116fa3f8d3f..a87c01214a9 100644 --- a/paddle/phi/kernels/cpu/lgamma_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/lgamma_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/lgamma_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h" diff --git a/paddle/phi/kernels/cpu/lgamma_kernel.cc b/paddle/phi/kernels/cpu/lgamma_kernel.cc index f849322174d..4979ad0b30b 100644 --- a/paddle/phi/kernels/cpu/lgamma_kernel.cc +++ b/paddle/phi/kernels/cpu/lgamma_kernel.cc @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/lgamma_kernel.h" #include + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/for_range.h" diff --git a/paddle/phi/kernels/cpu/log_softmax_grad_kernel.cc b/paddle/phi/kernels/cpu/log_softmax_grad_kernel.cc index 5f344b9cc3f..d3e5e90fd17 100644 --- a/paddle/phi/kernels/cpu/log_softmax_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/log_softmax_grad_kernel.cc @@ -55,10 +55,9 @@ struct LogSoftmaxGradFunctor { Eigen::DSizes one_axis(1, axis_dim); dx.device(*context.eigen_device()) = - dy - - (y.exp()) * (dy.reshape(batch_axis_remain) - .sum(along_class) - .broadcast(one_axis)); + dy - (y.exp()) * (dy.reshape(batch_axis_remain) + .sum(along_class) + .broadcast(one_axis)); } }; diff --git a/paddle/phi/kernels/cpu/log_softmax_kernel.cc b/paddle/phi/kernels/cpu/log_softmax_kernel.cc index 241742378cc..510eb7a6ca9 100644 --- a/paddle/phi/kernels/cpu/log_softmax_kernel.cc +++ b/paddle/phi/kernels/cpu/log_softmax_kernel.cc @@ -72,34 +72,31 @@ struct LogSoftmaxFunctor { // axis == -1, axis and class in same dimension, calculate along // class dimension directly for higher performance log_softmax.device(*context.eigen_device()) = - (logits - - logits.maximum(along_axis) - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class)) + (logits - logits.maximum(along_axis) + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)) .unaryExpr(ValueClip()); } else { // axis != -1, class dimension split into (axis, remain), max and sum // should be calculated along axis dimension log_softmax.device(*context.eigen_device()) = - (logits.reshape(batch_axis_remain) - - logits.reshape(batch_axis_remain) - .maximum(along_axis) - .eval() - .reshape(batch_one_remain) - .broadcast(one_axis_one) - .reshape(batch_classes)) + (logits.reshape(batch_axis_remain) - logits.reshape(batch_axis_remain) + .maximum(along_axis) + .eval() + .reshape(batch_one_remain) + .broadcast(one_axis_one) + .reshape(batch_classes)) .unaryExpr(ValueClip()); } log_softmax.device(*context.eigen_device()) = - log_softmax - - log_softmax.exp() - .eval() - .reshape(batch_axis_remain) - .sum(along_axis) - .log() - .broadcast(one_axis); + log_softmax - log_softmax.exp() + .eval() + .reshape(batch_axis_remain) + .sum(along_axis) + .log() + .broadcast(one_axis); } }; diff --git a/paddle/phi/kernels/cpu/logsumexp_kernel.cc b/paddle/phi/kernels/cpu/logsumexp_kernel.cc index 06e0b30a9ca..f1fecdfbe9e 100644 --- a/paddle/phi/kernels/cpu/logsumexp_kernel.cc +++ b/paddle/phi/kernels/cpu/logsumexp_kernel.cc @@ -16,7 +16,6 @@ 
#include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" - #include "paddle/phi/kernels/impl/logsumexp_kernel_impl.h" PD_REGISTER_KERNEL( diff --git a/paddle/phi/kernels/cpu/matmul_grad_kernel.cc b/paddle/phi/kernels/cpu/matmul_grad_kernel.cc index aba519ff048..e3cd8fff8a5 100644 --- a/paddle/phi/kernels/cpu/matmul_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/matmul_grad_kernel.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" - #include "paddle/phi/kernels/impl/matmul_grad_kernel_impl.h" PD_REGISTER_KERNEL(matmul_grad, diff --git a/paddle/phi/kernels/cpu/matmul_kernel.cc b/paddle/phi/kernels/cpu/matmul_kernel.cc index 8aa25c0da07..c75a50130db 100644 --- a/paddle/phi/kernels/cpu/matmul_kernel.cc +++ b/paddle/phi/kernels/cpu/matmul_kernel.cc @@ -15,9 +15,8 @@ limitations under the License. */ #include "paddle/phi/kernels/matmul_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/kernel_registry.h" - #include "paddle/phi/common/complex.h" +#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/matmul_kernel_impl.h" PD_REGISTER_KERNEL(matmul, diff --git a/paddle/phi/kernels/cpu/matrix_power_grad_kernel.cc b/paddle/phi/kernels/cpu/matrix_power_grad_kernel.cc index ae3b4d2b455..0f60f8da71a 100644 --- a/paddle/phi/kernels/cpu/matrix_power_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/matrix_power_grad_kernel.cc @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/kernels/matrix_power_grad_kernel.h" -#include "paddle/phi/kernels/impl/matrix_power_grad_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/matrix_power_grad_kernel_impl.h" PD_REGISTER_KERNEL(matrix_power_grad, CPU, diff --git a/paddle/phi/kernels/cpu/matrix_power_kernel.cc b/paddle/phi/kernels/cpu/matrix_power_kernel.cc index f40e1e616f5..08ee7cbc865 100644 --- a/paddle/phi/kernels/cpu/matrix_power_kernel.cc +++ b/paddle/phi/kernels/cpu/matrix_power_kernel.cc @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/kernels/matrix_power_kernel.h" -#include "paddle/phi/kernels/impl/matrix_power_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/matrix_power_kernel_impl.h" PD_REGISTER_KERNEL( matrix_power, CPU, ALL_LAYOUT, phi::MatrixPowerKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/matrix_rank_kernel.cc b/paddle/phi/kernels/cpu/matrix_rank_kernel.cc index 5e13abe8aed..f56bd3d6dbe 100644 --- a/paddle/phi/kernels/cpu/matrix_rank_kernel.cc +++ b/paddle/phi/kernels/cpu/matrix_rank_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. 
#include "paddle/phi/kernels/matrix_rank_kernel.h" -#include "paddle/phi/kernels/matrix_rank_tol_kernel.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/matrix_rank_tol_kernel.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc index 3bfc07319e9..af9b7728389 100644 --- a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc +++ b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc @@ -16,6 +16,7 @@ #include #include + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/elementwise_multiply_kernel.h" #include "paddle/phi/kernels/full_kernel.h" diff --git a/paddle/phi/kernels/cpu/maxout_grad_kernel.cc b/paddle/phi/kernels/cpu/maxout_grad_kernel.cc index 429344a362b..dad4e96b5a8 100644 --- a/paddle/phi/kernels/cpu/maxout_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/maxout_grad_kernel.cc @@ -12,9 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/impl/maxout_grad_kernel_impl.h" - #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/maxout_grad_kernel_impl.h" PD_REGISTER_KERNEL( maxout_grad, CPU, ALL_LAYOUT, phi::MaxOutGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/maxout_kernel.cc b/paddle/phi/kernels/cpu/maxout_kernel.cc index e7cd3ab07ff..cc1d21d310b 100644 --- a/paddle/phi/kernels/cpu/maxout_kernel.cc +++ b/paddle/phi/kernels/cpu/maxout_kernel.cc @@ -12,8 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/impl/maxout_kernel_impl.h" - #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/maxout_kernel_impl.h" PD_REGISTER_KERNEL(maxout, CPU, ALL_LAYOUT, phi::MaxOutKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/meshgrid_grad_kernel.cc b/paddle/phi/kernels/cpu/meshgrid_grad_kernel.cc index 159d1092553..5b43fb02b51 100644 --- a/paddle/phi/kernels/cpu/meshgrid_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/meshgrid_grad_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/meshgrid_grad_kernel.h" -#include "paddle/phi/kernels/impl/meshgrid_grad_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/meshgrid_grad_kernel_impl.h" PD_REGISTER_KERNEL(meshgrid_grad, CPU, diff --git a/paddle/phi/kernels/cpu/meshgrid_kernel.cc b/paddle/phi/kernels/cpu/meshgrid_kernel.cc index c201103b3da..35e43f7bbc8 100644 --- a/paddle/phi/kernels/cpu/meshgrid_kernel.cc +++ b/paddle/phi/kernels/cpu/meshgrid_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/meshgrid_kernel.h" -#include "paddle/phi/kernels/impl/meshgrid_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/meshgrid_kernel_impl.h" PD_REGISTER_KERNEL(meshgrid, CPU, diff --git a/paddle/phi/kernels/cpu/momentum_kernel.cc b/paddle/phi/kernels/cpu/momentum_kernel.cc index 63cc5592ef4..7a4ea9f19e5 100644 --- a/paddle/phi/kernels/cpu/momentum_kernel.cc +++ b/paddle/phi/kernels/cpu/momentum_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/phi/kernels/momentum_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/momentum_kernel_impl.h" diff --git a/paddle/phi/kernels/cpu/multi_dot_grad_kernel.cc b/paddle/phi/kernels/cpu/multi_dot_grad_kernel.cc index 2cd75404be8..f6b07584ce4 100644 --- a/paddle/phi/kernels/cpu/multi_dot_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/multi_dot_grad_kernel.cc @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/kernels/multi_dot_grad_kernel.h" -#include "paddle/phi/kernels/impl/multi_dot_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/multi_dot_kernel_impl.h" PD_REGISTER_KERNEL( multi_dot_grad, CPU, ALL_LAYOUT, phi::MultiDotGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/multi_dot_kernel.cc b/paddle/phi/kernels/cpu/multi_dot_kernel.cc index a4249a98e46..00cf425a038 100644 --- a/paddle/phi/kernels/cpu/multi_dot_kernel.cc +++ b/paddle/phi/kernels/cpu/multi_dot_kernel.cc @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/kernels/multi_dot_kernel.h" -#include "paddle/phi/kernels/impl/multi_dot_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/multi_dot_kernel_impl.h" PD_REGISTER_KERNEL( multi_dot, CPU, ALL_LAYOUT, phi::MultiDotKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/multiplex_grad_kernel.cc b/paddle/phi/kernels/cpu/multiplex_grad_kernel.cc index f5a426e93db..12ba6dadde3 100644 --- a/paddle/phi/kernels/cpu/multiplex_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/multiplex_grad_kernel.cc @@ -15,7 +15,6 @@ #include "paddle/phi/kernels/multiplex_grad_kernel.h" #include "paddle/fluid/memory/memcpy.h" - #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/eigen/common.h" diff --git a/paddle/phi/kernels/cpu/mv_kernel.cc b/paddle/phi/kernels/cpu/mv_kernel.cc index 7f76ddda6dd..408eda34e1c 100644 --- a/paddle/phi/kernels/cpu/mv_kernel.cc +++ b/paddle/phi/kernels/cpu/mv_kernel.cc @@ -16,7 +16,6 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" - #include "paddle/phi/kernels/impl/mv_kernel_impl.h" PD_REGISTER_KERNEL(mv, CPU, ALL_LAYOUT, phi::MvKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc index dd2b09ee39a..9048e87d049 100644 --- a/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc @@ -16,6 +16,7 @@ #include #include + #include "paddle/fluid/operators/math.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/nll_loss_kernel.cc b/paddle/phi/kernels/cpu/nll_loss_kernel.cc index 92cb6a1ad17..c966e91a9a6 100644 --- a/paddle/phi/kernels/cpu/nll_loss_kernel.cc +++ b/paddle/phi/kernels/cpu/nll_loss_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/phi/kernels/nll_loss_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/norm_grad_kernel.cc b/paddle/phi/kernels/cpu/norm_grad_kernel.cc index bd05e2c4c6e..92ca51b499c 100644 --- a/paddle/phi/kernels/cpu/norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/norm_grad_kernel.cc @@ -13,15 +13,13 @@ // limitations under the License. #include "paddle/phi/kernels/norm_grad_kernel.h" -#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -#include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" - #include "paddle/phi/kernels/funcs/common_shape.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/math_function.h" namespace phi { template diff --git a/paddle/phi/kernels/cpu/norm_kernel.cc b/paddle/phi/kernels/cpu/norm_kernel.cc index 50906d9c3bb..f69d03b66b1 100644 --- a/paddle/phi/kernels/cpu/norm_kernel.cc +++ b/paddle/phi/kernels/cpu/norm_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/norm_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/common_shape.h" diff --git a/paddle/phi/kernels/cpu/one_hot_kernel.cc b/paddle/phi/kernels/cpu/one_hot_kernel.cc index fc7979e41d9..f408c9f0361 100644 --- a/paddle/phi/kernels/cpu/one_hot_kernel.cc +++ b/paddle/phi/kernels/cpu/one_hot_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/one_hot_kernel.h" + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/phi/kernels/cpu/p_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/p_norm_grad_kernel.cc index 44ab0504086..32905ab0878 100644 --- a/paddle/phi/kernels/cpu/p_norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/p_norm_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/p_norm_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/eigen/common.h" diff --git a/paddle/phi/kernels/cpu/p_norm_kernel.cc b/paddle/phi/kernels/cpu/p_norm_kernel.cc index 9da7fdbb297..597939953b2 100644 --- a/paddle/phi/kernels/cpu/p_norm_kernel.cc +++ b/paddle/phi/kernels/cpu/p_norm_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/p_norm_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/common_shape.h" diff --git a/paddle/phi/kernels/cpu/pixel_shuffle_grad_kernel.cc b/paddle/phi/kernels/cpu/pixel_shuffle_grad_kernel.cc index b32065d4f0a..0e2bfd04b62 100644 --- a/paddle/phi/kernels/cpu/pixel_shuffle_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/pixel_shuffle_grad_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. 
#include "paddle/phi/kernels/pixel_shuffle_grad_kernel.h" -#include "paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h" PD_REGISTER_KERNEL(pixel_shuffle_grad, CPU, diff --git a/paddle/phi/kernels/cpu/pixel_shuffle_kernel.cc b/paddle/phi/kernels/cpu/pixel_shuffle_kernel.cc index 80f8fa7b50e..44dcb8b59f7 100644 --- a/paddle/phi/kernels/cpu/pixel_shuffle_kernel.cc +++ b/paddle/phi/kernels/cpu/pixel_shuffle_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/pixel_shuffle_kernel.h" -#include "paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h" PD_REGISTER_KERNEL( pixel_shuffle, CPU, ALL_LAYOUT, phi::PixelShuffleKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/pixel_unshuffle_grad_kernel.cc b/paddle/phi/kernels/cpu/pixel_unshuffle_grad_kernel.cc index ef61fca3595..cbcbf1e129d 100644 --- a/paddle/phi/kernels/cpu/pixel_unshuffle_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/pixel_unshuffle_grad_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/pixel_unshuffle_grad_kernel.h" -#include "paddle/phi/kernels/impl/pixel_unshuffle_grad_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/pixel_unshuffle_grad_kernel_impl.h" PD_REGISTER_KERNEL(pixel_unshuffle_grad, CPU, diff --git a/paddle/phi/kernels/cpu/pixel_unshuffle_kernel.cc b/paddle/phi/kernels/cpu/pixel_unshuffle_kernel.cc index 9f4bc747f32..837378972c6 100644 --- a/paddle/phi/kernels/cpu/pixel_unshuffle_kernel.cc +++ b/paddle/phi/kernels/cpu/pixel_unshuffle_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/pixel_unshuffle_kernel.h" -#include "paddle/phi/kernels/impl/pixel_unshuffle_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/pixel_unshuffle_kernel_impl.h" PD_REGISTER_KERNEL(pixel_unshuffle, CPU, diff --git a/paddle/phi/kernels/cpu/poisson_kernel.cc b/paddle/phi/kernels/cpu/poisson_kernel.cc index 6a3e32c2f07..8ba1afe229e 100644 --- a/paddle/phi/kernels/cpu/poisson_kernel.cc +++ b/paddle/phi/kernels/cpu/poisson_kernel.cc @@ -12,11 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "paddle/phi/kernels/poisson_kernel.h" + #include #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/poisson_kernel.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/pool_grad_kernel.cc b/paddle/phi/kernels/cpu/pool_grad_kernel.cc index bb97694d8fc..68cd57c5227 100644 --- a/paddle/phi/kernels/cpu/pool_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/pool_grad_kernel.cc @@ -14,9 +14,8 @@ #include "paddle/phi/kernels/pool_grad_kernel.h" -#include "paddle/phi/kernels/impl/pool_grad_kernel_impl.h" - #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/pool_grad_kernel_impl.h" PD_REGISTER_KERNEL( pool2d_grad, CPU, ALL_LAYOUT, phi::Pool2dGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/pool_kernel.cc b/paddle/phi/kernels/cpu/pool_kernel.cc index 1d57e282c3c..3d3880692c0 100644 --- a/paddle/phi/kernels/cpu/pool_kernel.cc +++ b/paddle/phi/kernels/cpu/pool_kernel.cc @@ -14,9 +14,8 @@ #include "paddle/phi/kernels/pool_kernel.h" -#include "paddle/phi/kernels/impl/pool_kernel_impl.h" - #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/pool_kernel_impl.h" PD_REGISTER_KERNEL(pool2d, CPU, ALL_LAYOUT, phi::Pool2dKernel, float, double) {} PD_REGISTER_KERNEL(max_pool2d_with_index, diff --git a/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc index b68c3ad545d..202baddd713 100644 --- a/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/psroi_pool_grad_kernel.h" #include + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/phi/kernels/cpu/psroi_pool_kernel.cc b/paddle/phi/kernels/cpu/psroi_pool_kernel.cc index 4f7925ad00f..82eff70b756 100644 --- a/paddle/phi/kernels/cpu/psroi_pool_kernel.cc +++ b/paddle/phi/kernels/cpu/psroi_pool_kernel.cc @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/psroi_pool_kernel.h" #include + #include "paddle/phi/core/kernel_registry.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/qr_kernel.cc b/paddle/phi/kernels/cpu/qr_kernel.cc index b0e82cedb6b..6a5551d9557 100644 --- a/paddle/phi/kernels/cpu/qr_kernel.cc +++ b/paddle/phi/kernels/cpu/qr_kernel.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include - #include "paddle/phi/kernels/qr_kernel.h" +#include + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/complex_functors.h" diff --git a/paddle/phi/kernels/cpu/reduce.h b/paddle/phi/kernels/cpu/reduce.h index 35395dccca1..dad288cff2c 100644 --- a/paddle/phi/kernels/cpu/reduce.h +++ b/paddle/phi/kernels/cpu/reduce.h @@ -17,10 +17,9 @@ #include #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/cast_kernel.h" - -#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" // See Note [ Why still include the fluid headers? 
] diff --git a/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc b/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc index 66ae5e02ffc..abc18b1c578 100644 --- a/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc @@ -111,4 +111,3 @@ PD_REGISTER_KERNEL(sum_grad, int64_t, phi::dtype::complex<float>, phi::dtype::complex<double>) {} - diff --git a/paddle/phi/kernels/cpu/rmsprop_kernel.cc b/paddle/phi/kernels/cpu/rmsprop_kernel.cc index fa1e1a2eed3..1d60823d759 100644 --- a/paddle/phi/kernels/cpu/rmsprop_kernel.cc +++ b/paddle/phi/kernels/cpu/rmsprop_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/rmsprop_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/rmsprop_kernel_impl.h" diff --git a/paddle/phi/kernels/cpu/rnn_functor.h b/paddle/phi/kernels/cpu/rnn_functor.h index ab6f98ffcd5..911814647d6 100644 --- a/paddle/phi/kernels/cpu/rnn_functor.h +++ b/paddle/phi/kernels/cpu/rnn_functor.h @@ -14,6 +14,8 @@ #pragma once +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/operators/utils.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" @@ -21,9 +23,6 @@ #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/operators/utils.h" - namespace phi { #define DEFINE_MODE_DETECTOR(MODE_NAME, MODE_STR) \ @@ -252,9 +251,12 @@ inline std::vector<DenseTensor> Unbind(const DenseTensor& in) { } template <typename CellType, - template <typename, typename> class LayerT, - template <typename, typename> class SingleLayerT, - template <typename, typename> class BidirLayerT, + template <typename, typename> + class LayerT, + template <typename, typename> + class SingleLayerT, + template <typename, typename> + class BidirLayerT, typename T, typename Context> void RnnFunc(const Context& dev_ctx, diff --git a/paddle/phi/kernels/cpu/rnn_grad_kernel.cc b/paddle/phi/kernels/cpu/rnn_grad_kernel.cc index 4dd1894320a..1cd4add7d50 100644 --- a/paddle/phi/kernels/cpu/rnn_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/rnn_grad_kernel.cc @@ -16,7 +16,6 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" - #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/cpu/rnn_functor.h" #include "paddle/phi/kernels/funcs/activation_functor.h" @@ -962,8 +961,10 @@ void dropout_cpu_grad_function_inplace(const CPUContext& dev_ctx, } template <typename GradCellType, - template <typename, typename> class SingleGradLayerT, - template <typename, typename> class BidirGradLayerT, + template <typename, typename> + class SingleGradLayerT, + template <typename, typename> + class BidirGradLayerT, typename T> void RnnGradFunc(const CPUContext& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/cpu/rnn_kernel.cc b/paddle/phi/kernels/cpu/rnn_kernel.cc index 80c521918ed..e2e784b2943 100644 --- a/paddle/phi/kernels/cpu/rnn_kernel.cc +++ b/paddle/phi/kernels/cpu/rnn_kernel.cc @@ -49,7 +49,8 @@ struct Cell { }; template <typename T, - template <typename> class EigenActivationFunctor, + template <typename> + class EigenActivationFunctor, funcs::detail::ActivationType act_type> struct SimpleRNNCell : Cell<T> { void operator()(const CPUContext* dev_ctx, diff --git a/paddle/phi/kernels/cpu/roi_align_kernel.cc b/paddle/phi/kernels/cpu/roi_align_kernel.cc index cd779b72e7a..cf0dc47f47b 100644 --- a/paddle/phi/kernels/cpu/roi_align_kernel.cc +++ b/paddle/phi/kernels/cpu/roi_align_kernel.cc @@ -79,16 +79,12 @@ std::vector<std::vector<T>> GetIndexesAndRatios( for (std::size_t px = 0; px < pooled_width; px++) { for (std::size_t iy = 0; iy < roi_bin_grid_h;
iy++) { // calculate x of sample points - auto y = - roi_ymin + - bin_h * (py + - static_cast<T>(iy + .5f) / static_cast<T>(roi_bin_grid_h)); + auto y = roi_ymin + bin_h * (py + static_cast<T>(iy + .5f) / + static_cast<T>(roi_bin_grid_h)); for (std::size_t ix = 0; ix < roi_bin_grid_w; ix++) { // calculate x of sample points - auto x = roi_xmin + - bin_w * (px + - static_cast<T>(ix + .5f) / - static_cast<T>(roi_bin_grid_w)); + auto x = roi_xmin + bin_w * (px + static_cast<T>(ix + .5f) / + static_cast<T>(roi_bin_grid_w)); // deal with elements out of map if (y < -1.0 || y > height || x < -1.0 || x > width) { diff --git a/paddle/phi/kernels/cpu/scatter_grad_kernel.cc b/paddle/phi/kernels/cpu/scatter_grad_kernel.cc index 62fd58704c4..f09015f24a1 100644 --- a/paddle/phi/kernels/cpu/scatter_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/scatter_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/scatter_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/copy_kernel.h" diff --git a/paddle/phi/kernels/cpu/scatter_kernel.cc b/paddle/phi/kernels/cpu/scatter_kernel.cc index d48ceaf29a0..7032c3bb5a3 100644 --- a/paddle/phi/kernels/cpu/scatter_kernel.cc +++ b/paddle/phi/kernels/cpu/scatter_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/scatter_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/copy_kernel.h" diff --git a/paddle/phi/kernels/cpu/scatter_nd_add_grad_kernel.cc b/paddle/phi/kernels/cpu/scatter_nd_add_grad_kernel.cc index cc143ba8d0e..7c3665c5d2e 100644 --- a/paddle/phi/kernels/cpu/scatter_nd_add_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/scatter_nd_add_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/scatter_nd_add_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/copy_kernel.h" diff --git a/paddle/phi/kernels/cpu/scatter_nd_add_kernel.cc b/paddle/phi/kernels/cpu/scatter_nd_add_kernel.cc index 04ae10f5e8b..31e2f4c7161 100644 --- a/paddle/phi/kernels/cpu/scatter_nd_add_kernel.cc +++ b/paddle/phi/kernels/cpu/scatter_nd_add_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/scatter_nd_add_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/copy_kernel.h" diff --git a/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc index a5c9dc4c55e..744ec7805fa 100644 --- a/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/segment_pool_grad_kernel.h" -#include "paddle/phi/kernels/impl/segment_pool_grad_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/segment_pool_grad_kernel_impl.h" PD_REGISTER_KERNEL(segment_pool_grad, CPU, diff --git a/paddle/phi/kernels/cpu/segment_pool_kernel.cc b/paddle/phi/kernels/cpu/segment_pool_kernel.cc index ad76a7a86bc..541ccd34365 100644 --- a/paddle/phi/kernels/cpu/segment_pool_kernel.cc +++ b/paddle/phi/kernels/cpu/segment_pool_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License.
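Note: the rnn_functor.h, rnn_grad_kernel.cc, and rnn_kernel.cc hunks above illustrate how the newer clang-format breaks template-template parameters: the inner template <...> is placed on its own line and the class/typename name drops to the next line. A compilable sketch with hypothetical names; std::vector only stands in for a two-parameter template argument:

#include <memory>
#include <vector>

template <typename T,
          template <typename, typename>
          class Container>  // 'class Container' now sits on its own line
struct Holder {
  Container<T, std::allocator<T>> items;
};

int main() {
  Holder<int, std::vector> h;  // std::vector matches <typename, typename>
  h.items.push_back(1);
  return h.items.size() == 1 ? 0 : 1;
}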
#include "paddle/phi/kernels/segment_pool_kernel.h" -#include "paddle/phi/kernels/impl/segment_pool_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/segment_pool_kernel_impl.h" PD_REGISTER_KERNEL(segment_pool, CPU, diff --git a/paddle/phi/kernels/cpu/selu_grad_kernel.cc b/paddle/phi/kernels/cpu/selu_grad_kernel.cc index 32101b19132..9f83e39a363 100644 --- a/paddle/phi/kernels/cpu/selu_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/selu_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/selu_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/selu_grad_kernel_impl.h" diff --git a/paddle/phi/kernels/cpu/sgd_kernel.cc b/paddle/phi/kernels/cpu/sgd_kernel.cc index 214fd82bef3..055c44d38e4 100644 --- a/paddle/phi/kernels/cpu/sgd_kernel.cc +++ b/paddle/phi/kernels/cpu/sgd_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/sgd_kernel.h" + #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/sign_kernel.cc b/paddle/phi/kernels/cpu/sign_kernel.cc index 5fe11ffbd6d..9ded252c5c5 100644 --- a/paddle/phi/kernels/cpu/sign_kernel.cc +++ b/paddle/phi/kernels/cpu/sign_kernel.cc @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/kernels/sign_kernel.h" -#include "paddle/phi/kernels/impl/sign_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/sign_kernel_impl.h" // See Note [ Why still include the fluid headers? ] #include "paddle/phi/common/bfloat16.h" diff --git a/paddle/phi/kernels/cpu/size_kernel.cc b/paddle/phi/kernels/cpu/size_kernel.cc index 71ebf9cdc09..ca8373b8488 100644 --- a/paddle/phi/kernels/cpu/size_kernel.cc +++ b/paddle/phi/kernels/cpu/size_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/size_kernel.h" -#include "paddle/phi/kernels/impl/size_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/size_kernel_impl.h" PD_REGISTER_KERNEL(size, CPU, diff --git a/paddle/phi/kernels/cpu/slice_grad_kernel.cc b/paddle/phi/kernels/cpu/slice_grad_kernel.cc index 5c2cb3ea80e..7e3efd21751 100644 --- a/paddle/phi/kernels/cpu/slice_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/slice_grad_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/slice_grad_kernel.h" -#include "paddle/phi/kernels/impl/slice_grad_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/slice_grad_kernel_impl.h" PD_REGISTER_KERNEL(slice_grad, CPU, diff --git a/paddle/phi/kernels/cpu/slice_kernel.cc b/paddle/phi/kernels/cpu/slice_kernel.cc index 736540609dd..0f2fe98a853 100644 --- a/paddle/phi/kernels/cpu/slice_kernel.cc +++ b/paddle/phi/kernels/cpu/slice_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. 
#include "paddle/phi/kernels/slice_kernel.h" -#include "paddle/phi/kernels/impl/slice_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/slice_kernel_impl.h" PD_REGISTER_KERNEL(slice, CPU, diff --git a/paddle/phi/kernels/cpu/sparse_weight_embedding_grad_kernel.cc b/paddle/phi/kernels/cpu/sparse_weight_embedding_grad_kernel.cc index d78477073ad..d296aba6650 100644 --- a/paddle/phi/kernels/cpu/sparse_weight_embedding_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/sparse_weight_embedding_grad_kernel.cc @@ -13,12 +13,12 @@ // limitations under the License. #include "paddle/phi/kernels/sparse_weight_embedding_grad_kernel.h" -#include "paddle/phi/kernels/funcs/embedding_util.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/funcs/embedding_util.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/sparse_weight_embedding_kernel.cc b/paddle/phi/kernels/cpu/sparse_weight_embedding_kernel.cc index c0f95d03888..cfdccb5c8d9 100644 --- a/paddle/phi/kernels/cpu/sparse_weight_embedding_kernel.cc +++ b/paddle/phi/kernels/cpu/sparse_weight_embedding_kernel.cc @@ -12,14 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/embedding_kernel.h" -#include "paddle/phi/kernels/funcs/embedding_util.h" - #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/embedding_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/embedding_util.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/split_kernel.cc b/paddle/phi/kernels/cpu/split_kernel.cc index 56d87292249..288cdd235ae 100644 --- a/paddle/phi/kernels/cpu/split_kernel.cc +++ b/paddle/phi/kernels/cpu/split_kernel.cc @@ -17,7 +17,6 @@ #include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" - #include "paddle/phi/infermeta/unary.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc b/paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc index 400f7e87839..2aff1568197 100644 --- a/paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/temporal_shift_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/temporal_shift_kernel.cc b/paddle/phi/kernels/cpu/temporal_shift_kernel.cc index 6721117992d..29be4871319 100644 --- a/paddle/phi/kernels/cpu/temporal_shift_kernel.cc +++ b/paddle/phi/kernels/cpu/temporal_shift_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/phi/kernels/temporal_shift_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/transpose_grad_kernel.cc b/paddle/phi/kernels/cpu/transpose_grad_kernel.cc index 9dbcf575f33..dee69222e6d 100644 --- a/paddle/phi/kernels/cpu/transpose_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/transpose_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/transpose_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/tril_triu_grad_kernel.cc b/paddle/phi/kernels/cpu/tril_triu_grad_kernel.cc index 14aca258a2c..660254fef86 100644 --- a/paddle/phi/kernels/cpu/tril_triu_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/tril_triu_grad_kernel.cc @@ -12,10 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h" - #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h" PD_REGISTER_KERNEL(tril_triu_grad, CPU, diff --git a/paddle/phi/kernels/cpu/tril_triu_kernel.cc b/paddle/phi/kernels/cpu/tril_triu_kernel.cc index a3d20e55e21..f3599bb92b9 100644 --- a/paddle/phi/kernels/cpu/tril_triu_kernel.cc +++ b/paddle/phi/kernels/cpu/tril_triu_kernel.cc @@ -12,10 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h" - #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h" PD_REGISTER_KERNEL(tril_triu, CPU, diff --git a/paddle/phi/kernels/cpu/trunc_grad_kernel.cc b/paddle/phi/kernels/cpu/trunc_grad_kernel.cc index 4d85dd609e2..24fc3892562 100644 --- a/paddle/phi/kernels/cpu/trunc_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/trunc_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/trunc_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/trunc_kernel.cc b/paddle/phi/kernels/cpu/trunc_kernel.cc index babae6ce7c9..5fe33ec6a4b 100644 --- a/paddle/phi/kernels/cpu/trunc_kernel.cc +++ b/paddle/phi/kernels/cpu/trunc_kernel.cc @@ -12,11 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/phi/kernels/trunc_kernel.h" + #include #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/trunc_kernel.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/unfold_grad_kernel.cc b/paddle/phi/kernels/cpu/unfold_grad_kernel.cc index c97005dd845..6ba4ba49b9a 100644 --- a/paddle/phi/kernels/cpu/unfold_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/unfold_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/phi/kernels/unfold_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/unfold_grad_kernel_impl.h" diff --git a/paddle/phi/kernels/cpu/unfold_kernel.cc b/paddle/phi/kernels/cpu/unfold_kernel.cc index e38d8acd098..f15201542e6 100644 --- a/paddle/phi/kernels/cpu/unfold_kernel.cc +++ b/paddle/phi/kernels/cpu/unfold_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/unfold_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/unfold_kernel_impl.h" diff --git a/paddle/phi/kernels/cpu/uniform_random_kernel.cc b/paddle/phi/kernels/cpu/uniform_random_kernel.cc index c95a8f4ded6..a09812363f1 100644 --- a/paddle/phi/kernels/cpu/uniform_random_kernel.cc +++ b/paddle/phi/kernels/cpu/uniform_random_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/uniform_random_kernel.h" + #include "paddle/phi/core/kernel_registry.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/unique_kernel.cc b/paddle/phi/kernels/cpu/unique_kernel.cc index 853b401315d..834f05f73e2 100644 --- a/paddle/phi/kernels/cpu/unique_kernel.cc +++ b/paddle/phi/kernels/cpu/unique_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/unique_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/utils/data_type.h" diff --git a/paddle/phi/kernels/cpu/unstack_grad_kernel.cc b/paddle/phi/kernels/cpu/unstack_grad_kernel.cc index 9c2dce808dc..c494cbc965e 100644 --- a/paddle/phi/kernels/cpu/unstack_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/unstack_grad_kernel.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/kernels/unstack_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/unstack_grad_kernel_impl.h" diff --git a/paddle/phi/kernels/cpu/unstack_kernel.cc b/paddle/phi/kernels/cpu/unstack_kernel.cc index 3d233e9ec40..4bc8d1b2c93 100644 --- a/paddle/phi/kernels/cpu/unstack_kernel.cc +++ b/paddle/phi/kernels/cpu/unstack_kernel.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/phi/kernels/unstack_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/unstack_kernel_impl.h" diff --git a/paddle/phi/kernels/cpu/viterbi_decode_kernel.cc b/paddle/phi/kernels/cpu/viterbi_decode_kernel.cc index fab49f54160..c98a098aa0e 100644 --- a/paddle/phi/kernels/cpu/viterbi_decode_kernel.cc +++ b/paddle/phi/kernels/cpu/viterbi_decode_kernel.cc @@ -109,7 +109,8 @@ struct Gather { }; template typename CompareFunctor, + template + typename CompareFunctor, typename T> struct GetMask { void operator()(const Context& dev_ctx, @@ -122,7 +123,8 @@ struct GetMask { }; template typename BinaryFunctor, + template + typename BinaryFunctor, typename T> struct BinaryOperation { void operator()(const Context& dev_ctx, diff --git a/paddle/phi/kernels/cpu/warpctc_grad_kernel.cc b/paddle/phi/kernels/cpu/warpctc_grad_kernel.cc index 0b293363354..7d70d825250 100644 --- a/paddle/phi/kernels/cpu/warpctc_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/warpctc_grad_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/warpctc_grad_kernel.h" -#include "paddle/phi/kernels/impl/warpctc_grad_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/warpctc_grad_kernel_impl.h" PD_REGISTER_KERNEL( warpctc_grad, CPU, ALL_LAYOUT, phi::WarpctcGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/warpctc_kernel.cc b/paddle/phi/kernels/cpu/warpctc_kernel.cc index 4b87202c11e..239c6cb0cbe 100644 --- a/paddle/phi/kernels/cpu/warpctc_kernel.cc +++ b/paddle/phi/kernels/cpu/warpctc_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/warpctc_kernel.h" -#include "paddle/phi/kernels/impl/warpctc_kernel_impl.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/warpctc_kernel_impl.h" PD_REGISTER_KERNEL( warpctc, CPU, ALL_LAYOUT, phi::WarpctcKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/yolo_box_kernel.cc b/paddle/phi/kernels/cpu/yolo_box_kernel.cc index a83bc019fc3..6b882ad2895 100644 --- a/paddle/phi/kernels/cpu/yolo_box_kernel.cc +++ b/paddle/phi/kernels/cpu/yolo_box_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/yolo_box_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/yolo_box_util.h" diff --git a/paddle/phi/kernels/cpu/yolov3_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/yolov3_loss_grad_kernel.cc index 383009229f9..655106e9cb4 100644 --- a/paddle/phi/kernels/cpu/yolov3_loss_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/yolov3_loss_grad_kernel.cc @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "paddle/phi/kernels/yolov3_loss_grad_kernel.h" + #include #include -#include "paddle/phi/kernels/yolov3_loss_grad_kernel.h" - #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cpu/yolov3_loss_functor.h" diff --git a/paddle/phi/kernels/cpu/yolov3_loss_kernel.cc b/paddle/phi/kernels/cpu/yolov3_loss_kernel.cc index 8a190ab25a7..75b2e3c5c4a 100644 --- a/paddle/phi/kernels/cpu/yolov3_loss_kernel.cc +++ b/paddle/phi/kernels/cpu/yolov3_loss_kernel.cc @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/phi/kernels/yolov3_loss_kernel.h" + #include #include -#include "paddle/phi/kernels/yolov3_loss_kernel.h" - #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cpu/yolov3_loss_functor.h" diff --git a/paddle/phi/kernels/cumprod_grad_kernel.h b/paddle/phi/kernels/cumprod_grad_kernel.h index b3cb17b28e0..7610cad31e3 100644 --- a/paddle/phi/kernels/cumprod_grad_kernel.h +++ b/paddle/phi/kernels/cumprod_grad_kernel.h @@ -25,4 +25,4 @@ void CumprodGradKernel(const Context& dev_ctx, const DenseTensor& dout, int dim, DenseTensor* dx); -} // phi +} // namespace phi diff --git a/paddle/phi/kernels/cumprod_kernel.h b/paddle/phi/kernels/cumprod_kernel.h index 96d76cb0f43..bb8b1427b30 100644 --- a/paddle/phi/kernels/cumprod_kernel.h +++ b/paddle/phi/kernels/cumprod_kernel.h @@ -23,4 +23,4 @@ void CumprodKernel(const Context& dev_ctx, const DenseTensor& x, int dim, DenseTensor* out); -} // phi +} // namespace phi diff --git a/paddle/phi/kernels/diagonal_kernel.h b/paddle/phi/kernels/diagonal_kernel.h index 7cf7282307a..10afd7dbe92 100644 --- a/paddle/phi/kernels/diagonal_kernel.h +++ b/paddle/phi/kernels/diagonal_kernel.h @@ -25,4 +25,4 @@ void DiagonalKernel(const Context& dev_ctx, int axis1, int axis2, DenseTensor* out); -} // phi +} // namespace phi diff --git a/paddle/phi/kernels/digamma_grad_kernel.h b/paddle/phi/kernels/digamma_grad_kernel.h index ae5346080d3..abd8634518d 100644 --- a/paddle/phi/kernels/digamma_grad_kernel.h +++ b/paddle/phi/kernels/digamma_grad_kernel.h @@ -24,4 +24,4 @@ void DigammaGradKernel(const Context& ctx, const DenseTensor& out_grad, DenseTensor* x_grad); -} // namepsace phi +} // namespace phi diff --git a/paddle/phi/kernels/digamma_kernel.h b/paddle/phi/kernels/digamma_kernel.h index ce25f2e148e..3cf1eae67cc 100644 --- a/paddle/phi/kernels/digamma_kernel.h +++ b/paddle/phi/kernels/digamma_kernel.h @@ -21,4 +21,4 @@ namespace phi { template void DigammaKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out); -} // namepsace phi +} // namespace phi diff --git a/paddle/phi/kernels/empty_kernel.cc b/paddle/phi/kernels/empty_kernel.cc index 06d258a8a4e..d8cf0bd2ef9 100644 --- a/paddle/phi/kernels/empty_kernel.cc +++ b/paddle/phi/kernels/empty_kernel.cc @@ -14,9 +14,8 @@ #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/backends/all_context.h" -#include "paddle/phi/core/kernel_registry.h" - #include "paddle/phi/common/complex.h" +#include "paddle/phi/core/kernel_registry.h" namespace phi { diff --git a/paddle/phi/kernels/expand_kernel.h b/paddle/phi/kernels/expand_kernel.h index 3b44c46e4dd..930240db6cc 100644 --- a/paddle/phi/kernels/expand_kernel.h +++ b/paddle/phi/kernels/expand_kernel.h @@ -26,4 +26,4 @@ void ExpandKernel(const Context& ctx, const IntArray& shape, DenseTensor* out); -} // namepsace phi +} // 
diff --git a/paddle/phi/kernels/flatten_grad_kernel.cc b/paddle/phi/kernels/flatten_grad_kernel.cc
index 83f96c1f9f5..54279fca6e4 100644
--- a/paddle/phi/kernels/flatten_grad_kernel.cc
+++ b/paddle/phi/kernels/flatten_grad_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/flatten_grad_kernel.h"
+
 #include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/copy_kernel.h"
diff --git a/paddle/phi/kernels/flatten_kernel.cc b/paddle/phi/kernels/flatten_kernel.cc
index f304e7706ad..dd000896073 100644
--- a/paddle/phi/kernels/flatten_kernel.cc
+++ b/paddle/phi/kernels/flatten_kernel.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/flatten_kernel.h"
+
 #include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/infermeta/unary.h"
diff --git a/paddle/phi/kernels/frobenius_norm_grad_kernel.h b/paddle/phi/kernels/frobenius_norm_grad_kernel.h
index cfe8192d1a6..65db8dd9e0a 100644
--- a/paddle/phi/kernels/frobenius_norm_grad_kernel.h
+++ b/paddle/phi/kernels/frobenius_norm_grad_kernel.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <...>
+
 #include "paddle/phi/core/dense_tensor.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/frobenius_norm_kernel.h b/paddle/phi/kernels/frobenius_norm_kernel.h
index f5f37ee0c0f..30122cb4160 100644
--- a/paddle/phi/kernels/frobenius_norm_kernel.h
+++ b/paddle/phi/kernels/frobenius_norm_kernel.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <...>
+
 #include "paddle/phi/core/dense_tensor.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/full_kernel.h b/paddle/phi/kernels/full_kernel.h
index d5785f2eeda..228e862a09c 100644
--- a/paddle/phi/kernels/full_kernel.h
+++ b/paddle/phi/kernels/full_kernel.h
@@ -19,7 +19,6 @@
 #include "paddle/phi/common/int_array.h"
 #include "paddle/phi/common/scalar.h"
 #include "paddle/phi/core/dense_tensor.h"
-
 #include "paddle/phi/infermeta/nullary.h"
 #include "paddle/phi/kernels/empty_kernel.h"
diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h
index f80117ccec7..f481821a7bf 100644
--- a/paddle/phi/kernels/funcs/activation_functor.h
+++ b/paddle/phi/kernels/funcs/activation_functor.h
@@ -15,14 +15,14 @@
 #pragma once
 #include <...>
+
 #include <...>
+#include <cmath>
 #include <...>
 #include <...>
 #include <...>
 #include <...>
 #include <...>
-
-#include <cmath>
 #ifndef _USE_MATH_DEFINES
 #define _USE_MATH_DEFINES
 #endif
@@ -986,9 +986,9 @@ struct BReluGradFunctor : public BaseActivationFunctor<T> {
             typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout *
-                   ((x > static_cast<T>(t_min)) * (x < static_cast<T>(t_max)))
-                       .template cast<T>();
+    dx.device(d) =
+        dout * ((x > static_cast<T>(t_min)) * (x < static_cast<T>(t_max)))
+                   .template cast<T>();
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
@@ -1054,11 +1054,10 @@ struct LeakyReluGradGradFunctor : public BaseActivationFunctor<T> {
         GET_DATA_SAFELY(X, "Input", "X", "LeakyReluGradGrad"));
     auto ddout = EigenVector<T>::Flatten(
         GET_DATA_SAFELY(ddOut, "Output", "DOut", "LeakyReluGradGrad"));
-    ddout.device(*d) =
-        ddx *
-        ((x > static_cast<T>(0)).template cast<T>() +
-         static_cast<T>(alpha) * (x <= static_cast<T>(0)).template cast<T>())
-            .template cast<T>();
+    ddout.device(*d) = ddx * ((x > static_cast<T>(0)).template cast<T>() +
+                              static_cast<T>(alpha) *
+                                  (x <= static_cast<T>(0)).template cast<T>())
+                                 .template cast<T>();
   }
 }
 static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
@@ -1290,11 +1289,10 @@ struct ELUGradGradFunctor : public BaseActivationFunctor<T> {
   if (ddOut) {
     auto ddout = EigenVector<T>::Flatten(
         GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ELUGradGrad"));
-    ddout.device(*d) = ddx *
-                       ((x > static_cast<T>(0)).template cast<T>() +
-                        static_cast<T>(alpha) * x.exp() *
-                            (x <= static_cast<T>(0)).template cast<T>())
-                           .template cast<T>();
+    ddout.device(*d) = ddx * ((x > static_cast<T>(0)).template cast<T>() +
+                              static_cast<T>(alpha) * x.exp() *
+                                  (x <= static_cast<T>(0)).template cast<T>())
+                                 .template cast<T>();
   }
 }
 static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
@@ -1980,11 +1978,10 @@ struct CELUGradGradFunctor : public BaseActivationFunctor<T> {
   if (ddOut) {
     auto ddout = EigenVector<T>::Flatten(
         GET_DATA_SAFELY(ddOut, "Output", "DDOut", "CELUGradGrad"));
-    ddout.device(*d) = ddx *
-                       ((x > static_cast<T>(0)).template cast<T>() +
-                        (x / static_cast<T>(alpha)).exp() *
-                            (x <= static_cast<T>(0)).template cast<T>())
-                           .template cast<T>();
+    ddout.device(*d) = ddx * ((x > static_cast<T>(0)).template cast<T>() +
+                              (x / static_cast<T>(alpha)).exp() *
+                                  (x <= static_cast<T>(0)).template cast<T>())
+                                 .template cast<T>();
  }
 }
 static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
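As a reading aid, not part of the patch: each *GradGrad functor above computes the elementwise double-grad ddout = ddx * f'(x), with f'(x) written as a sum of boolean masks. Taking ELUGradGrad, and assuming the usual ELU definition f(x) = x for x > 0 and f(x) = \alpha(e^x - 1) for x <= 0:

    ddout = ddx \cdot \bigl( \mathbf{1}[x > 0] + \alpha e^{x}\,\mathbf{1}[x \le 0] \bigr) = ddx \cdot f'(x)

so the hunks change only parenthesization and indentation; the masks and the computed value are untouched.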
diff --git a/paddle/phi/kernels/funcs/adam_functors.h b/paddle/phi/kernels/funcs/adam_functors.h
index 2f706f0ef1c..b14ee7f072e 100644
--- a/paddle/phi/kernels/funcs/adam_functors.h
+++ b/paddle/phi/kernels/funcs/adam_functors.h
@@ -14,6 +14,7 @@
 #pragma once
 
 #include <math.h>  // for sqrt in CPU and CUDA
+
 #include <...>
 
 #include "paddle/phi/kernels/funcs/algorithm.h"
@@ -169,9 +170,8 @@ class AdamFunctor {
     moment1_out = beta1_ * mom1 + (1 - beta1_) * g;
     moment2_out = beta2_ * mom2 + (1 - beta2_) * g * g;
 
-    param_out = param -
-                lr * (moment1_out /
-                      (moment2_out.sqrt() + epsilon_ * sqrt(1 - beta2_pow)));
+    param_out = param - lr * (moment1_out / (moment2_out.sqrt() +
+                                             epsilon_ * sqrt(1 - beta2_pow)));
   }
 };
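As a reading aid, not part of the patch: the reflowed AdamFunctor update is the standard Adam step. Writing m_t and v_t for moment1_out and moment2_out, and assuming lr already carries Adam's bias-correction factor \sqrt{1-\beta_2^t}/(1-\beta_1^t) (which is why \epsilon is scaled by \sqrt{1-\beta_2^t} in the denominator):

    m_t = \beta_1 m_{t-1} + (1-\beta_1) g_t
    v_t = \beta_2 v_{t-1} + (1-\beta_2) g_t^2
    \theta_t = \theta_{t-1} - \mathrm{lr} \cdot \frac{m_t}{\sqrt{v_t} + \epsilon \sqrt{1-\beta_2^{t}}}

matching param - lr * (moment1_out / (moment2_out.sqrt() + epsilon_ * sqrt(1 - beta2_pow))).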
diff --git a/paddle/phi/kernels/funcs/aligned_vector.h b/paddle/phi/kernels/funcs/aligned_vector.h
index 14a9560b841..70f75d5352a 100644
--- a/paddle/phi/kernels/funcs/aligned_vector.h
+++ b/paddle/phi/kernels/funcs/aligned_vector.h
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
 
 #include <...>
+
 #include "paddle/phi/core/hostdevice.h"
 
 #if defined(__xpu__)
 #define CHAR_BIT 8
@@ -45,11 +46,11 @@ HOSTDEVICE inline void Store(const AlignedVector<T, Size>& vec, T* addr) {
 }
 
 /*
-* Only the address of input data is the multiplier of 1,2,4, vectorized load
-* with corresponding multiplier-value is possible. Moreover, the maximum length
-* of vectorized load is 128 bits once. Hence, valid length of vectorized load
-* shall be determined under both former constraints.
-*/
+ * Only the address of input data is the multiplier of 1,2,4, vectorized load
+ * with corresponding multiplier-value is possible. Moreover, the maximum length
+ * of vectorized load is 128 bits once. Hence, valid length of vectorized load
+ * shall be determined under both former constraints.
+ */
 template <typename T>
 int GetVectorizedSize(const T* pointer) {
   constexpr int max_load_bits = 128;
@@ -60,11 +61,11 @@ int GetVectorizedSize(const T* pointer) {
   constexpr int vec2 = std::alignment_of<AlignedVector<T, 2>>::value;  // NOLINT
   if (address % vec8 == 0) {
     /*
-    * Currently, decide to deal with no more than 4 data once while adopting
-    * vectorization load/store, if performance test shows that dealing with
-    * 8 data once in vectorization load/store does get optimized, return code
-    * below can be changed into " return std::min(8, valid_vec_size); " .
-    */
+     * Currently, decide to deal with no more than 4 data once while adopting
+     * vectorization load/store, if performance test shows that dealing with
+     * 8 data once in vectorization load/store does get optimized, return code
+     * below can be changed into " return std::min(8, valid_vec_size); " .
+     */
     return std::min(4, valid_vec_size);
   } else if (address % vec4 == 0) {
     return std::min(4, valid_vec_size);
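The GetVectorizedSize comments above describe picking a vector width from the pointer's alignment, capped at 128 bits per load. A standalone sketch of that rule (plain C++; VectorizedSizeSketch and k_max_elems are hypothetical names, and this is not the library's code):

#include <algorithm>
#include <cstdint>

// Pick the widest per-thread vector (at most 128 bits, at most k_max_elems
// elements) whose byte alignment divides the address, mirroring the
// 8/4/2/1 cascade in the hunk above.
template <typename T>
int VectorizedSizeSketch(const T* p, int k_max_elems = 8) {
  constexpr int kMaxLoadBits = 128;
  const int valid = std::min<int>(k_max_elems, kMaxLoadBits / (8 * sizeof(T)));
  const auto addr = reinterpret_cast<std::uintptr_t>(p);
  if (addr % (8 * sizeof(T)) == 0) return std::min(4, valid);  // capped at 4,
  if (addr % (4 * sizeof(T)) == 0) return std::min(4, valid);  // per the comment
  if (addr % (2 * sizeof(T)) == 0) return std::min(2, valid);
  return 1;
}

For float, valid is 4, so any suitably aligned address vectorizes 4 elements at a time, which is the std::min(4, valid_vec_size) branch shown in the diff.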
*/ -#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" - #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" +#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cc b/paddle/phi/kernels/funcs/deformable_conv_functor.cc index ea256e93bba..48858fa5939 100644 --- a/paddle/phi/kernels/funcs/deformable_conv_functor.cc +++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cc @@ -60,14 +60,12 @@ inline void ModulatedDeformableIm2colCPUKernel( const T* data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; const T* data_offset_ptr = - data_offset + - (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * - kernel_w * height_col * width_col; + data_offset + (b_col * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; const T* data_mask_ptr = data_mask - ? data_mask + - (b_col * deformable_group + deformable_group_index) * - kernel_h * kernel_w * height_col * width_col + ? data_mask + (b_col * deformable_group + deformable_group_index) * + kernel_h * kernel_w * height_col * width_col : nullptr; for (int i = 0; i < kernel_h; ++i) { diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cu b/paddle/phi/kernels/funcs/deformable_conv_functor.cu index 8bfb46c6636..bebea5dcb74 100644 --- a/paddle/phi/kernels/funcs/deformable_conv_functor.cu +++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cu @@ -12,9 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/funcs/deformable_conv_functor.h" - #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/kernels/funcs/deformable_conv_functor.h" namespace phi { namespace funcs { @@ -70,14 +69,12 @@ __global__ void ModulatedDeformableIm2colGpuKernel( const T* data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; const T* data_offset_ptr = - data_offset + - (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * - kernel_w * height_col * width_col; + data_offset + (b_col * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; const T* data_mask_ptr = data_mask - ? data_mask + - (b_col * deformable_group + deformable_group_index) * - kernel_h * kernel_w * height_col * width_col + ? 
diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cu b/paddle/phi/kernels/funcs/deformable_conv_functor.cu
index 8bfb46c6636..bebea5dcb74 100644
--- a/paddle/phi/kernels/funcs/deformable_conv_functor.cu
+++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cu
@@ -12,9 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/funcs/deformable_conv_functor.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/kernels/funcs/deformable_conv_functor.h"
 
 namespace phi {
 namespace funcs {
@@ -70,14 +69,12 @@ __global__ void ModulatedDeformableIm2colGpuKernel(
     const T* data_im_ptr =
         data_im + (b_col * num_channels + c_im) * height * width;
     const T* data_offset_ptr =
-        data_offset +
-        (b_col * deformable_group + deformable_group_index) * 2 * kernel_h *
-            kernel_w * height_col * width_col;
+        data_offset + (b_col * deformable_group + deformable_group_index) * 2 *
+                          kernel_h * kernel_w * height_col * width_col;
     const T* data_mask_ptr =
         data_mask
-            ? data_mask +
-                  (b_col * deformable_group + deformable_group_index) *
-                      kernel_h * kernel_w * height_col * width_col
+            ? data_mask + (b_col * deformable_group + deformable_group_index) *
+                              kernel_h * kernel_w * height_col * width_col
             : nullptr;
 
     for (int i = 0; i < kernel_h; ++i) {
@@ -129,28 +126,28 @@ void ModulatedDeformableIm2col(const Context& dev_ctx,
   int blocks = NumBlocks(num_kernels);
   int threads = kNumCUDAThreads;
 
-  ModulatedDeformableIm2colGpuKernel<
-      T><<<blocks, threads, 0, dev_ctx.stream()>>>(num_kernels,
-                                                   data_im,
-                                                   data_offset,
-                                                   data_mask,
-                                                   im_shape[1],
-                                                   im_shape[2],
-                                                   filter_shape[2],
-                                                   filter_shape[3],
-                                                   paddings[0],
-                                                   paddings[1],
-                                                   strides[0],
-                                                   strides[1],
-                                                   dilations[0],
-                                                   dilations[1],
-                                                   channel_per_deformable_group,
-                                                   col_shape[1],
-                                                   im_shape[0],
-                                                   deformable_groups,
-                                                   col_shape[2],
-                                                   col_shape[3],
-                                                   data_col);
+  ModulatedDeformableIm2colGpuKernel<T>
+      <<<blocks, threads, 0, dev_ctx.stream()>>>(num_kernels,
+                                                 data_im,
+                                                 data_offset,
+                                                 data_mask,
+                                                 im_shape[1],
+                                                 im_shape[2],
+                                                 filter_shape[2],
+                                                 filter_shape[3],
+                                                 paddings[0],
+                                                 paddings[1],
+                                                 strides[0],
+                                                 strides[1],
+                                                 dilations[0],
+                                                 dilations[1],
+                                                 channel_per_deformable_group,
+                                                 col_shape[1],
+                                                 im_shape[0],
+                                                 deformable_groups,
+                                                 col_shape[2],
+                                                 col_shape[3],
+                                                 data_col);
 }
 
 template void ModulatedDeformableIm2col(
diff --git a/paddle/phi/kernels/funcs/detail/activation_functions.h b/paddle/phi/kernels/funcs/detail/activation_functions.h
index 475557f1642..d41dca33f75 100644
--- a/paddle/phi/kernels/funcs/detail/activation_functions.h
+++ b/paddle/phi/kernels/funcs/detail/activation_functions.h
@@ -14,8 +14,10 @@ limitations under the License. */
 
 #pragma once
 #include <...>
+
 #include <...>
 #include <...>
+
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/phi/core/hostdevice.h"
diff --git a/paddle/phi/kernels/funcs/detail/avx_mathfun.h b/paddle/phi/kernels/funcs/detail/avx_mathfun.h
index e5e7388d51d..75e4922648c 100644
--- a/paddle/phi/kernels/funcs/detail/avx_mathfun.h
+++ b/paddle/phi/kernels/funcs/detail/avx_mathfun.h
@@ -356,11 +356,11 @@ v8sf sin256_ps(v8sf x) {  // any x
   /* scale by 4/Pi */
   y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
 
-/*
-  Here we start a series of integer operations, which are in the
-  realm of AVX2.
-  If we don't have AVX, let's perform them using SSE2 directives
-*/
+  /*
+    Here we start a series of integer operations, which are in the
+    realm of AVX2.
+    If we don't have AVX, let's perform them using SSE2 directives
+  */
 
 #ifdef __AVX2__
   /* store the integer part of y in mm0 */
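The avx_mathfun comment re-indented above refers to emulating 256-bit integer operations on pre-AVX2 hardware by splitting them into two 128-bit SSE2 halves. An illustrative sketch of that fallback pattern (not the library's code; avx_mathfun.h wraps the same idea in macros):

#include <immintrin.h>

// 256-bit integer add: native on AVX2, emulated with two SSE2 adds otherwise.
static inline __m256i add_epi32_compat(__m256i a, __m256i b) {
#ifdef __AVX2__
  return _mm256_add_epi32(a, b);
#else
  __m128i lo = _mm_add_epi32(_mm256_castsi256_si128(a),
                             _mm256_castsi256_si128(b));
  __m128i hi = _mm_add_epi32(_mm256_extractf128_si256(a, 1),
                             _mm256_extractf128_si256(b, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
#endif
}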
diff --git a/paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h b/paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h
index 0016bfb64c9..0fdf490c553 100644
--- a/paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h
+++ b/paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
 
 #include <...>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/phi/kernels/funcs/activation_functor.h"
 #include "paddle/phi/kernels/funcs/detail/activation_functions.h"
diff --git a/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h b/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h
index 6657417beac..93232d8f7f4 100644
--- a/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h
+++ b/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
 
 #include <...>
+
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/phi/kernels/funcs/detail/activation_functions.h"
diff --git a/paddle/phi/kernels/funcs/detail/gru_kernel.h b/paddle/phi/kernels/funcs/detail/gru_kernel.h
index db53fc4576d..9e2aef19406 100644
--- a/paddle/phi/kernels/funcs/detail/gru_kernel.h
+++ b/paddle/phi/kernels/funcs/detail/gru_kernel.h
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
 
 #include <...>
+
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/kernels/funcs/detail/activation_functions.h"
diff --git a/paddle/phi/kernels/funcs/detail/lstm_cpu_kernel.h b/paddle/phi/kernels/funcs/detail/lstm_cpu_kernel.h
index ed8e749f7fd..02fddc57b31 100644
--- a/paddle/phi/kernels/funcs/detail/lstm_cpu_kernel.h
+++ b/paddle/phi/kernels/funcs/detail/lstm_cpu_kernel.h
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
 
 #include <...>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/phi/kernels/funcs/activation_functor.h"
 #include "paddle/phi/kernels/funcs/detail/activation_functions.h"
diff --git a/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h b/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h
index 6d4c430d9e6..5d06dddd964 100644
--- a/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h
+++ b/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h
@@ -249,27 +249,27 @@ void gpu_lstm_forward(const paddle::platform::DeviceContext& context,
 
   if (batch_size == 1) {
     KeLstmForward<T,
                   Op,
-                  /* is_batch= */ false><<<grid, threads, 0, stream>>>(
-        op,
-        value,
-        frame_size,
-        batch_size,
-        cell_clip,
-        active_node,
-        active_gate,
-        active_state);
+                  /* is_batch= */ false>
+        <<<grid, threads, 0, stream>>>(op,
+                                       value,
+                                       frame_size,
+                                       batch_size,
+                                       cell_clip,
+                                       active_node,
+                                       active_gate,
+                                       active_state);
   } else {
     KeLstmForward<T,
                   Op,
-                  /* is_batch= */ true><<<grid, threads, 0, stream>>>(
-        op,
-        value,
-        frame_size,
-        batch_size,
-        cell_clip,
-        active_node,
-        active_gate,
-        active_state);
+                  /* is_batch= */ true>
+        <<<grid, threads, 0, stream>>>(op,
+                                       value,
+                                       frame_size,
+                                       batch_size,
+                                       cell_clip,
+                                       active_node,
+                                       active_gate,
+                                       active_state);
   }
 }
 
@@ -303,29 +303,29 @@ void gpu_lstm_backward(const paddle::platform::DeviceContext& context,
 
   if (batch_size == 1) {
     KeLstmBackward<T,
                    Op,
-                   /* is_batch= */ false><<<grid, threads, 0, stream>>>(
-        op,
-        value,
-        grad,
-        frame_size,
-        batch_size,
-        cell_clip,
-        active_node,
-        active_gate,
-        active_state);
+                   /* is_batch= */ false>
+        <<<grid, threads, 0, stream>>>(op,
+                                       value,
+                                       grad,
+                                       frame_size,
+                                       batch_size,
+                                       cell_clip,
+                                       active_node,
+                                       active_gate,
+                                       active_state);
   } else {
     KeLstmBackward<T,
                    Op,
-                   /* is_batch= */ true><<<grid, threads, 0, stream>>>(
-        op,
-        value,
-        grad,
-        frame_size,
-        batch_size,
-        cell_clip,
-        active_node,
-        active_gate,
-        active_state);
+                   /* is_batch= */ true>
+        <<<grid, threads, 0, stream>>>(op,
+                                       value,
+                                       grad,
+                                       frame_size,
+                                       batch_size,
+                                       cell_clip,
+                                       active_node,
+                                       active_gate,
+                                       active_state);
   }
 }
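The lstm_gpu_kernel.h hunks above are the clearest instance of the new CUDA launch formatting: the template argument list stays glued to the kernel name, and the <<<...>>> launch configuration breaks onto the following line. A minimal sketch of old versus new (hypothetical kernel, not code from this patch):

template <typename T, bool IsBatch>
__global__ void AxpyKernel(int n, T a, const T* x, T* y) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[i] = a * x[i] + (IsBatch ? y[i] : T(0));
}

template <typename T>
void LaunchAxpy(int n, T a, const T* x, T* y, cudaStream_t stream) {
  dim3 threads(256);
  dim3 grid((n + threads.x - 1) / threads.x);
  // Old style:  AxpyKernel<T, /* IsBatch= */ false><<<grid, threads, 0, stream>>>(
  //                 n, a, x, y);
  // New style (this patch): break before the launch configuration.
  AxpyKernel<T, /* IsBatch= */ false>
      <<<grid, threads, 0, stream>>>(n, a, x, y);
}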
diff --git a/paddle/phi/kernels/funcs/detail/lstm_kernel.h b/paddle/phi/kernels/funcs/detail/lstm_kernel.h
index 8b429264125..0846f05a0c2 100644
--- a/paddle/phi/kernels/funcs/detail/lstm_kernel.h
+++ b/paddle/phi/kernels/funcs/detail/lstm_kernel.h
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
 
 #include <...>
+
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/kernels/funcs/detail/activation_functions.h"
diff --git a/paddle/phi/kernels/funcs/diagonal.h b/paddle/phi/kernels/funcs/diagonal.h
index 19a93970d09..81525cb2544 100644
--- a/paddle/phi/kernels/funcs/diagonal.h
+++ b/paddle/phi/kernels/funcs/diagonal.h
@@ -17,6 +17,7 @@
 #if defined(__NVCC__) || defined(__HIPCC__)
 #include <...>
 #include <...>
+
 #include "paddle/phi/kernels/primitive/kernel_primitives.h"
 #endif
diff --git a/paddle/phi/kernels/funcs/distribution_helper.h b/paddle/phi/kernels/funcs/distribution_helper.h
index 68e986c334e..0e6b3a3f9d7 100644
--- a/paddle/phi/kernels/funcs/distribution_helper.h
+++ b/paddle/phi/kernels/funcs/distribution_helper.h
@@ -319,10 +319,9 @@ void distribution_and_transform(const GPUContext &ctx,
   uint64_t seed = seed_offset.first;
   uint64_t offset = seed_offset.second;
 
-  DistributionKernel<...><<<...>>>(
-      size, seed, offset, dist, trans, out_data, total_thread);
+  DistributionKernel<...>
+      <<<...>>>(
+          size, seed, offset, dist, trans, out_data, total_thread);
 }
 
 #endif
diff --git a/paddle/phi/kernels/funcs/eigen/extensions.h b/paddle/phi/kernels/funcs/eigen/extensions.h
index fbb9d8e3d2e..c724564417b 100644
--- a/paddle/phi/kernels/funcs/eigen/extensions.h
+++ b/paddle/phi/kernels/funcs/eigen/extensions.h
@@ -20,7 +20,6 @@
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/hostdevice.h"
-
 #include "unsupported/Eigen/CXX11/Tensor"
 
 namespace Eigen {
diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h
index 1093bdfa726..71dfbc206a1 100644
--- a/paddle/phi/kernels/funcs/elementwise_base.h
+++ b/paddle/phi/kernels/funcs/elementwise_base.h
@@ -494,7 +494,7 @@ template