Unverified commit 7985407b, authored by W Wilber, committed by GitHub

revert 40531 (#43807)

* revert 40531

* update
Parent 69717717
......@@ -157,7 +157,6 @@ pass_library(layer_norm_fuse_pass inference)
pass_library(add_support_int8_pass inference)
pass_library(matmul_scale_fuse_pass inference)
pass_library(gpu_cpu_map_matmul_to_mul_pass inference)
pass_library(mixed_precision_configure_pass inference)
pass_library(dense_fc_to_sparse_pass inference)
pass_library(dense_multihead_matmul_to_sparse_pass inference)
pass_library(generate_pass DEPS pass_desc_proto)
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/mixed_precision_configure_pass.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle {
namespace framework {
namespace ir {
void MixedPrecisionConfigurePass::InsertCastOps(
Graph* graph, const StringSet& blacklist) const {
VLOG(3) << "Insert the cast op before and after the kernel that does not "
"supports fp16 precision";
auto update_cast_desc = [&](framework::OpDesc& desc,
const std::string& x_name,
const std::string& out_name, const int in_dtype,
const int out_dtype) {
desc.SetType("cast");
desc.SetInput("X", {x_name});
desc.SetOutput("Out", {out_name});
desc.SetAttr("in_dtype", in_dtype);
desc.SetAttr("out_dtype", out_dtype);
desc.SetAttr("use_mkldnn", false);
desc.SetAttr("with_quant_attr", false);
desc.Flush();
};
auto cast_input = [&](Graph* graph, Node* op_node,
const StringSet& cast_list) {
auto inlinks = op_node->inputs;
for (auto* pre_node : inlinks) {
if (pre_node->IsVar()) {
const auto is_persistable = pre_node->Var()->Persistable();
const auto is_float =
pre_node->Var()->GetDataType() == proto::VarType::FP16 ||
pre_node->Var()->GetDataType() == proto::VarType::FP32 ||
pre_node->Var()->GetDataType() == proto::VarType::FP64;
if (!is_persistable && is_float) {
int suffix = 0;
for (auto* pre_node_input : pre_node->inputs) {
if (!pre_node_input->IsOp()) continue;
const auto& type = pre_node_input->Op()->Type();
if (!cast_list.count(type) && type != "cast") {
std::string old_name = pre_node->Name();
std::string new_name =
old_name + "_cast.tmp_" + std::to_string(suffix);
suffix++;
framework::OpDesc new_op_desc(op_node->Op()->Block());
// 4 for fp16, 5 for fp32
update_cast_desc(new_op_desc, old_name, new_name, 4, 5);
auto* new_op = graph->CreateOpNode(&new_op_desc);
VarDesc out_var(new_name);
out_var.SetPersistable(false);
auto* node_var = graph->CreateVarNode(&out_var);
op_node->Op()->RenameInput(old_name, new_name);
IR_NODE_LINK_TO(pre_node, new_op);
IR_NODE_LINK_TO(new_op, node_var);
IR_NODE_LINK_TO(node_var, op_node);
}
}
}
}
}
};
auto cast_output = [&](Graph* graph, Node* op_node,
const StringSet& cast_list) {
auto outlinks = op_node->outputs;
for (auto* next_node : outlinks) {
if (next_node->IsVar()) {
const auto is_persistable = next_node->Var()->Persistable();
const auto is_float =
next_node->Var()->GetDataType() == proto::VarType::FP16 ||
next_node->Var()->GetDataType() == proto::VarType::FP32 ||
next_node->Var()->GetDataType() == proto::VarType::FP64;
if (!is_persistable && is_float) {
int suffix = 0;
for (auto* next_node_output : next_node->outputs) {
if (!next_node_output->IsOp()) continue;
const auto& type = next_node_output->Op()->Type();
if (!cast_list.count(type) && type != "cast") {
std::string old_name = next_node->Name();
std::string new_name =
old_name + "_cast.tmp_" + std::to_string(suffix);
suffix++;
framework::OpDesc new_op_desc(op_node->Op()->Block());
// 4 for fp16, 5 for fp32
update_cast_desc(new_op_desc, old_name, new_name, 5, 4);
auto* new_op = graph->CreateOpNode(&new_op_desc);
VarDesc out_var(new_name);
out_var.SetPersistable(false);
auto* node_var = graph->CreateVarNode(&out_var);
next_node_output->Op()->RenameInput(old_name, new_name);
IR_NODE_LINK_TO(next_node, new_op);
IR_NODE_LINK_TO(new_op, node_var);
IR_NODE_LINK_TO(node_var, next_node_output);
}
}
}
}
}
};
for (auto* op_node :
ir::TopologyVarientSort(*graph, static_cast<ir::SortKind>(0))) {
if (!op_node->IsOp() || op_node->Op()->Type() == "feed" ||
op_node->Op()->Type() == "fetch")
continue;
const auto& type = op_node->Op()->Type();
if (blacklist.count(type)) {
cast_input(graph, op_node, blacklist);
cast_output(graph, op_node, blacklist);
}
}
}
void MixedPrecisionConfigurePass::ApplyImpl(Graph* graph) const {
const auto blacklist =
Get<std::unordered_set<std::string>>("gpu_fp16_disabled_op_types");
InsertCastOps(graph, blacklist);
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(mixed_precision_configure_pass,
paddle::framework::ir::MixedPrecisionConfigurePass);
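Note on the hard-coded dtypes above: the 4 and 5 passed to update_cast_desc correspond to proto::VarType::FP16 and proto::VarType::FP32. A minimal sketch of the same cast OpDesc written against the enum rather than raw integers (illustration only, not part of this commit; the variable names are hypothetical):
// Sketch: build the cast OpDesc using the VarType enum instead of the raw
// integers 4 (FP16) and 5 (FP32) used in update_cast_desc above.
// Assumes the surrounding paddle::framework namespace is available.
framework::OpDesc cast_desc;
cast_desc.SetType("cast");
cast_desc.SetInput("X", {"x_var"});                // hypothetical input var name
cast_desc.SetOutput("Out", {"x_var_cast.tmp_0"});  // hypothetical output var name
cast_desc.SetAttr("in_dtype",
                  static_cast<int>(framework::proto::VarType::FP32));
cast_desc.SetAttr("out_dtype",
                  static_cast<int>(framework::proto::VarType::FP16));
cast_desc.SetAttr("use_mkldnn", false);
cast_desc.SetAttr("with_quant_attr", false);
cast_desc.Flush();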
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
using StringSet = std::unordered_set<std::string>;
class MixedPrecisionConfigurePass : public FusePassBase {
public:
MixedPrecisionConfigurePass() = default;
virtual ~MixedPrecisionConfigurePass() {}
protected:
void ApplyImpl(Graph* graph) const override;
private:
void InsertCastOps(Graph* graph, const StringSet& blacklist) const;
};
} // namespace ir
} // namespace framework
} // namespace paddle
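For reference, a pass registered via REGISTER_PASS is fetched and configured by the analysis pipeline much like the ir_pass_manager.cc hunk further down; a hedged standalone sketch (the blacklist entry and the `program` variable are assumptions, not taken from this commit):
// Sketch: retrieve the registered pass, set the attribute its ApplyImpl reads,
// and run it on a graph (mirrors IRPassManager::CreatePasses/Apply below).
// Assumes #include "paddle/fluid/framework/ir/pass.h" and an existing
// framework::ProgramDesc named `program`.
std::unique_ptr<framework::ir::Graph> graph(new framework::ir::Graph(program));
auto pass = framework::ir::PassRegistry::Instance().Get(
    "mixed_precision_configure_pass");
pass->Set("gpu_fp16_disabled_op_types",
          new std::unordered_set<std::string>({"softmax"}));  // hypothetical entry
graph.reset(pass->Apply(graph.release()));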
......@@ -80,7 +80,8 @@ struct Argument {
public: \
type__& field__() { \
PADDLE_ENFORCE_EQ( \
Has(#field__), true, \
Has(#field__), \
true, \
platform::errors::PreconditionNotMet("There is no such field")); \
return field__##_; \
} \
......@@ -100,10 +101,12 @@ struct Argument {
#define DECL_ARGUMENT_UNIQUE_FIELD(field__, Field, type__) \
public: \
type__& field__() { \
PADDLE_ENFORCE_NOT_NULL(field__##_, platform::errors::PreconditionNotMet( \
"field should not be null.")); \
PADDLE_ENFORCE_NOT_NULL( \
field__##_, \
platform::errors::PreconditionNotMet("field should not be null.")); \
PADDLE_ENFORCE_EQ( \
Has(#field__), true, \
Has(#field__), \
true, \
platform::errors::PreconditionNotMet("There is no such field")); \
return *static_cast<type__*>(field__##_.get()); \
} \
......@@ -119,13 +122,15 @@ struct Argument {
DECL_ARGUMENT_FIELD_VALID(field__); \
type__* field__##_ptr() { \
PADDLE_ENFORCE_EQ( \
Has(#field__), true, \
Has(#field__), \
true, \
platform::errors::PreconditionNotMet("There is no such field")); \
return static_cast<type__*>(field__##_.get()); \
} \
type__* Release##Field() { \
PADDLE_ENFORCE_EQ( \
Has(#field__), true, \
Has(#field__), \
true, \
platform::errors::PreconditionNotMet("There is no such field")); \
valid_fields_.erase(#field__); \
return static_cast<type__*>(field__##_.release()); \
......@@ -153,34 +158,40 @@ struct Argument {
DECL_ARGUMENT_UNIQUE_FIELD(main_program, MainProgram, framework::ProgramDesc);
// The ir passes to perform in analysis phase.
DECL_ARGUMENT_FIELD(ir_analysis_passes, IrAnalysisPasses,
DECL_ARGUMENT_FIELD(ir_analysis_passes,
IrAnalysisPasses,
std::vector<std::string>);
DECL_ARGUMENT_FIELD(analysis_passes, AnalysisPasses,
DECL_ARGUMENT_FIELD(analysis_passes,
AnalysisPasses,
std::vector<std::string>);
// whether to mute all logs in inference.
DECL_ARGUMENT_FIELD(disable_logs, DisableLogs, bool);
// Pass a set of op types to enable their mkldnn kernels
DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes,
DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types,
MKLDNNEnabledOpTypes,
std::unordered_set<std::string>);
// The cache capacity of different input shapes for mkldnn.
DECL_ARGUMENT_FIELD(mkldnn_cache_capacity, MkldnnCacheCapacity, int);
#ifdef PADDLE_WITH_MKLDNN
// A set of op types to enable their quantized kernels
DECL_ARGUMENT_FIELD(quantize_enabled_op_types, QuantizeEnabledOpTypes,
DECL_ARGUMENT_FIELD(quantize_enabled_op_types,
QuantizeEnabledOpTypes,
std::unordered_set<std::string>);
// A set of op IDs to exclude from enabling their quantized kernels
DECL_ARGUMENT_FIELD(quantize_excluded_op_ids, QuantizeExcludedOpIds,
DECL_ARGUMENT_FIELD(quantize_excluded_op_ids,
QuantizeExcludedOpIds,
std::unordered_set<int>);
// Scales for variables to be quantized
DECL_ARGUMENT_FIELD(quant_var_scales, QuantVarScales, VarQuantScale);
// A set of op types to enable their bfloat16 kernels
DECL_ARGUMENT_FIELD(bfloat16_enabled_op_types, Bfloat16EnabledOpTypes,
DECL_ARGUMENT_FIELD(bfloat16_enabled_op_types,
Bfloat16EnabledOpTypes,
std::unordered_set<std::string>);
DECL_ARGUMENT_FIELD(use_mkldnn_int8, UseMkldnnInt8, bool);
......@@ -190,9 +201,6 @@ struct Argument {
DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);
DECL_ARGUMENT_FIELD(use_fc_padding, UseFcPadding, bool);
DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int);
DECL_ARGUMENT_FIELD(use_gpu_fp16, UseGPUFp16, bool);
DECL_ARGUMENT_FIELD(gpu_fp16_disabled_op_types, GpuFp16DisabledOpTypes,
std::unordered_set<std::string>);
// Usually used for trt dynamic shape.
// TRT will select the best kernel according to opt shape
......@@ -209,25 +217,33 @@ struct Argument {
DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int);
DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int);
DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);
DECL_ARGUMENT_FIELD(tensorrt_disabled_ops, TensorRtDisabledOPs,
DECL_ARGUMENT_FIELD(tensorrt_disabled_ops,
TensorRtDisabledOPs,
std::vector<std::string>);
DECL_ARGUMENT_FIELD(tensorrt_precision_mode, TensorRtPrecisionMode,
DECL_ARGUMENT_FIELD(tensorrt_precision_mode,
TensorRtPrecisionMode,
AnalysisConfig::Precision);
DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine,
DECL_ARGUMENT_FIELD(tensorrt_use_static_engine,
TensorRtUseStaticEngine,
bool);
DECL_ARGUMENT_FIELD(tensorrt_use_calib_mode, TensorRtUseCalibMode, bool);
DECL_ARGUMENT_FIELD(tensorrt_use_varseqlen, TensorRtUseOSS, bool);
DECL_ARGUMENT_FIELD(tensorrt_with_interleaved, TensorRtWithInterleaved, bool);
DECL_ARGUMENT_FIELD(tensorrt_transformer_posid, TensorRtTransformerPosid,
DECL_ARGUMENT_FIELD(tensorrt_transformer_posid,
TensorRtTransformerPosid,
std::string);
DECL_ARGUMENT_FIELD(tensorrt_transformer_maskid, TensorRtTransformerMaskid,
DECL_ARGUMENT_FIELD(tensorrt_transformer_maskid,
TensorRtTransformerMaskid,
std::string);
DECL_ARGUMENT_FIELD(tensorrt_shape_range_info_path,
TensorRtShapeRangeInfoPath, std::string);
DECL_ARGUMENT_FIELD(tensorrt_tuned_dynamic_shape, TensorRtTunedDynamicShape,
TensorRtShapeRangeInfoPath,
std::string);
DECL_ARGUMENT_FIELD(tensorrt_tuned_dynamic_shape,
TensorRtTunedDynamicShape,
bool);
DECL_ARGUMENT_FIELD(tensorrt_allow_build_at_runtime,
TensorRtAllowBuildAtRuntime, bool);
TensorRtAllowBuildAtRuntime,
bool);
DECL_ARGUMENT_FIELD(tensorrt_use_inspector, TensorRtUseInspector, bool);
DECL_ARGUMENT_FIELD(use_dlnne, UseDlnne, bool);
......@@ -235,10 +251,12 @@ struct Argument {
DECL_ARGUMENT_FIELD(dlnne_max_batch_size, DlnneMaxBatchSize, int);
DECL_ARGUMENT_FIELD(dlnne_workspace_size, DlnneWorkspaceSize, int);
DECL_ARGUMENT_FIELD(lite_passes_filter, LitePassesFilter,
DECL_ARGUMENT_FIELD(lite_passes_filter,
LitePassesFilter,
std::vector<std::string>);
DECL_ARGUMENT_FIELD(lite_ops_filter, LiteOpsFilter, std::vector<std::string>);
DECL_ARGUMENT_FIELD(lite_precision_mode, LitePrecisionMode,
DECL_ARGUMENT_FIELD(lite_precision_mode,
LitePrecisionMode,
AnalysisConfig::Precision);
DECL_ARGUMENT_FIELD(lite_zero_copy, LiteZeroCopy, bool);
......@@ -252,19 +270,26 @@ struct Argument {
DECL_ARGUMENT_FIELD(xpu_device_id, XpuDeviceId, int);
DECL_ARGUMENT_FIELD(use_nnadapter, UseNNAdapter, bool);
DECL_ARGUMENT_FIELD(nnadapter_model_cache_dir, NNAdapterModelCacheDir,
DECL_ARGUMENT_FIELD(nnadapter_model_cache_dir,
NNAdapterModelCacheDir,
std::string);
DECL_ARGUMENT_FIELD(nnadapter_device_names, NNAdapterDeviceNames,
DECL_ARGUMENT_FIELD(nnadapter_device_names,
NNAdapterDeviceNames,
std::vector<std::string>);
DECL_ARGUMENT_FIELD(nnadapter_context_properties, NNAdapterContextProperties,
DECL_ARGUMENT_FIELD(nnadapter_context_properties,
NNAdapterContextProperties,
std::string);
DECL_ARGUMENT_FIELD(nnadapter_subgraph_partition_config_buffer,
NNAdapterSubgraphPartitionConfigBuffer, std::string);
NNAdapterSubgraphPartitionConfigBuffer,
std::string);
DECL_ARGUMENT_FIELD(nnadapter_subgraph_partition_config_path,
NNAdapterSubgraphPartitionConfigPath, std::string);
DECL_ARGUMENT_FIELD(nnadapter_model_cache_token, NNAdapterModelCacheToken,
NNAdapterSubgraphPartitionConfigPath,
std::string);
DECL_ARGUMENT_FIELD(nnadapter_model_cache_token,
NNAdapterModelCacheToken,
std::vector<std::string>);
DECL_ARGUMENT_FIELD(nnadapter_model_cache_buffer, NNAdapterModelCacheBuffer,
DECL_ARGUMENT_FIELD(nnadapter_model_cache_buffer,
NNAdapterModelCacheBuffer,
std::vector<std::vector<char>>);
// Memory optimized related.
......@@ -275,13 +300,15 @@ struct Argument {
DECL_ARGUMENT_FIELD(memory_optim_sort_kind, MemoryOptimSortKind, int);
// The program transformed by IR analysis phase.
DECL_ARGUMENT_UNIQUE_FIELD(ir_analyzed_program, IrAnalyzedProgram,
DECL_ARGUMENT_UNIQUE_FIELD(ir_analyzed_program,
IrAnalyzedProgram,
framework::proto::ProgramDesc);
DECL_ARGUMENT_FIELD(fusion_statis, FusionStatis, fusion_statis_t);
// Only used in paddle-lite subgraph.
DECL_ARGUMENT_FIELD(cpu_math_library_num_threads, CpuMathLibraryNumThreads,
DECL_ARGUMENT_FIELD(cpu_math_library_num_threads,
CpuMathLibraryNumThreads,
int);
// ipu related
......@@ -293,7 +320,8 @@ struct Argument {
DECL_ARGUMENT_FIELD(ipu_enable_fp16, IpuEnableFp16, bool);
DECL_ARGUMENT_FIELD(ipu_replica_num, IpuReplicaNum, int);
DECL_ARGUMENT_FIELD(ipu_available_memory_proportion,
IpuAvailableMemoryProportion, float);
IpuAvailableMemoryProportion,
float);
DECL_ARGUMENT_FIELD(ipu_enable_half_partial, IpuEnableHalfPartial, bool);
// npu related
......@@ -306,7 +334,8 @@ struct Argument {
#define ARGUMENT_CHECK_FIELD(argument__, fieldname__) \
PADDLE_ENFORCE_EQ( \
argument__->Has(#fieldname__), true, \
argument__->Has(#fieldname__), \
true, \
platform::errors::PreconditionNotMet( \
"the argument field [%s] should be set", #fieldname__));
......
......@@ -68,11 +68,14 @@ void IRPassManager::CreatePasses(Argument *argument,
auto precision_mode = argument->tensorrt_precision_mode();
bool enable_int8 = precision_mode == AnalysisConfig::Precision::kInt8;
pass->Set("enable_int8", new bool(enable_int8));
pass->Set("max_input_shape", new std::map<std::string, std::vector<int>>(
pass->Set("max_input_shape",
new std::map<std::string, std::vector<int>>(
argument->max_input_shape()));
pass->Set("min_input_shape", new std::map<std::string, std::vector<int>>(
pass->Set("min_input_shape",
new std::map<std::string, std::vector<int>>(
argument->min_input_shape()));
pass->Set("optim_input_shape", new std::map<std::string, std::vector<int>>(
pass->Set("optim_input_shape",
new std::map<std::string, std::vector<int>>(
argument->optim_input_shape()));
// tuned trt dynamic_shape
pass->Set("trt_tuned_dynamic_shape",
......@@ -143,14 +146,16 @@ void IRPassManager::CreatePasses(Argument *argument,
bool int8_valid = !(model_from_memory && optim_cache_dir.empty() &&
enable_int8 && use_calib_mode);
PADDLE_ENFORCE_EQ(
int8_valid, true,
int8_valid,
true,
platform::errors::PreconditionNotMet(
"When you are in TRT INT8 mode, and load model from "
"memory, you should set optim_cache_dir using "
"config.SetOptimCacheDir()"));
if (model_from_memory && use_static_engine) {
PADDLE_ENFORCE_EQ(
optim_cache_dir.empty(), false,
optim_cache_dir.empty(),
false,
platform::errors::PreconditionNotMet(
"When you are using Paddle-TRT, and using load model "
"from memory, and also set the use_static to true. "
......@@ -161,7 +166,8 @@ void IRPassManager::CreatePasses(Argument *argument,
if (!optim_cache_dir.empty()) {
if (!PathExists(optim_cache_dir)) {
PADDLE_ENFORCE_NE(
MKDIR(optim_cache_dir.c_str()), -1,
MKDIR(optim_cache_dir.c_str()),
-1,
platform::errors::PreconditionNotMet(
"Can not create optimize cache directory: %s, Make sure you "
"have permission to write",
......@@ -187,8 +193,9 @@ void IRPassManager::CreatePasses(Argument *argument,
new std::string(argument->tensorrt_shape_range_info_path()));
pass->Set("trt_allow_build_at_runtime",
new bool(argument->tensorrt_allow_build_at_runtime()));
pass->Set("trt_disabled_ops", new std::vector<std::string>(
argument->tensorrt_disabled_ops()));
pass->Set(
"trt_disabled_ops",
new std::vector<std::string>(argument->tensorrt_disabled_ops()));
pass->Set("trt_use_dla", new bool(argument->tensorrt_use_dla()));
pass->Set("trt_dla_core", new int(argument->tensorrt_dla_core()));
// Setting the disable_trt_plugin_fp16 to true means that TRT plugin will
......@@ -200,10 +207,6 @@ void IRPassManager::CreatePasses(Argument *argument,
new int(argument->dlnne_min_subgraph_size()));
pass->Set("program",
new framework::ProgramDesc *(&argument->main_program()));
} else if (pass_name == "mixed_precision_configure_pass") {
pass->Set("gpu_fp16_disabled_op_types",
new std::unordered_set<std::string>(
argument->gpu_fp16_disabled_op_types()));
}
if (pass_name == "lite_subgraph_pass") {
bool lite_enable_int8 =
......@@ -272,8 +275,9 @@ std::unique_ptr<Graph> IRPassManager::Apply(std::unique_ptr<Graph> graph) {
if (passes_.empty()) {
return graph;
}
PADDLE_ENFORCE_NOT_NULL(graph.get(), platform::errors::PreconditionNotMet(
"Graph cannot be NULL."));
PADDLE_ENFORCE_NOT_NULL(
graph.get(),
platform::errors::PreconditionNotMet("Graph cannot be NULL."));
// Apply all the passes
for (const auto &pass : passes_) {
if (pass->Type() != "graph_viz_pass" && !disable_logs_) {
......
......@@ -15,7 +15,6 @@
#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/enforce.h"
......@@ -37,7 +36,8 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToNpu(Argument *argument) {
LOG(INFO) << "Sync params from CPU to NPU";
PADDLE_ENFORCE_EQ(argument->npu_device_id_valid(), true,
PADDLE_ENFORCE_EQ(argument->npu_device_id_valid(),
true,
platform::errors::PreconditionNotMet(
"The npu_device_id field should be valid"));
platform::Place place = platform::NPUPlace(argument->npu_device_id());
......@@ -46,8 +46,9 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToNpu(Argument *argument) {
for (auto &var_name : all_vars) {
auto *var = scope->FindLocalVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var, platform::errors::PreconditionNotMet(
"The var should not be nullptr"));
PADDLE_ENFORCE_NOT_NULL(
var,
platform::errors::PreconditionNotMet("The var should not be nullptr"));
if (var->IsType<framework::LoDTensor>() ||
var->IsType<framework::Tensor>()) {
......@@ -67,26 +68,6 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToNpu(Argument *argument) {
#else
void IrParamsSyncAmongDevicesPass::GetVarNameToOpTypeMap(
const framework::ir::Graph &graph,
std::unordered_map<std::string, std::string> *var_name_op_type_map) {
std::vector<framework::ir::Node *> node_list =
framework::ir::TopologyVarientSort(
graph, static_cast<framework::ir::SortKind>(0));
for (auto *op_node : node_list) {
if (!op_node->IsOp() || op_node->Op()->Type() == "feed" ||
op_node->Op()->Type() == "fetch")
continue;
for (auto *pre_node : op_node->inputs) {
if (pre_node->IsVar() && pre_node->Var()->Persistable()) {
var_name_op_type_map->insert(std::pair<std::string, std::string>(
pre_node->Var()->Name(), op_node->Op()->Type()));
}
}
}
}
void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) {
// The parameters are on the cpu, therefore, synchronization is not necessary.
if (!argument->use_gpu()) return;
......@@ -100,7 +81,8 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) {
LOG(INFO) << "Sync params from CPU to GPU";
PADDLE_ENFORCE_EQ(argument->gpu_device_id_valid(), true,
PADDLE_ENFORCE_EQ(argument->gpu_device_id_valid(),
true,
platform::errors::PreconditionNotMet(
"The gpu_device_id field should be valid"));
platform::Place place = platform::CUDAPlace(argument->gpu_device_id());
......@@ -124,63 +106,44 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) {
if (with_dynamic_shape) {
reserve_cpu_weights = true;
}
bool mixed_precision_mode =
argument->Has("use_gpu_fp16") && argument->use_gpu_fp16();
std::unordered_map<std::string, std::string> var_name_op_type_map{};
std::unordered_set<std::string> blacklist{};
if (mixed_precision_mode) {
GetVarNameToOpTypeMap(graph, &var_name_op_type_map);
blacklist = argument->gpu_fp16_disabled_op_types();
}
for (auto &var_name : all_vars) {
if (std::count(repetitive_params.begin(), repetitive_params.end(),
var_name)) {
if (std::count(
repetitive_params.begin(), repetitive_params.end(), var_name)) {
if (!reserve_cpu_weights) {
scope->EraseVars({var_name});
}
continue;
}
auto *var = scope->FindLocalVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var, platform::errors::PreconditionNotMet(
"The var should not be nullptr"));
PADDLE_ENFORCE_NOT_NULL(
var,
platform::errors::PreconditionNotMet("The var should not be nullptr"));
if (var->IsType<framework::LoDTensor>() ||
var->IsType<framework::Tensor>()) {
auto *t = var->GetMutable<framework::LoDTensor>();
bool is_float = t->dtype() == paddle::experimental::DataType::FLOAT32 ||
t->dtype() == paddle::experimental::DataType::FLOAT64;
if (mixed_precision_mode &&
!blacklist.count(var_name_op_type_map[var_name]) && is_float) {
framework::Tensor half_tensor;
half_tensor.set_type(paddle::experimental::DataType::FLOAT16);
half_tensor.Resize(t->dims());
auto *half_data =
half_tensor.mutable_data<float16>(platform::CPUPlace());
for (int i = 0; i < t->numel(); i++) {
auto *data = t->mutable_data<float>(platform::CPUPlace());
half_data[i] = static_cast<float16>(data[i]);
}
t->clear();
paddle::framework::TensorCopySync(half_tensor, place, t);
} else {
platform::CPUPlace cpu_place;
framework::LoDTensor temp_tensor;
temp_tensor.Resize(t->dims());
temp_tensor.mutable_data<float>(cpu_place);
// Copy the parameter data to a tmp tensor.
paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor);
// Reallocation the space on GPU
t->clear();
// Copy parameter data to newly allocated GPU space.
paddle::framework::TensorCopySync(temp_tensor, place, t);
}
}
}
}
#endif
void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
PADDLE_ENFORCE_EQ(
argument->scope_valid(), true,
argument->scope_valid(),
true,
platform::errors::PreconditionNotMet("The scope field should be valid"));
#ifdef PADDLE_WITH_ASCEND_CL
......
......@@ -38,12 +38,7 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass {
#ifdef PADDLE_WITH_ASCEND_CL
void CopyParamsToNpu(Argument *argument);
#else
void GetVarNameToOpTypeMap(
const framework::ir::Graph& graph,
std::unordered_map<std::string, std::string>* var_name_op_type_map);
void CopyParamsToGpu(Argument* argument);
void CopyParamsToGpu(Argument *argument);
#endif
};
......
......@@ -84,7 +84,6 @@ void AnalysisConfig::SetModel(const std::string &prog_file_path,
Update();
}
void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
int device_id) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......@@ -101,16 +100,18 @@ void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
}
void AnalysisConfig::SetExecStream(void *stream) {
PADDLE_ENFORCE_NOT_NULL(stream, platform::errors::InvalidArgument(
"`stream` should not be nullptr"));
PADDLE_ENFORCE_NOT_NULL(
stream,
platform::errors::InvalidArgument("`stream` should not be nullptr"));
exec_stream_ = stream;
use_external_stream_ = true;
Update();
}
void *AnalysisConfig::GetExecStream() const {
PADDLE_ENFORCE_NOT_NULL(exec_stream_, platform::errors::InvalidArgument(
"`stream` should not be nullptr"));
PADDLE_ENFORCE_NOT_NULL(
exec_stream_,
platform::errors::InvalidArgument("`stream` should not be nullptr"));
return exec_stream_;
}
......@@ -124,27 +125,16 @@ void AnalysisConfig::DisableGpu() {
Update();
}
void AnalysisConfig::Exp_EnableUseGpuFp16(
std::unordered_set<std::string> op_list) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
use_gpu_fp16_ = true;
gpu_fp16_disabled_op_types_.insert(op_list.begin(), op_list.end());
#else
LOG(ERROR) << "Please compile with gpu to Exp_EnableUseGpuFp16()";
use_gpu_fp16_ = false;
#endif
Update();
}
void AnalysisConfig::DisableFCPadding() {
use_fc_padding_ = false;
Update();
}
void AnalysisConfig::EnableXpu(int l3_workspace_size, bool locked,
bool autotune, const std::string &autotune_file,
void AnalysisConfig::EnableXpu(int l3_workspace_size,
bool locked,
bool autotune,
const std::string &autotune_file,
const std::string &precision,
bool adaptive_seqlen) {
use_xpu_ = true;
......@@ -158,7 +148,8 @@ void AnalysisConfig::EnableXpu(int l3_workspace_size, bool locked,
}
void AnalysisConfig::SetXpuDeviceId(int device_id) {
PADDLE_ENFORCE_EQ(use_xpu_, true,
PADDLE_ENFORCE_EQ(use_xpu_,
true,
platform::errors::PreconditionNotMet(
"Should call EnableXpu before SetXpuDeviceId."));
xpu_device_id_ = device_id;
......@@ -190,7 +181,8 @@ void AnalysisConfig::EnableCustomDevice(const std::string &device_type,
Update();
}
void AnalysisConfig::EnableIpu(int ipu_device_num, int ipu_micro_batch_size,
void AnalysisConfig::EnableIpu(int ipu_device_num,
int ipu_micro_batch_size,
bool ipu_enable_pipelining,
int ipu_batches_per_step) {
enable_ir_optim_ = true;
......@@ -204,7 +196,8 @@ void AnalysisConfig::EnableIpu(int ipu_device_num, int ipu_micro_batch_size,
Update();
}
void AnalysisConfig::SetIpuConfig(bool ipu_enable_fp16, int ipu_replica_num,
void AnalysisConfig::SetIpuConfig(bool ipu_enable_fp16,
int ipu_replica_num,
float ipu_available_memory_proportion,
bool ipu_enable_half_partial) {
ipu_enable_fp16_ = ipu_enable_fp16;
......@@ -262,8 +255,6 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(use_cudnn_);
CP_MEMBER(gpu_device_id_);
CP_MEMBER(memory_pool_init_size_mb_);
CP_MEMBER(use_gpu_fp16_);
CP_MEMBER(gpu_fp16_disabled_op_types_);
CP_MEMBER(enable_memory_optim_);
// TensorRT related.
......@@ -366,7 +357,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(custom_device_id_);
if (use_gpu_) {
PADDLE_ENFORCE_EQ(use_xpu_, false,
PADDLE_ENFORCE_EQ(use_xpu_,
false,
platform::errors::InvalidArgument(
"Only one choice can be made between CPU and XPU."));
pass_builder_.reset(new GpuPassStrategy(
......@@ -406,8 +398,10 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
std::sort(all_passes.begin(), all_passes.end());
std::sort(other_passes.begin(), other_passes.end());
std::vector<std::string> deleted_passes;
std::set_difference(all_passes.begin(), all_passes.end(),
other_passes.begin(), other_passes.end(),
std::set_difference(all_passes.begin(),
all_passes.end(),
other_passes.begin(),
other_passes.end(),
std::inserter(deleted_passes, deleted_passes.begin()));
for (auto ps : deleted_passes) {
pass_builder_->DeletePass(ps);
......@@ -516,8 +510,11 @@ MkldnnQuantizerConfig *AnalysisConfig::mkldnn_quantizer_config() const {
}
void AnalysisConfig::EnableTensorRtEngine(
int workspace_size, int max_batch_size, int min_subgraph_size,
AnalysisConfig::Precision precision_mode, bool use_static,
int workspace_size,
int max_batch_size,
int min_subgraph_size,
AnalysisConfig::Precision precision_mode,
bool use_static,
bool use_calib_mode) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (!use_gpu()) {
......@@ -594,19 +591,22 @@ void AnalysisConfig::Update() {
pass_builder_.reset(new IpuPassStrategy);
} else if (use_xpu()) {
PADDLE_ENFORCE_EQ(
use_gpu(), false,
use_gpu(),
false,
platform::errors::InvalidArgument(
"Only one choice can be made between CPU and XPU."));
pass_builder_.reset(new XpuPassStrategy);
} else if (use_npu()) {
PADDLE_ENFORCE_EQ(
use_gpu(), false,
use_gpu(),
false,
platform::errors::InvalidArgument(
"Only one choice can be made between GPU and NPU."));
pass_builder_.reset(new NpuPassStrategy);
} else if (use_custom_device()) {
PADDLE_ENFORCE_EQ(
use_gpu(), false,
use_gpu(),
false,
platform::errors::InvalidArgument(
"Only one choice can be made between GPU and CustomDevice."));
pass_builder_.reset(new CustomDevicePassStrategy);
......@@ -624,21 +624,24 @@ void AnalysisConfig::Update() {
*static_cast<IpuPassStrategy *>(pass_builder_.get())));
} else if (use_xpu()) {
PADDLE_ENFORCE_EQ(
use_gpu(), false,
use_gpu(),
false,
platform::errors::InvalidArgument(
"Only one choice can be made between CPU and XPU."));
pass_builder_.reset(new XpuPassStrategy(
*static_cast<XpuPassStrategy *>(pass_builder_.get())));
} else if (use_npu()) {
PADDLE_ENFORCE_EQ(
use_gpu(), false,
use_gpu(),
false,
platform::errors::InvalidArgument(
"Only one choice can be made between GPU and NPU."));
pass_builder_.reset(new NpuPassStrategy(
*static_cast<NpuPassStrategy *>(pass_builder_.get())));
} else if (use_custom_device()) {
PADDLE_ENFORCE_EQ(
use_gpu(), false,
use_gpu(),
false,
platform::errors::InvalidArgument(
"Only one choice can be made between GPU and CustomDevice."));
pass_builder_.reset(new CustomDevicePassStrategy(
......@@ -677,20 +680,6 @@ void AnalysisConfig::Update() {
#endif
}
if (use_gpu_fp16_) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (!enable_ir_optim_) {
LOG(ERROR) << "Exp_EnableUseGpuFp16() only works when IR optimization is "
"enabled.";
} else if (!use_gpu()) {
LOG(ERROR)
<< "Exp_EnableUseGpuFp16() only works when use_gpu is enabled.";
} else {
pass_builder()->Exp_EnableUseGpuFp16();
}
#endif
}
if (use_mkldnn_) {
#ifdef PADDLE_WITH_MKLDNN
if (!enable_ir_optim_) {
......@@ -749,7 +738,8 @@ void AnalysisConfig::Update() {
#endif
pass_builder()->ClearPasses();
for (const auto &pass : kLiteSubgraphPasses) {
if (std::find(lite_passes_filter_.begin(), lite_passes_filter_.end(),
if (std::find(lite_passes_filter_.begin(),
lite_passes_filter_.end(),
pass) == lite_passes_filter_.end()) {
pass_builder()->AppendPass(pass);
}
......@@ -758,7 +748,8 @@ void AnalysisConfig::Update() {
if (use_xpu_) {
#if (defined LITE_SUBGRAPH_WITH_XPU) || (defined PADDLE_WITH_XPU)
PADDLE_ENFORCE_EQ(use_gpu_, false,
PADDLE_ENFORCE_EQ(use_gpu_,
false,
platform::errors::Unavailable(
"Currently, XPU and GPU cannot be enabled in the "
"same analysis configuration."));
......@@ -771,7 +762,8 @@ void AnalysisConfig::Update() {
if (use_npu_) {
#if defined(PADDLE_WITH_ASCEND_CL) || defined(LITE_SUBGRAPH_WITH_NPU)
PADDLE_ENFORCE_EQ(use_gpu_, false,
PADDLE_ENFORCE_EQ(use_gpu_,
false,
platform::errors::Unavailable(
"Currently, NPU and GPU cannot be enabled in the "
"same analysis configuration."));
......@@ -809,8 +801,6 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << use_gpu_;
ss << use_external_stream_;
ss << exec_stream_;
ss << use_gpu_fp16_;
for (auto &item : gpu_fp16_disabled_op_types_) ss << item;
ss << use_fc_padding_;
ss << gpu_device_id_;
ss << xpu_device_id_;
......@@ -957,7 +947,8 @@ void AnalysisConfig::DisableGlogInfo() {
}
void AnalysisConfig::EnableLiteEngine(
AnalysisConfig::Precision precision_mode, bool zero_copy,
AnalysisConfig::Precision precision_mode,
bool zero_copy,
const std::vector<std::string> &passes_filter,
const std::vector<std::string> &ops_filter) {
use_lite_ = true;
......@@ -1057,9 +1048,9 @@ std::string AnalysisConfig::Summary() {
// dynamic_shape
os.InsertRow({"tensorrt_enable_dynamic_shape",
min_input_shape_.empty() ? "false" : "true"});
os.InsertRow({"tensorrt_tuned_dynamic_shape", trt_tuned_dynamic_shape_
? shape_range_info_path_
: "false"});
os.InsertRow(
{"tensorrt_tuned_dynamic_shape",
trt_tuned_dynamic_shape_ ? shape_range_info_path_ : "false"});
os.InsertRow(
{"tensorrt_use_varseqlen", trt_use_varseqlen_ ? "true" : "false"});
......@@ -1123,10 +1114,12 @@ LiteNNAdapterConfig &LiteNNAdapterConfig::SetModelCacheDir(
LiteNNAdapterConfig &LiteNNAdapterConfig::SetModelCacheBuffers(
const std::string &model_cache_token,
const std::vector<char> &model_cache_buffer) {
PADDLE_ENFORCE_EQ(model_cache_token.empty(), false,
PADDLE_ENFORCE_EQ(model_cache_token.empty(),
false,
platform::errors::InvalidArgument(
"model_cache_token should not be empty."));
PADDLE_ENFORCE_EQ(model_cache_buffer.empty(), false,
PADDLE_ENFORCE_EQ(model_cache_buffer.empty(),
false,
platform::errors::InvalidArgument(
"model_cache_buffer should not be empty."));
PADDLE_ENFORCE_EQ(nnadapter_model_cache_buffers.count(model_cache_token),
......@@ -1165,7 +1158,8 @@ void AnalysisConfig::CollectShapeRangeInfo(
<< "all intermediate tensors in the compute graph and calculate "
"the min_shape, max_shape and opt_shape.";
collect_shape_range_info_ = true;
PADDLE_ENFORCE_EQ(shape_range_info_path.empty(), false,
PADDLE_ENFORCE_EQ(shape_range_info_path.empty(),
false,
platform::errors::InvalidArgument(
"The shape_range_info_path should not be empty, please "
"re-check the argument."));
......
......@@ -1048,11 +1048,6 @@ void AnalysisPredictor::PrepareArgument() {
argument_.SetDlnneMinSubgraphSize(config_.dlnne_min_subgraph_size_);
}
if (config_.gpu_fp16_enabled()) {
argument_.SetUseGPUFp16(true);
argument_.SetGpuFp16DisabledOpTypes(config_.gpu_fp16_disabled_op_types_);
}
if (config_.lite_engine_enabled()) {
argument_.SetCpuMathLibraryNumThreads(
config_.cpu_math_library_num_threads());
......
......@@ -371,19 +371,6 @@ TEST(AnalysisPredictor, enable_onnxruntime) {
ASSERT_TRUE(!config.use_onnxruntime());
}
TEST(AnalysisPredictor, exp_enable_use_gpu_fp16) {
AnalysisConfig config;
config.SwitchIrOptim();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
config.EnableUseGpu(100, 0);
config.Exp_EnableUseGpuFp16();
ASSERT_TRUE(config.gpu_fp16_enabled());
#else
config.DisableGpu();
#endif
LOG(INFO) << config.Summary();
}
} // namespace paddle
namespace paddle_infer {
......@@ -443,19 +430,6 @@ TEST(Predictor, EnableONNXRuntime) {
auto predictor = CreatePredictor(config);
}
TEST(Predictor, Exp_EnableUseGpuFp16) {
Config config;
config.SetModel(FLAGS_dirname);
config.SwitchIrOptim();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
config.EnableUseGpu(100, 0);
config.Exp_EnableUseGpuFp16();
#else
config.DisableGpu();
#endif
auto predictor = CreatePredictor(config);
}
TEST(Tensor, CpuShareExternalData) {
Config config;
config.SetModel(FLAGS_dirname);
......@@ -476,8 +450,8 @@ TEST(Tensor, CpuShareExternalData) {
auto out = predictor->GetOutputHandle("fc_1.tmp_2");
auto out_shape = out->shape();
std::vector<float> out_data;
out_data.resize(std::accumulate(out_shape.begin(), out_shape.end(), 1,
std::multiplies<int>()));
out_data.resize(std::accumulate(
out_shape.begin(), out_shape.end(), 1, std::multiplies<int>()));
out->ShareExternalData<float>(out_data.data(), out_shape, PlaceType::kCPU);
predictor->Run();
......@@ -507,7 +481,9 @@ TEST(Tensor, GpuShareExternalData) {
for (size_t i = 0; i < 4; ++i) {
cudaMalloc(reinterpret_cast<void**>(&input_gpu[i]), 4 * sizeof(int64_t));
cudaMemcpy(input_gpu[i], input_data[i].data(), 4 * sizeof(int64_t),
cudaMemcpy(input_gpu[i],
input_data[i].data(),
4 * sizeof(int64_t),
cudaMemcpyHostToDevice);
}
......@@ -519,8 +495,9 @@ TEST(Tensor, GpuShareExternalData) {
auto out = predictor->GetOutputHandle("fc_1.tmp_2");
auto out_shape = out->shape();
float* out_data = nullptr;
auto out_size = std::accumulate(out_shape.begin(), out_shape.end(), 1,
std::multiplies<int>()) *
auto out_size =
std::accumulate(
out_shape.begin(), out_shape.end(), 1, std::multiplies<int>()) *
sizeof(float);
cudaMalloc(reinterpret_cast<void**>(out_data), out_size * sizeof(float));
out->ShareExternalData<float>(out_data, out_shape, PlaceType::kGPU);
......
......@@ -253,19 +253,6 @@ struct PD_INFER_DECL AnalysisConfig {
///
///
void DisableGpu();
///
/// \brief Enable GPU fp16 precision computation, in experimental state.
///
/// \param op_list The operator type list.
///
void Exp_EnableUseGpuFp16(std::unordered_set<std::string> op_list = {});
///
/// \brief A boolean state telling whether the GPU fp16 precision is turned
/// on.
///
/// \return bool Whether the GPU fp16 precision is turned on.
///
bool gpu_fp16_enabled() const { return use_gpu_fp16_; }
///
/// \brief Turn on XPU.
......@@ -287,8 +274,10 @@ struct PD_INFER_DECL AnalysisConfig {
/// \param precision Calculation accuracy of multi_encoder
/// \param adaptive_seqlen Is the input of multi_encoder variable length
///
void EnableXpu(int l3_workspace_size = 0xfffc00, bool locked = false,
bool autotune = true, const std::string& autotune_file = "",
void EnableXpu(int l3_workspace_size = 0xfffc00,
bool locked = false,
bool autotune = true,
const std::string& autotune_file = "",
const std::string& precision = "int16",
bool adaptive_seqlen = false);
......@@ -301,7 +290,8 @@ struct PD_INFER_DECL AnalysisConfig {
/// \param ipu_enable_pipelining enable pipelining.
/// \param ipu_batches_per_step the number of batches per run in pipelining.
///
void EnableIpu(int ipu_device_num = 1, int ipu_micro_batch_size = 1,
void EnableIpu(int ipu_device_num = 1,
int ipu_micro_batch_size = 1,
bool ipu_enable_pipelining = false,
int ipu_batches_per_step = 1);
......@@ -315,7 +305,8 @@ struct PD_INFER_DECL AnalysisConfig {
/// \param ipu_enable_half_partial enable fp16 partial for matmul, only work
/// with fp16.
///
void SetIpuConfig(bool ipu_enable_fp16 = false, int ipu_replica_num = 1,
void SetIpuConfig(bool ipu_enable_fp16 = false,
int ipu_replica_num = 1,
float ipu_available_memory_proportion = 1.0,
bool ipu_enable_half_partial = false);
......@@ -525,7 +516,8 @@ struct PD_INFER_DECL AnalysisConfig {
///
///
void EnableTensorRtEngine(int workspace_size = 1 << 20,
int max_batch_size = 1, int min_subgraph_size = 3,
int max_batch_size = 1,
int min_subgraph_size = 3,
Precision precision = Precision::kFloat32,
bool use_static = false,
bool use_calib_mode = true);
......@@ -821,8 +813,10 @@ struct PD_INFER_DECL AnalysisConfig {
/// \param params_buffer The memory buffer of the combined parameters file.
/// \param params_buffer_size The size of the combined parameters data.
///
void SetModelBuffer(const char* prog_buffer, size_t prog_buffer_size,
const char* params_buffer, size_t params_buffer_size);
void SetModelBuffer(const char* prog_buffer,
size_t prog_buffer_size,
const char* params_buffer,
size_t params_buffer_size);
///
/// \brief A boolean state telling whether the model is set from the CPU
/// memory.
......@@ -929,20 +923,6 @@ struct PD_INFER_DECL AnalysisConfig {
int gpu_device_id_{0};
uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB.
bool thread_local_stream_{false};
bool use_gpu_fp16_{false};
std::unordered_set<std::string> gpu_fp16_disabled_op_types_{
"conv2d_fusion",
"conv2d",
"roll",
"strided_slice",
"depthwise_conv2d",
"unfold",
"generate_proposals_v2",
"nearest_interp_v2",
"bilinear_interp_v2"
"yolo_box",
"multiclass_nms3",
"matrix_nms"};
bool use_cudnn_{false};
bool use_external_stream_{false};
......
......@@ -194,40 +194,6 @@ void GpuPassStrategy::EnableCUDNN() {
use_cudnn_ = true;
}
void GpuPassStrategy::Exp_EnableUseGpuFp16() {
passes_.assign({
"is_test_pass", //
"simplify_with_basic_ops_pass", //
"conv_bn_fuse_pass", //
"conv_eltwiseadd_bn_fuse_pass", //
"embedding_eltwise_layernorm_fuse_pass", //
"multihead_matmul_fuse_pass_v2", //
"gpu_cpu_squeeze2_matmul_fuse_pass", //
"gpu_cpu_reshape2_matmul_fuse_pass", //
"gpu_cpu_flatten2_matmul_fuse_pass", //
"gpu_cpu_map_matmul_v2_to_mul_pass", //
"gpu_cpu_map_matmul_v2_to_matmul_pass", //
"gpu_cpu_map_matmul_to_mul_pass", //
// "fc_fuse_pass", //
"fc_elementwise_layernorm_fuse_pass", //
#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be
// guaranteed at least v7
// cudnn8.0 has memory leak problem in conv + eltwise + act, so we
// disable the pass.
#if !(CUDNN_VERSION >= 8000 && CUDNN_VERSION < 8100)
"conv_elementwise_add_act_fuse_pass", //
"conv_elementwise_add2_act_fuse_pass", //
#endif
"conv_elementwise_add_fuse_pass", //
#endif //
"transpose_flatten_concat_fuse_pass", //
"mixed_precision_configure_pass", //
"runtime_context_cache_pass" //
});
use_gpu_fp16_ = true;
}
void GpuPassStrategy::EnableMKLDNN() {
LOG(ERROR) << "GPU not support MKLDNN yet";
}
......
......@@ -109,8 +109,11 @@ class PD_INFER_DECL PaddlePassBuilder {
protected:
/// \cond Protected
std::vector<std::string> analysis_passes_{
{"ir_graph_build_pass", "ir_graph_clean_pass", "ir_analysis_pass",
"ir_params_sync_among_devices_pass", "adjust_cudnn_workspace_size_pass",
{"ir_graph_build_pass",
"ir_graph_clean_pass",
"ir_analysis_pass",
"ir_params_sync_among_devices_pass",
"adjust_cudnn_workspace_size_pass",
"inference_op_replace_pass"}};
std::vector<std::string> passes_;
/// \endcond
......@@ -129,9 +132,6 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
/// \brief Enable the use of cuDNN kernel.
virtual void EnableCUDNN() {}
/// \brief Enable use gpu fp16 kernel.
virtual void Exp_EnableUseGpuFp16() {}
/// \brief Enable the use of MKLDNN.
/// The MKLDNN control exists in both CPU and GPU mode, because there can
/// still be some CPU kernels running in GPU mode.
......@@ -150,10 +150,6 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
/// \return A bool variable implying whether we are in gpu mode.
bool use_gpu() const { return use_gpu_; }
/// \brief Check if we are using gpu fp16 kernel.
/// \return A bool variable implying whether we are in gpu fp16 mode.
bool use_gpu_fp16() const { return use_gpu_fp16_; }
/// \brief Check if we are using xpu.
/// \return A bool variable implying whether we are in xpu mode.
bool use_xpu() const { return use_xpu_; }
......@@ -180,7 +176,6 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
bool use_npu_{false};
bool use_ipu_{false};
bool use_mkldnn_{false};
bool use_gpu_fp16_{false};
bool use_custom_device_{false};
/// \endcond
};
......@@ -248,9 +243,6 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy {
/// \brief Enable the use of cuDNN kernel.
void EnableCUDNN() override;
/// \brief Enable the use of gpu fp16 kernel.
void Exp_EnableUseGpuFp16() override;
/// \brief Not supported in GPU mode yet.
void EnableMKLDNN() override;
......@@ -269,7 +261,6 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy {
protected:
/// \cond Protected
bool use_cudnn_{false};
bool use_gpu_fp16_{false};
/// \endcond
};
......
......@@ -113,7 +113,8 @@ template <typename T>
PaddleBuf PaddleBufCreate(
py::array_t<T, py::array::c_style | py::array::forcecast> data) {
PaddleBuf buf(data.size() * sizeof(T));
std::copy_n(static_cast<const T *>(data.data()), data.size(),
std::copy_n(static_cast<const T *>(data.data()),
data.size(),
static_cast<T *>(buf.data()));
return buf;
}
......@@ -123,7 +124,8 @@ void PaddleBufReset(
PaddleBuf &buf, // NOLINT
py::array_t<T, py::array::c_style | py::array::forcecast> data) { // NOLINT
buf.Resize(data.size() * sizeof(T));
std::copy_n(static_cast<const T *>(data.data()), data.size(),
std::copy_n(static_cast<const T *>(data.data()),
data.size(),
static_cast<T *>(buf.data()));
}
......@@ -131,12 +133,14 @@ template <typename T>
PaddleTensor PaddleTensorCreate(
py::array_t<T, py::array::c_style | py::array::forcecast> data,
const std::string name = "",
const std::vector<std::vector<size_t>> &lod = {}, bool copy = true) {
const std::vector<std::vector<size_t>> &lod = {},
bool copy = true) {
PaddleTensor tensor;
if (copy) {
PaddleBuf buf(data.size() * sizeof(T));
std::copy_n(static_cast<const T *>(data.data()), data.size(),
std::copy_n(static_cast<const T *>(data.data()),
data.size(),
static_cast<T *>(buf.data()));
tensor.data = std::move(buf);
} else {
......@@ -235,11 +239,13 @@ void PaddleInferShareExternalData(paddle_infer::Tensor &tensor, // NOLINT
}
if (input_tensor.dtype() == phi::DataType::FLOAT32) {
tensor.ShareExternalData(
static_cast<float *>(input_tensor.data()), shape,
static_cast<float *>(input_tensor.data()),
shape,
ToPaddleInferPlace(input_tensor.place().GetType()));
} else if (input_tensor.dtype() == phi::DataType::FLOAT16) {
tensor.ShareExternalData(
static_cast<paddle::platform::float16 *>(input_tensor.data()), shape,
static_cast<paddle::platform::float16 *>(input_tensor.data()),
shape,
ToPaddleInferPlace(input_tensor.place().GetType()));
}
}
......@@ -379,9 +385,11 @@ void BindInferenceApi(py::module *m) {
BindMkldnnQuantizerConfig(m);
#endif
m->def("create_paddle_predictor",
&paddle::CreatePaddlePredictor<AnalysisConfig>, py::arg("config"));
&paddle::CreatePaddlePredictor<AnalysisConfig>,
py::arg("config"));
m->def("create_paddle_predictor",
&paddle::CreatePaddlePredictor<NativeConfig>, py::arg("config"));
&paddle::CreatePaddlePredictor<NativeConfig>,
py::arg("config"));
m->def("create_predictor",
[](const paddle_infer::Config &config)
-> std::unique_ptr<paddle_infer::Predictor> {
......@@ -478,15 +486,18 @@ void BindPaddleBuf(py::module *m) {
void BindPaddleTensor(py::module *m) {
py::class_<PaddleTensor>(*m, "PaddleTensor")
.def(py::init<>())
.def(py::init(&PaddleTensorCreate<int32_t>), py::arg("data"),
.def(py::init(&PaddleTensorCreate<int32_t>),
py::arg("data"),
py::arg("name") = "",
py::arg("lod") = std::vector<std::vector<size_t>>(),
py::arg("copy") = true)
.def(py::init(&PaddleTensorCreate<int64_t>), py::arg("data"),
.def(py::init(&PaddleTensorCreate<int64_t>),
py::arg("data"),
py::arg("name") = "",
py::arg("lod") = std::vector<std::vector<size_t>>(),
py::arg("copy") = true)
.def(py::init(&PaddleTensorCreate<float>), py::arg("data"),
.def(py::init(&PaddleTensorCreate<float>),
py::arg("data"),
py::arg("name") = "",
py::arg("lod") = std::vector<std::vector<size_t>>(),
py::arg("copy") = true)
......@@ -563,7 +574,8 @@ void BindNativePredictor(py::module *m) {
.def("get_output_tensor", &NativePaddlePredictor::GetOutputTensor)
.def("zero_copy_run", &NativePaddlePredictor::ZeroCopyRun)
.def("clone", &NativePaddlePredictor::Clone)
.def("scope", &NativePaddlePredictor::scope,
.def("scope",
&NativePaddlePredictor::scope,
py::return_value_policy::reference);
}
......@@ -581,7 +593,8 @@ void BindAnalysisConfig(py::module *m) {
.def(py::init<const std::string &>())
.def(py::init<const std::string &, const std::string &>())
.def("summary", &AnalysisConfig::Summary)
.def("set_model", (void(AnalysisConfig::*)(const std::string &)) &
.def("set_model",
(void(AnalysisConfig::*)(const std::string &)) &
AnalysisConfig::SetModel)
.def("set_model",
(void(AnalysisConfig::*)(const std::string &, const std::string &)) &
......@@ -591,25 +604,32 @@ void BindAnalysisConfig(py::module *m) {
.def("model_dir", &AnalysisConfig::model_dir)
.def("prog_file", &AnalysisConfig::prog_file)
.def("params_file", &AnalysisConfig::params_file)
.def("enable_use_gpu", &AnalysisConfig::EnableUseGpu,
py::arg("memory_pool_init_size_mb"), py::arg("device_id") = 0)
.def("exp_enable_use_gpu_fp16", &AnalysisConfig::Exp_EnableUseGpuFp16,
py::arg("gpu_fp16_disabled_op_types") =
std::unordered_set<std::string>({}))
.def("enable_xpu", &AnalysisConfig::EnableXpu,
.def("enable_use_gpu",
&AnalysisConfig::EnableUseGpu,
py::arg("memory_pool_init_size_mb"),
py::arg("device_id") = 0)
.def("enable_xpu",
&AnalysisConfig::EnableXpu,
py::arg("l3_workspace_size") = 16 * 1024 * 1024,
py::arg("locked") = false, py::arg("autotune") = true,
py::arg("autotune_file") = "", py::arg("precision") = "int16",
py::arg("locked") = false,
py::arg("autotune") = true,
py::arg("autotune_file") = "",
py::arg("precision") = "int16",
py::arg("adaptive_seqlen") = false)
.def("set_xpu_device_id", &AnalysisConfig::SetXpuDeviceId,
.def("set_xpu_device_id",
&AnalysisConfig::SetXpuDeviceId,
py::arg("device_id") = 0)
.def("enable_npu", &AnalysisConfig::EnableNpu, py::arg("device_id") = 0)
.def("enable_ipu", &AnalysisConfig::EnableIpu,
py::arg("ipu_device_num") = 1, py::arg("ipu_micro_batch_size") = 1,
.def("enable_ipu",
&AnalysisConfig::EnableIpu,
py::arg("ipu_device_num") = 1,
py::arg("ipu_micro_batch_size") = 1,
py::arg("ipu_enable_pipelining") = false,
py::arg("ipu_batches_per_step") = 1)
.def("set_ipu_config", &AnalysisConfig::SetIpuConfig,
py::arg("ipu_enable_fp16") = false, py::arg("ipu_replica_num") = 1,
.def("set_ipu_config",
&AnalysisConfig::SetIpuConfig,
py::arg("ipu_enable_fp16") = false,
py::arg("ipu_replica_num") = 1,
py::arg("ipu_available_memory_proportion") = 1.0,
py::arg("ipu_enable_half_partial") = false)
.def("disable_gpu", &AnalysisConfig::DisableGpu)
......@@ -627,27 +647,34 @@ void BindAnalysisConfig(py::module *m) {
&AnalysisConfig::memory_pool_init_size_mb)
.def("fraction_of_gpu_memory_for_pool",
&AnalysisConfig::fraction_of_gpu_memory_for_pool)
.def("switch_ir_optim", &AnalysisConfig::SwitchIrOptim,
.def("switch_ir_optim",
&AnalysisConfig::SwitchIrOptim,
py::arg("x") = true)
.def("ir_optim", &AnalysisConfig::ir_optim)
.def("enable_memory_optim", &AnalysisConfig::EnableMemoryOptim,
.def("enable_memory_optim",
&AnalysisConfig::EnableMemoryOptim,
py::arg("x") = true)
.def("enable_profile", &AnalysisConfig::EnableProfile)
.def("disable_glog_info", &AnalysisConfig::DisableGlogInfo)
.def("glog_info_disabled", &AnalysisConfig::glog_info_disabled)
.def("set_optim_cache_dir", &AnalysisConfig::SetOptimCacheDir)
.def("switch_use_feed_fetch_ops", &AnalysisConfig::SwitchUseFeedFetchOps,
.def("switch_use_feed_fetch_ops",
&AnalysisConfig::SwitchUseFeedFetchOps,
py::arg("x") = true)
.def("use_feed_fetch_ops_enabled",
&AnalysisConfig::use_feed_fetch_ops_enabled)
.def("switch_specify_input_names",
&AnalysisConfig::SwitchSpecifyInputNames, py::arg("x") = true)
&AnalysisConfig::SwitchSpecifyInputNames,
py::arg("x") = true)
.def("specify_input_name", &AnalysisConfig::specify_input_name)
.def("enable_tensorrt_engine", &AnalysisConfig::EnableTensorRtEngine,
py::arg("workspace_size") = 1 << 20, py::arg("max_batch_size") = 1,
.def("enable_tensorrt_engine",
&AnalysisConfig::EnableTensorRtEngine,
py::arg("workspace_size") = 1 << 20,
py::arg("max_batch_size") = 1,
py::arg("min_subgraph_size") = 3,
py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32,
py::arg("use_static") = false, py::arg("use_calib_mode") = true)
py::arg("use_static") = false,
py::arg("use_calib_mode") = true)
.def("tensorrt_precision_mode", &AnalysisConfig::tensorrt_precision_mode)
.def("set_trt_dynamic_shape_info",
&AnalysisConfig::SetTRTDynamicShapeInfo,
......@@ -674,7 +701,8 @@ void BindAnalysisConfig(py::module *m) {
.def("trt_allow_build_at_runtime",
&AnalysisConfig::trt_allow_build_at_runtime)
.def("exp_disable_tensorrt_ops", &AnalysisConfig::Exp_DisableTensorRtOPs)
.def("enable_tensorrt_dla", &AnalysisConfig::EnableTensorRtDLA,
.def("enable_tensorrt_dla",
&AnalysisConfig::EnableTensorRtDLA,
py::arg("dla_core") = 0)
.def("tensorrt_dla_enabled", &AnalysisConfig::tensorrt_dla_enabled)
.def("enable_tensorrt_inspector",
......@@ -682,15 +710,18 @@ void BindAnalysisConfig(py::module *m) {
.def("tensorrt_inspector_enabled",
&AnalysisConfig::tensorrt_inspector_enabled)
.def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled)
.def("enable_dlnne", &AnalysisConfig::EnableDlnne,
.def("enable_dlnne",
&AnalysisConfig::EnableDlnne,
py::arg("min_subgraph_size") = 3)
.def("enable_lite_engine", &AnalysisConfig::EnableLiteEngine,
.def("enable_lite_engine",
&AnalysisConfig::EnableLiteEngine,
py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32,
py::arg("zero_copy") = false,
py::arg("passes_filter") = std::vector<std::string>(),
py::arg("ops_filter") = std::vector<std::string>())
.def("lite_engine_enabled", &AnalysisConfig::lite_engine_enabled)
.def("switch_ir_debug", &AnalysisConfig::SwitchIrDebug,
.def("switch_ir_debug",
&AnalysisConfig::SwitchIrDebug,
py::arg("x") = true)
.def("enable_mkldnn", &AnalysisConfig::EnableMKLDNN)
.def("mkldnn_enabled", &AnalysisConfig::mkldnn_enabled)
......@@ -702,12 +733,15 @@ void BindAnalysisConfig(py::module *m) {
.def("enable_quantizer", &AnalysisConfig::EnableMkldnnQuantizer)
.def("enable_mkldnn_bfloat16", &AnalysisConfig::EnableMkldnnBfloat16)
#ifdef PADDLE_WITH_MKLDNN
.def("quantizer_config", &AnalysisConfig::mkldnn_quantizer_config,
.def("quantizer_config",
&AnalysisConfig::mkldnn_quantizer_config,
py::return_value_policy::reference)
.def("set_mkldnn_cache_capacity", &AnalysisConfig::SetMkldnnCacheCapacity,
.def("set_mkldnn_cache_capacity",
&AnalysisConfig::SetMkldnnCacheCapacity,
py::arg("capacity") = 0)
.def("set_bfloat16_op", &AnalysisConfig::SetBfloat16Op)
.def("enable_mkldnn_int8", &AnalysisConfig::EnableMkldnnInt8,
.def("enable_mkldnn_int8",
&AnalysisConfig::EnableMkldnnInt8,
py::arg("mkldnn_int8_enabled_op_types") =
std::unordered_set<std::string>({}))
.def("mkldnn_int8_enabled", &AnalysisConfig::mkldnn_int8_enabled)
......@@ -807,17 +841,20 @@ void BindAnalysisPredictor(py::module *m) {
.def("prepare_argument", &AnalysisPredictor::PrepareArgument)
.def("optimize_inference_program",
&AnalysisPredictor::OptimizeInferenceProgram)
.def("analysis_argument", &AnalysisPredictor::analysis_argument,
.def("analysis_argument",
&AnalysisPredictor::analysis_argument,
py::return_value_policy::reference)
.def("clone", &AnalysisPredictor::Clone)
.def("scope", &AnalysisPredictor::scope,
.def("scope",
&AnalysisPredictor::scope,
py::return_value_policy::reference)
.def("program", &AnalysisPredictor::program,
.def("program",
&AnalysisPredictor::program,
py::return_value_policy::reference)
.def("get_serialized_program", &AnalysisPredictor::GetSerializedProgram)
.def("mkldnn_quantize", &AnalysisPredictor::MkldnnQuantize)
.def("SaveOptimModel", &AnalysisPredictor::SaveOptimModel,
py::arg("dir"));
.def(
"SaveOptimModel", &AnalysisPredictor::SaveOptimModel, py::arg("dir"));
}
void BindPaddleInferPredictor(py::module *m) {
......@@ -842,9 +879,11 @@ void BindPaddleInferPredictor(py::module *m) {
void BindZeroCopyTensor(py::module *m) {
py::class_<ZeroCopyTensor>(*m, "ZeroCopyTensor")
.def("reshape", py::overload_cast<const std::vector<int> &>(
&ZeroCopyTensor::Reshape))
.def("reshape", py::overload_cast<const std::size_t &>(
.def(
"reshape",
py::overload_cast<const std::vector<int> &>(&ZeroCopyTensor::Reshape))
.def("reshape",
py::overload_cast<const std::size_t &>(
&paddle_infer::Tensor::ReshapeStrings))
.def("copy_from_cpu", &ZeroCopyTensorCreate<int32_t>)
.def("copy_from_cpu", &ZeroCopyTensorCreate<int64_t>)
......@@ -860,9 +899,11 @@ void BindZeroCopyTensor(py::module *m) {
void BindPaddleInferTensor(py::module *m) {
py::class_<paddle_infer::Tensor>(*m, "PaddleInferTensor")
.def("reshape", py::overload_cast<const std::vector<int> &>(
.def("reshape",
py::overload_cast<const std::vector<int> &>(
&paddle_infer::Tensor::Reshape))
.def("reshape", py::overload_cast<const std::size_t &>(
.def("reshape",
py::overload_cast<const std::size_t &>(
&paddle_infer::Tensor::ReshapeStrings))
.def("copy_from_cpu_bind", &PaddleInferTensorCreate<int32_t>)
.def("copy_from_cpu_bind", &PaddleInferTensorCreate<int64_t>)
......@@ -881,7 +922,8 @@ void BindPaddleInferTensor(py::module *m) {
void BindPredictorPool(py::module *m) {
py::class_<paddle_infer::services::PredictorPool>(*m, "PredictorPool")
.def(py::init<const paddle_infer::Config &, size_t>())
.def("retrive", &paddle_infer::services::PredictorPool::Retrive,
.def("retrive",
&paddle_infer::services::PredictorPool::Retrive,
py::return_value_policy::reference);
}
......@@ -904,7 +946,8 @@ void BindPaddlePassBuilder(py::module *m) {
.def("append_analysis_pass", &PaddlePassBuilder::AppendAnalysisPass)
.def("turn_on_debug", &PaddlePassBuilder::TurnOnDebug)
.def("debug_string", &PaddlePassBuilder::DebugString)
.def("all_passes", &PaddlePassBuilder::AllPasses,
.def("all_passes",
&PaddlePassBuilder::AllPasses,
py::return_value_policy::reference)
.def("analysis_passes", &PaddlePassBuilder::AnalysisPasses);
......