From b62b384bca862dc54d8ec7fd7abadfe016f670c7 Mon Sep 17 00:00:00 2001
From: zhupengyang
Date: Fri, 9 Jun 2023 10:19:08 +0800
Subject: [PATCH] refine xpu inference api (#54342)

---
 .../xpu/fused_multi_transformer_xpu_pass.cc   |   8 +-
 paddle/fluid/inference/analysis/argument.h    |  61 ++++++-
 .../inference/analysis/ir_pass_manager.cc     |  53 ++++--
 .../analysis/ir_passes/lite_subgraph_pass.cc  |  43 +++--
 paddle/fluid/inference/api/analysis_config.cc | 153 ++++++++++++------
 .../fluid/inference/api/analysis_predictor.cc | 106 +++++++-----
 .../inference/api/paddle_analysis_config.h    | 134 +++++++++++----
 .../inference/api/paddle_inference_api.h      |   1 +
 paddle/fluid/inference/capi_exp/pd_config.cc  |  24 +--
 paddle/fluid/inference/goapi/config.go        |  25 ++-
 paddle/fluid/inference/lite/engine.cc         |  16 +-
 paddle/fluid/inference/lite/engine.h          |  14 +-
 paddle/fluid/inference/paddle_inference.map   |   1 +
 paddle/fluid/pybind/inference_api.cc          |  56 +++++--
 python/paddle/inference/__init__.py           |   2 +
 test/cpp/inference/api/CMakeLists.txt         |   9 ++
 .../inference/api/xpu_config_resnet50_test.cc | 103 ++++++++++++
 .../api/xpu_runtime_config_resnet50_test.cc   |   6 +-
 18 files changed, 592 insertions(+), 223 deletions(-)
 create mode 100644 test/cpp/inference/api/xpu_config_resnet50_test.cc

diff --git a/paddle/fluid/framework/ir/xpu/fused_multi_transformer_xpu_pass.cc b/paddle/fluid/framework/ir/xpu/fused_multi_transformer_xpu_pass.cc
index 5676465e713..7f9885eee10 100644
--- a/paddle/fluid/framework/ir/xpu/fused_multi_transformer_xpu_pass.cc
+++ b/paddle/fluid/framework/ir/xpu/fused_multi_transformer_xpu_pass.cc
@@ -367,8 +367,10 @@ int FusedMultiTransformerXPUPass::FusedMultiTransformerXPUQuant(
                            with_time_step,
                            with_seq_lengths,
                            with_src_mask);
-  int quant_weight_bits =
-      Has("quant_weight_bits") ? Get<int>("quant_weight_bits") : -1;
+  int quant_post_dynamic_weight_precision =
+      Has("quant_post_dynamic_weight_precision")
+          ? Get<int>("quant_post_dynamic_weight_precision")
+          : -1;

   int found_subgraph_count = 0;
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
@@ -421,7 +423,7 @@ int FusedMultiTransformerXPUPass::FusedMultiTransformerXPUQuant(
             w_node,
             nullptr,
             platform::errors::Fatal("w node should not be nullptr"));
-        if (quant_weight_bits == 8) {
+        if (quant_post_dynamic_weight_precision == 0) {
           PrepareWeight<int8_t>(
               graph, scope, block, w_node, &w_intx, &w_max, need_transpose);
         } else {
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index 27854efb0da..2b5b066a3a3 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -93,6 +93,25 @@ struct Argument {
  private:           \
   type__ field__##_;

+#define DECL_POINTER_ARGUMENT_FIELD(field__, Field, type__)              \
+ public:                                                                 \
+  type__& field__() {                                                    \
+    PADDLE_ENFORCE_EQ(                                                   \
+        Has(#field__),                                                   \
+        true,                                                            \
+        platform::errors::PreconditionNotMet("There is no such field")); \
+    return field__##_;                                                   \
+  }                                                                      \
+  void Set##Field(type__ x) {                                            \
+    field__##_ = x;                                                      \
+    valid_fields_.insert(#field__);                                      \
+  }                                                                      \
+  DECL_ARGUMENT_FIELD_VALID(field__);                                    \
+  type__* field__##_ptr() { return &field__##_; }                        \
+                                                                         \
+ private:                                                                \
+  type__ field__##_;
+
 #define DECL_ARGUMENT_FIELD_VALID(field__) \
   bool field__##_valid() { return Has(#field__); }

@@ -276,20 +295,48 @@ struct Argument {
   DECL_ARGUMENT_FIELD(lite_zero_copy, LiteZeroCopy, bool);

   DECL_ARGUMENT_FIELD(use_xpu, UseXpu, bool);
-  DECL_ARGUMENT_FIELD(xpu_l3_workspace_size, XpuL3WorkspaceSize, int);
   DECL_ARGUMENT_FIELD(xpu_locked, XpuLocked, bool);
-  DECL_ARGUMENT_FIELD(xpu_autotune, XpuAutotune, bool);
-  DECL_ARGUMENT_FIELD(xpu_autotune_file, XpuAutotuneFile, std::string);
   DECL_ARGUMENT_FIELD(xpu_precision, XpuPrecision, std::string);
-  DECL_ARGUMENT_FIELD(xpu_adaptive_seqlen, XpuAdaptiveSeqlen, bool);
-  DECL_ARGUMENT_FIELD(xpu_device_id, XpuDeviceId, int);
   DECL_ARGUMENT_FIELD(xpu_enable_multi_stream, XpuEnableMultiStream, bool);
-  DECL_ARGUMENT_FIELD(xpu_quant_post_dynamic_weight_bits,
-                      XpuQuantPostDynamicWeightBits,
+  // XpuConfig
+  DECL_ARGUMENT_FIELD(xpu_device_id, XpuDeviceId, int);
+  DECL_ARGUMENT_FIELD(xpu_l3_size, XpuL3Size, size_t);
+  DECL_POINTER_ARGUMENT_FIELD(xpu_l3_ptr, XpuL3Ptr, void*);
+  DECL_ARGUMENT_FIELD(xpu_l3_autotune_size, XpuL3AutotuneSize, size_t);
+  DECL_POINTER_ARGUMENT_FIELD(xpu_stream, XpuStream, void*);
+  DECL_ARGUMENT_FIELD(xpu_conv_autotune_level, XpuConvAutotuneLevel, int);
+  DECL_ARGUMENT_FIELD(xpu_conv_autotune_file, XpuConvAutotuneFile, std::string);
+  DECL_ARGUMENT_FIELD(xpu_conv_autotune_file_writeback,
+                      XpuConvAutotuneFileWriteback,
+                      bool);
+  DECL_ARGUMENT_FIELD(xpu_fc_autotune_level, XpuFcAutotuneLevel, int);
+  DECL_ARGUMENT_FIELD(xpu_fc_autotune_file, XpuFcAutotuneFile, std::string);
+  DECL_ARGUMENT_FIELD(xpu_fc_autotune_file_writeback,
+                      XpuFcAutotuneFileWriteback,
+                      bool);
+  DECL_ARGUMENT_FIELD(xpu_gemm_compute_precision, XpuGemmComputePrecision, int);
+  DECL_ARGUMENT_FIELD(xpu_transformer_softmax_optimize_level,
+                      XpuTransformerSoftmaxOptimizeLevel,
+                      int);
+  DECL_ARGUMENT_FIELD(xpu_transformer_encoder_adaptive_seqlen,
+                      XpuTransformerEncoderAdaptiveSeqlen,
+                      bool);
+  DECL_ARGUMENT_FIELD(xpu_quant_post_static_gelu_out_threshold,
+                      XpuQuantPostStaticGeluOutThreshold,
+                      float);
+  DECL_ARGUMENT_FIELD(xpu_quant_post_dynamic_activation_method,
+                      XpuQuantPostDynamicActivationMethod,
+                      int);
+  DECL_ARGUMENT_FIELD(xpu_quant_post_dynamic_weight_precision,
+                      XpuQuantPostDynamicWeightPrecision,
                       int);
   DECL_ARGUMENT_FIELD(xpu_quant_post_dynamic_op_types,
                       XpuQuantPostDynamicOpTypes,
                       std::vector<std::string>);
+  DECL_ARGUMENT_FIELD(xpu_lite_l3_locked, XpuLiteL3Locked, bool);
+  DECL_ARGUMENT_FIELD(xpu_lite_enable_multi_stream,
+                      XpuLiteEnableMultiStream,
+                      bool);

   DECL_ARGUMENT_FIELD(use_opencl, UseOpenCL, bool);
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index 345c0c24289..d6936684165 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -267,20 +267,41 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("enable_int8", new bool(lite_enable_int8));
       pass->Set("use_gpu", new bool(argument->use_gpu()));
       pass->Set("zero_copy", new bool(argument->lite_zero_copy()));
-      pass->Set("xpu_l3_workspace_size",
-                new int(argument->xpu_l3_workspace_size()));
+      pass->Set("xpu_device_id", new int(argument->xpu_device_id()));
+      pass->Set("xpu_l3_size", new size_t(argument->xpu_l3_size()));
+      pass->Set("xpu_l3_ptr", new void *(argument->xpu_l3_ptr()));
+      pass->Set("xpu_l3_autotune_size",
+                new size_t(argument->xpu_l3_autotune_size()));
+      pass->Set("xpu_stream", new void *(argument->xpu_stream()));
+      pass->Set("xpu_conv_autotune_level",
+                new int(argument->xpu_conv_autotune_level()));
+      pass->Set("xpu_conv_autotune_file",
+                new std::string(argument->xpu_conv_autotune_file()));
+      pass->Set("xpu_conv_autotune_file_writeback",
+                new bool(argument->xpu_conv_autotune_file_writeback()));
+      pass->Set("xpu_fc_autotune_level",
+                new int(argument->xpu_fc_autotune_level()));
+      pass->Set("xpu_fc_autotune_file",
+                new std::string(argument->xpu_fc_autotune_file()));
+      pass->Set("xpu_fc_autotune_file_writeback",
+                new bool(argument->xpu_fc_autotune_file_writeback()));
+      pass->Set("xpu_gemm_compute_precision",
+                new int(argument->xpu_gemm_compute_precision()));
+      pass->Set("xpu_transformer_softmax_optimize_level",
+                new int(argument->xpu_transformer_softmax_optimize_level()));
+      pass->Set("xpu_transformer_encoder_adaptive_seqlen",
+                new bool(argument->xpu_transformer_encoder_adaptive_seqlen()));
+      pass->Set(
+          "xpu_quant_post_static_gelu_out_threshold",
+          new float(argument->xpu_quant_post_static_gelu_out_threshold()));
+      pass->Set("xpu_quant_post_dynamic_activation_method",
+                new int(argument->xpu_quant_post_dynamic_activation_method()));
+      pass->Set("xpu_l3_locked", new bool(argument->xpu_lite_l3_locked()));
+      pass->Set("xpu_enable_multi_stream",
+                new bool(argument->xpu_lite_enable_multi_stream()));
       pass->Set("use_opencl", new bool(argument->use_opencl()));
       pass->Set("cpu_math_library_num_threads",
                 new int(argument->cpu_math_library_num_threads()));
-      pass->Set("locked", new bool(argument->xpu_locked()));
-      pass->Set("autotune", new bool(argument->xpu_autotune()));
-      pass->Set("autotune_file",
-                new std::string(argument->xpu_autotune_file()));
-      pass->Set("precision", new std::string(argument->xpu_precision()));
-      pass->Set("adaptive_seqlen", new bool(argument->xpu_adaptive_seqlen()));
-      pass->Set("xpu_device_id", new int(argument->xpu_device_id()));
-      pass->Set("enable_multi_stream",
-                new bool(argument->xpu_enable_multi_stream()));
       // NNAdapter Related
       pass->Set("use_nnadapter", new bool(argument->use_nnadapter()));
       pass->Set("nnadapter_model_cache_dir",
@@ -313,12 +334,10 @@ void IRPassManager::CreatePasses(Argument *argument,
       bool use_fc_padding = !fc_mkldnn_pass && argument->use_fc_padding();
       pass->Set("use_fc_padding", new bool(use_fc_padding));
     } else if (pass_name == "fused_multi_transformer_xpu_pass") {
-      auto op_types = argument->xpu_quant_post_dynamic_op_types();
-      if (std::count(op_types.begin(),
-                     op_types.end(),
-                     "fused_multi_transformer") > 0) {
-        pass->Set("quant_weight_bits",
-                  new int(argument->xpu_quant_post_dynamic_weight_bits()));
+      int quant_post_dynamic_weight_precision =
+          argument->xpu_quant_post_dynamic_weight_precision();
+      if (quant_post_dynamic_weight_precision == 0) {
+        pass->Set("quant_post_dynamic_weight_precision", new int(0));
       }
     }
     pre_pass = pass_name;
diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
index 45f0c589a7e..70130f67056 100644
--- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
@@ -249,17 +249,27 @@ void LiteSubgraphPass::SetUpEngine(

   bool use_gpu = Get<bool>("use_gpu");
   bool enable_int8 = Get<bool>("enable_int8");
-  bool use_xpu = Get<bool>("use_xpu");
-  int xpu_device_id = Get<int>("xpu_device_id");
-  int xpu_l3_workspace_size = Get<int>("xpu_l3_workspace_size");
   bool use_opencl = Get<bool>("use_opencl");
   int cpu_math_library_num_threads = Get<int>("cpu_math_library_num_threads");
-  bool locked = Get<bool>("locked");
-  bool autotune = Get<bool>("autotune");
-  std::string autotune_file = Get<std::string>("autotune_file");
-  std::string precision = Get<std::string>("precision");
-  bool adaptive_seqlen = Get<bool>("adaptive_seqlen");
-  bool enable_multi_stream = Get<bool>("enable_multi_stream");
+  bool use_xpu = Get<bool>("use_xpu");
+  int xpu_device_id = Get<int>("xpu_device_id");
+  size_t xpu_l3_size = Get<size_t>("xpu_l3_size");
+  bool xpu_l3_locked = Get<bool>("xpu_l3_locked");
+  bool xpu_conv_autotune = Get<int>("xpu_conv_autotune_level") > 0;
+  std::string xpu_conv_autotune_file =
+      Get<std::string>("xpu_conv_autotune_file");
+  int xpu_gemm_compute_precision = Get<int>("xpu_gemm_compute_precision");
+  std::string xpu_transformer_encoder_precision{"int16"};
+  if (xpu_gemm_compute_precision == 0) {
+    xpu_transformer_encoder_precision = "int8";
+  } else if (xpu_gemm_compute_precision == 1) {
+    xpu_transformer_encoder_precision = "int16";
+  } else if (xpu_gemm_compute_precision == 2) {
+    xpu_transformer_encoder_precision = "int31";
+  }
+  bool xpu_transformer_encoder_adaptive_seqlen =
+      Get<bool>("xpu_transformer_encoder_adaptive_seqlen");
+  bool xpu_enable_multi_stream = Get<bool>("xpu_enable_multi_stream");
   // NNAdapter Related
   bool use_nnadapter = Get<bool>("use_nnadapter");
   std::string nnadapter_model_cache_dir =
      Get<std::string>("nnadapter_model_cache_dir");
@@ -344,14 +354,15 @@ void LiteSubgraphPass::SetUpEngine(
   }
   config.cpu_math_library_num_threads = cpu_math_library_num_threads;
-  config.xpu_l3_workspace_size = xpu_l3_workspace_size;
+  config.xpu_l3_size = xpu_l3_size;
   config.device_id = xpu_device_id;
-  config.locked = locked;
-  config.autotune = autotune;
-  config.autotune_file = autotune_file;
-  config.precision = precision;
-  config.adaptive_seqlen = adaptive_seqlen;
-  config.enable_multi_stream = enable_multi_stream;
+  config.xpu_l3_locked = xpu_l3_locked;
+  config.xpu_conv_autotune = xpu_conv_autotune;
+  config.xpu_conv_autotune_file = xpu_conv_autotune_file;
+  config.xpu_transformer_encoder_precision = xpu_transformer_encoder_precision;
+  config.xpu_transformer_encoder_adaptive_seqlen =
+      xpu_transformer_encoder_adaptive_seqlen;
+  config.xpu_enable_multi_stream = xpu_enable_multi_stream;
   // NNAdapter Related
   config.nnadapter_model_cache_dir = nnadapter_model_cache_dir;
   config.nnadapter_device_names = nnadapter_device_names;
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index a98d694c861..e05c1f0ca9b 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -172,22 +172,34 @@ void AnalysisConfig::DisableFCPadding() {
   Update();
 }

-void AnalysisConfig::EnableXpu(int l3_workspace_size,
-                               bool locked,
-                               bool autotune,
-                               const std::string &autotune_file,
-                               const std::string &precision,
-                               bool adaptive_seqlen,
+void AnalysisConfig::EnableXpu(int l3_size,
+                               bool l3_locked,
+                               bool conv_autotune,
+                               const std::string &conv_autotune_file,
+                               const std::string &transformer_encoder_precision,
+                               bool transformer_encoder_adaptive_seqlen,
                                bool enable_multi_stream) {
+#ifdef PADDLE_WITH_XPU
   use_xpu_ = true;
-  xpu_l3_workspace_size_ = l3_workspace_size;
-  xpu_locked_ = locked;
-  xpu_autotune_ = autotune;
-  xpu_autotune_file_ = autotune_file;
-  xpu_precision_ = precision;
-  xpu_adaptive_seqlen_ = adaptive_seqlen;
-  xpu_enable_multi_stream_ = enable_multi_stream;
+  xpu_config_.l3_size = l3_size;
+  xpu_config_.conv_autotune_level = conv_autotune;
+  xpu_config_.conv_autotune_file = conv_autotune_file;
+  if (transformer_encoder_precision == "int8") {
+    xpu_config_.gemm_compute_precision = 0;
+  } else if (transformer_encoder_precision == "int16") {
+    xpu_config_.gemm_compute_precision = 1;
+  } else if (transformer_encoder_precision == "int31") {
+    xpu_config_.gemm_compute_precision = 2;
+  }
+  xpu_config_.transformer_encoder_adaptive_seqlen =
+      transformer_encoder_adaptive_seqlen;
+  xpu_lite_l3_locked_ = l3_locked;
+  xpu_lite_enable_multi_stream_ = enable_multi_stream;
   Update();
+#else
+  PADDLE_THROW(platform::errors::PreconditionNotMet(
+      "To use XPU inference, please compile with option 'WITH_XPU' first."));
+#endif
 }

 void AnalysisConfig::SetXpuDeviceId(int device_id) {
@@ -195,15 +207,22 @@ void AnalysisConfig::SetXpuDeviceId(int device_id) {
                     true,
                     platform::errors::PreconditionNotMet(
                         "Should call EnableXpu before SetXpuDeviceId."));
-  xpu_device_id_ = device_id;
+  xpu_config_.device_id = device_id;
   Update();
 }

-void AnalysisConfig::SetXpuConfig(
-    int quant_post_dynamic_weight_bits,
-    const std::vector<std::string> &quant_post_dynamic_op_types) {
-  xpu_quant_post_dynamic_weight_bits_ = quant_post_dynamic_weight_bits;
-  xpu_quant_post_dynamic_op_types_ = quant_post_dynamic_op_types;
+void AnalysisConfig::SetXpuConfig(const XpuConfig &config) {
+  PADDLE_ENFORCE(use_xpu_,
+                 platform::errors::PreconditionNotMet(
+                     "Should call EnableXpu before SetXpuConfig."));
+  PADDLE_ENFORCE_LE(
+      config.l3_autotune_size,
+      config.l3_size,
+      phi::errors::InvalidArgument(
+          "l3_autotune_size(%zu) should be less than or equal to l3_size(%zu).",
+          config.l3_autotune_size,
+          config.l3_size));
+  xpu_config_ = config;
   Update();
 }

@@ -494,16 +513,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {

   // XPU related.
   CP_MEMBER(use_xpu_);
-  CP_MEMBER(xpu_device_id_);
-  CP_MEMBER(xpu_l3_workspace_size_);
-  CP_MEMBER(xpu_locked_);
-  CP_MEMBER(xpu_autotune_);
-  CP_MEMBER(xpu_autotune_file_);
-  CP_MEMBER(xpu_precision_);
-  CP_MEMBER(xpu_adaptive_seqlen_);
-  CP_MEMBER(xpu_enable_multi_stream_);
-  CP_MEMBER(xpu_quant_post_dynamic_weight_bits_);
-  CP_MEMBER(xpu_quant_post_dynamic_op_types_);
+  CP_MEMBER(xpu_config_);
+  CP_MEMBER(xpu_lite_l3_locked_);
+  CP_MEMBER(xpu_lite_enable_multi_stream_);

   // Lite OpenCL Related
   CP_MEMBER(use_opencl_);
@@ -1033,7 +1045,6 @@ std::string AnalysisConfig::SerializeInfoCache() {
   ss << exec_stream_;
   ss << use_fc_padding_;
   ss << gpu_device_id_;
-  ss << xpu_device_id_;
   ss << memory_pool_init_size_mb_;

   ss << use_tensorrt_;
@@ -1080,17 +1091,26 @@ std::string AnalysisConfig::SerializeInfoCache() {

   ss << use_lite_;
   ss << use_xpu_;
-  ss << xpu_l3_workspace_size_;
-  ss << xpu_locked_;
-  ss << xpu_autotune_;
-  ss << xpu_autotune_file_;
-  ss << xpu_precision_;
-  ss << xpu_adaptive_seqlen_;
-  ss << xpu_enable_multi_stream_;
-  ss << xpu_quant_post_dynamic_weight_bits_;
-  for (auto op_type : xpu_quant_post_dynamic_op_types_) {
-    ss << op_type;
-  }
+  ss << xpu_config_.device_id;
+  ss << xpu_config_.l3_size;
+  ss << xpu_config_.l3_ptr;
+  ss << xpu_config_.l3_autotune_size;
+  ss << xpu_config_.stream;
+  ss << xpu_config_.conv_autotune_level;
+  ss << xpu_config_.conv_autotune_file;
+  ss << xpu_config_.conv_autotune_file_writeback;
+  ss << xpu_config_.fc_autotune_level;
+  ss << xpu_config_.fc_autotune_file;
+  ss << xpu_config_.fc_autotune_file_writeback;
+  ss << xpu_config_.gemm_compute_precision;
+  ss << xpu_config_.transformer_softmax_optimize_level;
+  ss << xpu_config_.transformer_encoder_adaptive_seqlen;
+  ss << xpu_config_.quant_post_static_gelu_out_threshold;
+  ss << xpu_config_.quant_post_dynamic_activation_method;
+  ss << xpu_config_.quant_post_dynamic_weight_precision;
+  for (auto type : xpu_config_.quant_post_dynamic_op_types) ss << type;
+  ss << xpu_lite_l3_locked_;
+  ss << xpu_lite_enable_multi_stream_;

   ss << thread_local_stream_;

@@ -1318,16 +1338,49 @@ std::string AnalysisConfig::Summary() {

   // xpu info
   os.InsertRow({"use_xpu", use_xpu_ ? "true" : "false"});
   if (use_xpu_) {
-    os.InsertRow({"xpu_device_id", std::to_string(xpu_device_id_)});
+    os.InsertRow({"xpu_device_id", std::to_string(xpu_config_.device_id)});
+    os.InsertRow({"xpu_l3_size", std::to_string(xpu_config_.l3_size)});
     os.InsertRow(
-        {"xpu_l3_workspace_size", std::to_string(xpu_l3_workspace_size_)});
-    os.InsertRow({"xpu_quant_post_dynamic_weight_bits",
-                  std::to_string(xpu_quant_post_dynamic_weight_bits_)});
-    std::vector<std::string> op_types{"xpu_quant_post_dynamic_op_types"};
-    for (auto op_type : xpu_quant_post_dynamic_op_types_) {
-      op_types.push_back(op_type);
-    }
-    os.InsertRow(op_types);
+        {"xpu_l3_ptr",
+         std::to_string(reinterpret_cast<int64_t>(xpu_config_.l3_ptr))});
+    os.InsertRow(
+        {"xpu_l3_autotune_size",
+         std::to_string(xpu_config_.l3_autotune_size)});
+    os.InsertRow(
+        {"xpu_stream",
+         std::to_string(reinterpret_cast<int64_t>(xpu_config_.stream))});
+    os.InsertRow({"xpu_conv_autotune_level",
+                  std::to_string(xpu_config_.conv_autotune_level)});
+    os.InsertRow({"xpu_conv_autotune_file", xpu_config_.conv_autotune_file});
+    os.InsertRow({"xpu_conv_autotune_file_writeback",
+                  std::to_string(xpu_config_.conv_autotune_file_writeback)});
+    os.InsertRow({"xpu_fc_autotune_level",
+                  std::to_string(xpu_config_.fc_autotune_level)});
+    os.InsertRow({"xpu_fc_autotune_file", xpu_config_.fc_autotune_file});
+    os.InsertRow({"xpu_fc_autotune_file_writeback",
+                  std::to_string(xpu_config_.fc_autotune_file_writeback)});
+    os.InsertRow({"xpu_gemm_compute_precision",
+                  std::to_string(xpu_config_.gemm_compute_precision)});
+    os.InsertRow(
+        {"xpu_transformer_softmax_optimize_level",
+         std::to_string(xpu_config_.transformer_softmax_optimize_level)});
+    os.InsertRow(
+        {"xpu_transformer_encoder_adaptive_seqlen",
+         std::to_string(xpu_config_.transformer_encoder_adaptive_seqlen)});
+    os.InsertRow(
+        {"xpu_quant_post_static_gelu_out_threshold",
+         std::to_string(xpu_config_.quant_post_static_gelu_out_threshold)});
+    os.InsertRow(
+        {"xpu_quant_post_dynamic_activation_method",
+         std::to_string(xpu_config_.quant_post_dynamic_activation_method)});
+    os.InsertRow(
+        {"xpu_quant_post_dynamic_weight_precision",
+         std::to_string(xpu_config_.quant_post_dynamic_weight_precision)});
+    std::vector<std::string> quant_post_dynamic_op_types_info =
+        xpu_config_.quant_post_dynamic_op_types;
+    quant_post_dynamic_op_types_info.insert(
+        quant_post_dynamic_op_types_info.begin(),
+        "xpu_quant_post_dynamic_op_types");
+    os.InsertRow(quant_post_dynamic_op_types_info);
   }
   os.InsetDivider();

diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 4fcad3c7c11..c43ee1df887 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -389,7 +389,7 @@ bool AnalysisPredictor::Init(
   }
 #endif
 #if defined(PADDLE_WITH_XPU)
-  if (config_.use_xpu_) {
+  if (config_.use_xpu_ && !config_.use_lite_) {
     private_context_ = true;
     if (!status_is_cloned_ && config_.external_stream_enabled()) {
       predictor_stream_ = config_.GetExecStream();
@@ -1418,14 +1418,8 @@ void AnalysisPredictor::PrepareArgument() {
     argument_->SetLitePassesFilter(config_.lite_passes_filter_);
     argument_->SetLiteOpsFilter(config_.lite_ops_filter_);
     argument_->SetLiteZeroCopy(config_.lite_zero_copy_);
-    argument_->SetXpuL3WorkspaceSize(config_.xpu_l3_workspace_size_);
-    argument_->SetXpuLocked(config_.xpu_locked_);
-    argument_->SetXpuAutotune(config_.xpu_autotune_);
-    argument_->SetXpuAutotuneFile(config_.xpu_autotune_file_);
-    argument_->SetXpuPrecision(config_.xpu_precision_);
-    argument_->SetXpuAdaptiveSeqlen(config_.xpu_adaptive_seqlen_);
-    argument_->SetXpuDeviceId(config_.xpu_device_id_);
-    argument_->SetXpuEnableMultiStream(config_.xpu_enable_multi_stream_);
+    argument_->SetXpuLocked(config_.xpu_lite_l3_locked_);
+    argument_->SetXpuEnableMultiStream(config_.xpu_lite_enable_multi_stream_);
     argument_->SetUseOpenCL(config_.use_opencl_);
     // NNAdapter related
     argument_->SetUseNNAdapter(config_.NNAdapter().use_nnadapter);
@@ -1506,21 +1500,36 @@ void AnalysisPredictor::PrepareArgument() {
   }
 #endif

-#ifdef PADDLE_WITH_XPU
   argument_->SetUseXpu(config_.use_xpu_);
-  argument_->SetXpuL3WorkspaceSize(config_.xpu_l3_workspace_size_);
-  argument_->SetXpuLocked(config_.xpu_locked_);
-  argument_->SetXpuAutotune(config_.xpu_autotune_);
-  argument_->SetXpuAutotuneFile(config_.xpu_autotune_file_);
-  argument_->SetXpuPrecision(config_.xpu_precision_);
-  argument_->SetXpuAdaptiveSeqlen(config_.xpu_adaptive_seqlen_);
-  argument_->SetXpuDeviceId(config_.xpu_device_id_);
-  argument_->SetXpuEnableMultiStream(config_.xpu_enable_multi_stream_);
-  argument_->SetXpuQuantPostDynamicWeightBits(
-      config_.xpu_quant_post_dynamic_weight_bits_);
+  argument_->SetXpuDeviceId(config_.xpu_config_.device_id);
+  argument_->SetXpuL3Size(config_.xpu_config_.l3_size);
+  argument_->SetXpuL3Ptr(config_.xpu_config_.l3_ptr);
+  argument_->SetXpuL3AutotuneSize(config_.xpu_config_.l3_autotune_size);
+  argument_->SetXpuStream(config_.xpu_config_.stream);
+  argument_->SetXpuConvAutotuneLevel(config_.xpu_config_.conv_autotune_level);
+  argument_->SetXpuConvAutotuneFile(config_.xpu_config_.conv_autotune_file);
+  argument_->SetXpuConvAutotuneFileWriteback(
+      config_.xpu_config_.conv_autotune_file_writeback);
+  argument_->SetXpuFcAutotuneLevel(config_.xpu_config_.fc_autotune_level);
+  argument_->SetXpuFcAutotuneFile(config_.xpu_config_.fc_autotune_file);
+  argument_->SetXpuFcAutotuneFileWriteback(
+      config_.xpu_config_.fc_autotune_file_writeback);
+  argument_->SetXpuGemmComputePrecision(
+      config_.xpu_config_.gemm_compute_precision);
+  argument_->SetXpuTransformerSoftmaxOptimizeLevel(
+      config_.xpu_config_.transformer_softmax_optimize_level);
+  argument_->SetXpuTransformerEncoderAdaptiveSeqlen(
+      config_.xpu_config_.transformer_encoder_adaptive_seqlen);
+  argument_->SetXpuQuantPostStaticGeluOutThreshold(
+      config_.xpu_config_.quant_post_static_gelu_out_threshold);
+  argument_->SetXpuQuantPostDynamicActivationMethod(
+      config_.xpu_config_.quant_post_dynamic_activation_method);
+  argument_->SetXpuQuantPostDynamicWeightPrecision(
+      config_.xpu_config_.quant_post_dynamic_weight_precision);
   argument_->SetXpuQuantPostDynamicOpTypes(
-      config_.xpu_quant_post_dynamic_op_types_);
-#endif
+      config_.xpu_config_.quant_post_dynamic_op_types);
+  argument_->SetXpuLiteL3Locked(config_.xpu_lite_l3_locked_);
+  argument_->SetXpuLiteEnableMultiStream(config_.xpu_lite_enable_multi_stream_);

   auto *pass_builder = config_.pass_builder();
   // TODO(inference): Need to reconstruct the pass_builder, pass should be
@@ -2076,9 +2085,36 @@ bool AnalysisPredictor::ZeroCopyRun() {
   }
 #endif

+#ifdef PADDLE_WITH_XPU
+  InferXPUContext *infer_xpu_ctx = nullptr;
+  if (config_.use_xpu_ && !config_.use_lite_) {
+    PADDLE_ENFORCE(
+        private_context_,
+        paddle::platform::errors::Fatal(
+            "Must use private context if run predictor on xpu place."));
+    auto *dev_ctxs = reinterpret_cast<const std::map<
+        phi::Place,
+        std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>(
+        this->GetDeviceContexts());
+    infer_xpu_ctx =
+        static_cast<InferXPUContext *>(dev_ctxs->at(place_).get().get());
+    infer_xpu_ctx->SetStream(predictor_stream_);
+    infer_xpu_ctx->SetL3Info(config_.xpu_config_.l3_size,
+                             config_.xpu_config_.l3_ptr,
+                             config_.xpu_config_.l3_autotune_size,
+                             place_);
+  }
+#endif
+
   executor_->Run();
   inference::DisplayMemoryInfo(place_, "after run");

+#ifdef PADDLE_WITH_XPU
+  if (config_.use_xpu_ && !config_.use_lite_ && infer_xpu_ctx != nullptr) {
+    infer_xpu_ctx->L3CacheAutotune();
+  }
+#endif
+
   if (config_.shape_range_info_collected()) {
     CollectShapeRangeInfo();
   }
@@ -2148,18 +2184,6 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) {

 bool AnalysisPredictor::ExpRunWithRuntimeConfig(void *config) {
 #ifdef PADDLE_WITH_XPU
-  PADDLE_ENFORCE(
-      private_context_,
-      paddle::platform::errors::Fatal(
-          "Must use private context if run predictor with external config."));
-
-  auto *dev_ctxs = reinterpret_cast<const std::map<
-      phi::Place,
-      std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>(
-      this->GetDeviceContexts());
-  auto *dev_ctx =
-      static_cast<InferXPUContext *>(dev_ctxs->at(place_).get().get());
-
   auto xpu_runtime_config =
       reinterpret_cast<paddle_infer::experimental::XpuRuntimeConfig *>(config);

   auto *stream = xpu_runtime_config->stream;
@@ -2167,12 +2191,10 @@ bool AnalysisPredictor::ExpRunWithRuntimeConfig(void *config) {
     paddle::platform::XPUStreamSync(
         static_cast<xpuStream>(predictor_stream_));
     predictor_stream_ = stream;
-    dev_ctx->SetStream(stream);
   }

-  size_t l3_size = xpu_runtime_config->l3_size;
-  void *l3_ptr = xpu_runtime_config->l3_ptr;
-  size_t l3_autotune_size = xpu_runtime_config->l3_autotune_size;
+  auto l3_size = xpu_runtime_config->l3_size;
+  auto l3_autotune_size = xpu_runtime_config->l3_autotune_size;
   PADDLE_ENFORCE_LE(
       l3_autotune_size,
       l3_size,
      phi::errors::InvalidArgument(
          "l3_autotune_size(%zu) should be less than or equal to l3_size(%zu).",
          l3_autotune_size,
          l3_size));
-  dev_ctx->SetL3Info(l3_size, l3_ptr, l3_autotune_size, place_);
+  config_.xpu_config_.l3_size = l3_size;
+  config_.xpu_config_.l3_ptr = xpu_runtime_config->l3_ptr;
+  config_.xpu_config_.l3_autotune_size = l3_autotune_size;

-  bool ret = ZeroCopyRun();
-  dev_ctx->L3CacheAutotune();
-  return ret;
+  return ZeroCopyRun();
 #endif
   return false;
 }
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 8f5c51d3819..11ba4feaecb 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -76,6 +76,77 @@ struct LiteNNAdapterConfig {
   LiteNNAdapterConfig& Disable();
 };

+struct PD_INFER_DECL XpuConfig {
+  // Select which xpu device to run model.
+  int device_id{0};
+
+  // Available l3 size (Byte)
+  // For kunlun1, max l3_size is 16773120 Byte
+  // For kunlun2, max l3_size is 67104768 Byte
+  size_t l3_size{0};
+  // If l3_ptr is not nullptr, it is used as l3 buffer.
+  // If l3_ptr is nullptr, new l3 buffer will be created.
+  void* l3_ptr{nullptr};
+  // Available l3 size for autotune.
+  // If l3_autotune_size is 0, autotune is disabled.
+  // Note: The remaining l3 size (l3_size - l3_autotune_size) is for
+  // kernels (both paddle/xdnn kernels)
+  size_t l3_autotune_size{0};
+
+  // Stream for execution.
+  // If stream is nullptr, default stream will be used.
+  void* stream{nullptr};
+
+  // Conv autotune level. Default 0 means no autotune.
+  // Note: Paddle-Lite only.
+  int conv_autotune_level{0};
+  // Base conv autotune info is read from conv_autotune_file.
+  // Note: Paddle-Lite only.
+  std::string conv_autotune_file;
+  // Whether to write new conv autotune info to conv_autotune_file.
+  // Note: Paddle-Lite only.
+  bool conv_autotune_file_writeback{false};
+
+  // Fc autotune level. Optional values are 0-9. Default 0 means no
+  // autotune. Note: Paddle-Lite only.
+  int fc_autotune_level{0};
+  // Base fc autotune info is read from fc_autotune_file.
+  // Note: Paddle-Lite only.
+  std::string fc_autotune_file;
+  // Whether to write new fc autotune info to fc_autotune_file.
+  // Note: Paddle-Lite only.
+  bool fc_autotune_file_writeback{false};
+
+  // Gemm compute precision. Optional values are 0(int8), 1(int16), 2(int31).
+  // Note: "gemm_compute_precision" has no effect on quanted ops of quant
+  // model.
+  // Note: Paddle-Lite only.
+  int gemm_compute_precision{1};
+  // Which method to optimize softmax in transformer structure. Optional
+  // values are 0, 1, 2. Note: Paddle-Lite only.
+  int transformer_softmax_optimize_level{0};
+  // Whether to enable the adaptive_seqlen optimization on transformer encoder.
+  // Note: Paddle-Lite only.
+  bool transformer_encoder_adaptive_seqlen{true};
+
+  // Gelu out max threshold is limited to quant_post_static_gelu_out_threshold
+  // if using static post-quantization.
+  // Note: Paddle-Lite only.
+  float quant_post_static_gelu_out_threshold{10.f};
+  // Activation method if using dynamic post-quantization.
+  // For kunlun1, optional values are 0(per_tensor), 1(per_batch), 2(per_head).
+  // For kunlun2, optional values are 0(per_tensor) or non-zero(every_16).
+  // Note: Paddle-Lite only.
+  int quant_post_dynamic_activation_method{0};
+  // Preprocess weight to quant_post_dynamic_weight_precision if using dynamic
+  // post-quantization. Optional values are 0, 1, 2.
+  // * If 0, preprocess weight to int8.
+  // * If 1, preprocess weight to int16.
+  // * If 2, preprocess weight to float.
+  // Note: PaddleInference only.
+  int quant_post_dynamic_weight_precision{1};
+  std::vector<std::string> quant_post_dynamic_op_types;
+};
+
 struct DistConfig {
   bool use_dist_model() const { return use_dist_model_; }
   void EnableDistModel(bool use_dist_model) {
@@ -271,42 +342,46 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   /// \brief Turn on XPU.
   ///
-  /// \param l3_workspace_size The size of the video memory allocated by the
-  /// l3 cache, the maximum is 16M.
-  /// \param locked Whether the allocated L3 cache can be locked. If false,
+  /// \param l3_size The size of the video memory allocated by the l3
+  /// cache, the maximum is 16M.
+  /// \param l3_locked Whether the allocated L3 cache can be locked. If false,
   /// it means that the L3 cache is not locked, and the allocated L3
   /// cache can be shared by multiple models, and multiple models
   /// sharing the L3 cache will be executed sequentially on the card.
-  /// \param autotune Whether to autotune the conv operator in the model. If
-  /// true, when the conv operator of a certain dimension is executed
+  /// \param conv_autotune Whether to autotune the conv operator in the model.
+  /// If true, when the conv operator of a certain dimension is executed
   /// for the first time, it will automatically search for a better
   /// algorithm to improve the performance of subsequent conv operators
   /// of the same dimension.
-  /// \param autotune_file Specify the path of the autotune file. If
+  /// \param conv_autotune_file Specify the path of the autotune file. If
   /// autotune_file is specified, the algorithm specified in the
   /// file will be used and autotune will not be performed again.
-  /// \param precision Calculation accuracy of multi_encoder
-  /// \param adaptive_seqlen Is the input of multi_encoder variable length
-  /// \param enable_multi_stream Whether to enable the multi stream of xpu.
-  ///
-  void EnableXpu(int l3_workspace_size = 0xfffc00,
-                 bool locked = false,
-                 bool autotune = true,
-                 const std::string& autotune_file = "",
-                 const std::string& precision = "int16",
-                 bool adaptive_seqlen = false,
+  /// \param transformer_encoder_precision Calculation accuracy of
+  /// multi_encoder
+  /// \param transformer_encoder_adaptive_seqlen Whether the input of
+  /// multi_encoder is variable length
+  /// \param enable_multi_stream Whether to enable the multi
+  /// stream of xpu.
+  ///
+  void EnableXpu(int l3_size = 0xfffc00,
+                 bool l3_locked = false,
+                 bool conv_autotune = true,
+                 const std::string& conv_autotune_file = "",
+                 const std::string& transformer_encoder_precision = "int16",
+                 bool transformer_encoder_adaptive_seqlen = false,
                  bool enable_multi_stream = false);

   ///
   /// \brief configs of XPU
   ///
-  /// \param quant_post_dynamic_weight_bits Weight bits used in dynamic post
-  /// quantization. Optional value: -1, 8, 16. Default value is -1, means using
-  /// the recommended way. \param quant_post_dynamic_op_types Ops used in
-  /// dynamic post quantization.
+  /// \param config Configs for xpu. See XpuConfig for more details.
+  ///
+  void SetXpuConfig(const XpuConfig& config);
+
+  ///
+  /// \brief Get configs of xpu
+  ///
+  /// \return XpuConfig The configs of xpu.
   ///
-  void SetXpuConfig(
-      int quant_post_dynamic_weight_bits = -1,
-      const std::vector<std::string>& quant_post_dynamic_op_types = {});
+  XpuConfig xpu_config() { return xpu_config_; }

   ///
   /// \brief configs of IPU
@@ -462,7 +537,7 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   /// \return int The XPU device id.
   ///
-  int xpu_device_id() const { return xpu_device_id_; }
+  int xpu_device_id() const { return xpu_config_.device_id; }
   /// \brief Get the number of IPU device .
   ///
   /// \return int The number of IPU device.
@@ -1191,16 +1266,9 @@ struct PD_INFER_DECL AnalysisConfig {

   // XPU related.
   bool use_xpu_{false};
-  int xpu_device_id_{0};
-  int xpu_l3_workspace_size_{0};
-  bool xpu_locked_;
-  bool xpu_autotune_;
-  std::string xpu_autotune_file_;
-  std::string xpu_precision_;
-  bool xpu_adaptive_seqlen_;
-  bool xpu_enable_multi_stream_;
-  int xpu_quant_post_dynamic_weight_bits_{-1};
-  std::vector<std::string> xpu_quant_post_dynamic_op_types_;
+  XpuConfig xpu_config_;
+  bool xpu_lite_l3_locked_{false};
+  bool xpu_lite_enable_multi_stream_{false};

   // LITE OPENCL SETTINGS
   bool use_opencl_{false};
diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h
index aa77015ba63..fcf76bb3540 100644
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -47,6 +47,7 @@ namespace paddle_infer {
 using PrecisionType = paddle::AnalysisConfig::Precision;
 using Config = paddle::AnalysisConfig;
 using DistConfig = paddle::DistConfig;
+using XpuConfig = paddle::XpuConfig;

 ///
 /// \class Predictor
diff --git a/paddle/fluid/inference/capi_exp/pd_config.cc b/paddle/fluid/inference/capi_exp/pd_config.cc
index ed91e0721c4..ca7e03407dd 100644
--- a/paddle/fluid/inference/capi_exp/pd_config.cc
+++ b/paddle/fluid/inference/capi_exp/pd_config.cc
@@ -154,20 +154,20 @@ void PD_ConfigEnableORTOptimization(__pd_keep PD_Config* pd_config) {
 }

 void PD_ConfigEnableXpu(__pd_keep PD_Config* pd_config,
-                        int32_t l3_workspace_size,
-                        PD_Bool locked,
-                        PD_Bool autotune,
-                        const char* autotune_file,
-                        const char* precision,
-                        PD_Bool adaptive_seqlen,
+                        int32_t l3_size,
+                        PD_Bool l3_locked,
+                        PD_Bool conv_autotune,
+                        const char* conv_autotune_file,
+                        const char* transformer_encoder_precision,
+                        PD_Bool transformer_encoder_adaptive_seqlen,
                         PD_Bool enable_multi_stream) {
   CHECK_AND_CONVERT_PD_CONFIG;
-  config->EnableXpu(l3_workspace_size,
-                    locked,
-                    autotune,
-                    autotune_file,
-                    precision,
-                    adaptive_seqlen,
+  config->EnableXpu(l3_size,
+                    l3_locked,
+                    conv_autotune,
+                    conv_autotune_file,
+                    transformer_encoder_precision,
+                    transformer_encoder_adaptive_seqlen,
                     enable_multi_stream);
 }
diff --git a/paddle/fluid/inference/goapi/config.go b/paddle/fluid/inference/goapi/config.go
index 3ed39e94a23..9d0a1e58644 100644
--- a/paddle/fluid/inference/goapi/config.go
+++ b/paddle/fluid/inference/goapi/config.go
@@ -193,23 +193,22 @@ func (config *Config) EnableORTOptimization() {
 ///
 /// \brief Turn on XPU.
 ///
-/// \param l3_workspace_size The size of the video memory allocated by the l3 cache, the maximum is 16M.
-/// \param locked Whether the allocated L3 cache can be locked. If false, it means that the L3 cache is not locked, and the allocated L3 cache can be shared by multiple models, and multiple models sharing the L3 cache will be executed sequentially on the card.
-/// \param autotune Whether to autotune the conv operator in the model. If true, when the conv operator of a certain dimension is executed for the first time, it will automatically search for a better algorithm to improve the performance of subsequent conv operators of the same dimension.
-/// \param autotune_file Specify the path of the autotune file. If autotune_file is specified, the algorithm specified in the file will be used and autotune will not be performed again.
-/// \param precision Calculation accuracy of multi_encoder
-/// \param adaptive_seqlen Is the input of multi_encoder variable length
+/// \param l3Size The size of the video memory allocated by the l3 cache, the maximum is 16M.
+/// \param l3Locked Whether the allocated L3 cache can be locked. If false, it means that the L3 cache is not locked, and the allocated L3 cache can be shared by multiple models, and multiple models sharing the L3 cache will be executed sequentially on the card.
+/// \param convAutotune Whether to autotune the conv operator in the model. If true, when the conv operator of a certain dimension is executed for the first time, it will automatically search for a better algorithm to improve the performance of subsequent conv operators of the same dimension.
+/// \param convAutotuneFile Specify the path of the autotune file. If autotune_file is specified, the algorithm specified in the file will be used and autotune will not be performed again.
+/// \param transformerEncoderPrecision Calculation accuracy of multi_encoder
+/// \param transformerEncoderAdaptiveSeqlen Whether the input of multi_encoder is variable length
 /// \param enable_multi_stream Whether to enable the multi stream of xpu
 ///
-func (config *Config) EnableXpu(l3WorkspaceSize int32, locked bool, autotune bool, autotuneFile string, precision string, adaptiveSeqlen bool, enableMultiStream bool) {
-	cAutotuneFile := C.CString(autotuneFile)
-	cPrecision := C.CString(precision)
+func (config *Config) EnableXpu(l3Size int32, l3Locked bool, convAutotune bool, convAutotuneFile string, transformerEncoderPrecision string, transformerEncoderAdaptiveSeqlen bool, enableMultiStream bool) {
+	cConvAutotuneFile := C.CString(convAutotuneFile)
+	cTransformerEncoderPrecision := C.CString(transformerEncoderPrecision)
 	defer func() {
-		C.free(unsafe.Pointer(cAutotuneFile))
-		C.free(unsafe.Pointer(cPrecision))
+		C.free(unsafe.Pointer(cConvAutotuneFile))
+		C.free(unsafe.Pointer(cTransformerEncoderPrecision))
 	}()
-	C.PD_ConfigEnableXpu(config.c, C.int32_t(l3WorkspaceSize), cvtGoBoolToPD(locked), cvtGoBoolToPD(autotune),
-		cAutotuneFile, cPrecision, cvtGoBoolToPD(adaptiveSeqlen), cvtGoBoolToPD(enableMultiStream))
+	C.PD_ConfigEnableXpu(config.c, C.int32_t(l3Size), cvtGoBoolToPD(l3Locked), cvtGoBoolToPD(convAutotune), cConvAutotuneFile, cTransformerEncoderPrecision, cvtGoBoolToPD(transformerEncoderAdaptiveSeqlen), cvtGoBoolToPD(enableMultiStream))
 }

 ///
diff --git a/paddle/fluid/inference/lite/engine.cc b/paddle/fluid/inference/lite/engine.cc
index cc7fbfa64f0..d64888d2887 100644
--- a/paddle/fluid/inference/lite/engine.cc
+++ b/paddle/fluid/inference/lite/engine.cc
@@ -56,16 +56,14 @@ paddle::lite_api::PaddlePredictor* EngineManager::Create(
 #endif

 #ifdef LITE_SUBGRAPH_WITH_XPU
-  // Deprecated in Paddle-Lite release/v2.8
-  lite_cxx_config.set_xpu_workspace_l3_size_per_thread(
-      cfg.xpu_l3_workspace_size);
-  lite_cxx_config.set_xpu_l3_cache_method(cfg.xpu_l3_workspace_size,
-                                          cfg.locked);
-  lite_cxx_config.set_xpu_conv_autotune(cfg.autotune, cfg.autotune_file);
-  lite_cxx_config.set_xpu_multi_encoder_method(cfg.precision,
-                                               cfg.adaptive_seqlen);
+  lite_cxx_config.set_xpu_l3_cache_method(cfg.xpu_l3_size, cfg.xpu_l3_locked);
+  lite_cxx_config.set_xpu_conv_autotune(cfg.xpu_conv_autotune,
+                                        cfg.xpu_conv_autotune_file);
+  lite_cxx_config.set_xpu_multi_encoder_method(
+      cfg.xpu_transformer_encoder_precision,
+      cfg.xpu_transformer_encoder_adaptive_seqlen);
   lite_cxx_config.set_xpu_dev_per_thread(cfg.device_id);
-  if (cfg.enable_multi_stream) {
+  if (cfg.xpu_enable_multi_stream) {
     lite_cxx_config.enable_xpu_multi_stream();
   }
 #endif
diff --git a/paddle/fluid/inference/lite/engine.h b/paddle/fluid/inference/lite/engine.h
index 6e6d9fea5d4..4dfe32b2bc1 100644
--- a/paddle/fluid/inference/lite/engine.h
+++ b/paddle/fluid/inference/lite/engine.h
@@ -41,13 +41,13 @@ struct EngineConfig {
   int device_id = 0;

   // for xpu
-  size_t xpu_l3_workspace_size;
-  bool locked = false;
-  bool autotune = true;
-  std::string autotune_file = "";
-  std::string precision = "int16";
-  bool adaptive_seqlen = false;
-  bool enable_multi_stream = false;
+  size_t xpu_l3_size{0};
+  bool xpu_l3_locked = false;
+  bool xpu_conv_autotune = true;
+  std::string xpu_conv_autotune_file = "";
+  std::string xpu_transformer_encoder_precision = "int16";
+  bool xpu_transformer_encoder_adaptive_seqlen = false;
+  bool xpu_enable_multi_stream = false;

   // for x86 or arm
   int cpu_math_library_num_threads{1};
diff --git a/paddle/fluid/inference/paddle_inference.map b/paddle/fluid/inference/paddle_inference.map
index 69a7288ddc4..93d90238e34 100644
--- a/paddle/fluid/inference/paddle_inference.map
+++ b/paddle/fluid/inference/paddle_inference.map
@@ -21,6 +21,7 @@
     *paddle::internal*;
     *paddle::get_version*;
     *paddle::LiteNNAdapterConfig*;
+    *paddle::XpuConfig*;
    *paddle::AnalysisConfig::*;
    *paddle::PaddlePredictor::*;
    *paddle::CreatePaddlePredictor*;
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index 711f99e8748..a7c2c9d580c 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -108,6 +108,7 @@ void BindPaddlePredictor(py::module *m);
 void BindNativeConfig(py::module *m);
 void BindNativePredictor(py::module *m);
 void BindLiteNNAdapterConfig(py::module *m);
+void BindXpuConfig(py::module *m);
 void BindAnalysisConfig(py::module *m);
 void BindAnalysisPredictor(py::module *m);
 void BindZeroCopyTensor(py::module *m);
@@ -476,6 +477,7 @@ void BindInferenceApi(py::module *m) {
   BindNativeConfig(m);
   BindNativePredictor(m);
   BindLiteNNAdapterConfig(m);
+  BindXpuConfig(m);
   BindAnalysisConfig(m);
   BindAnalysisPredictor(m);
   BindPaddleInferPredictor(m);
@@ -756,21 +758,21 @@ void BindAnalysisConfig(py::module *m) {
 #endif
       .def("enable_xpu",
           &AnalysisConfig::EnableXpu,
-           py::arg("l3_workspace_size") = 16 * 1024 * 1024,
-           py::arg("locked") = false,
-           py::arg("autotune") = true,
-           py::arg("autotune_file") = "",
-           py::arg("precision") = "int16",
-           py::arg("adaptive_seqlen") = false,
+           py::arg("l3_size") = 16 * 1024 * 1024,
+           py::arg("l3_locked") = false,
+           py::arg("conv_autotune") = true,
+           py::arg("conv_autotune_file") = "",
+           py::arg("transformer_encoder_precision") = "int16",
+           py::arg("transformer_encoder_adaptive_seqlen") = false,
           py::arg("enable_multi_stream") = false)
      .def("set_xpu_device_id",
           &AnalysisConfig::SetXpuDeviceId,
           py::arg("device_id") = 0)
-      .def(
-          "set_xpu_config",
-          &AnalysisConfig::SetXpuConfig,
-          py::arg("quant_post_dynamic_weight_bits") = -1,
-          py::arg("quant_post_dynamic_op_types") = std::vector<std::string>({}))
+      .def("set_xpu_config",
+           [](AnalysisConfig &self, const paddle_infer::XpuConfig &xpu_config) {
+             self.SetXpuConfig(xpu_config);
+           })
+      .def("xpu_config", &AnalysisConfig::xpu_config)
      .def("enable_custom_device",
           &AnalysisConfig::EnableCustomDevice,
           py::arg("device_type"),
@@ -1000,6 +1002,38 @@ void BindLiteNNAdapterConfig(py::module *m) {
       .def("disable", &LiteNNAdapterConfig::Disable);
 }

+void BindXpuConfig(py::module *m) {
+  py::class_<XpuConfig>(*m, "XpuConfig")
+      .def(py::init<>())
+      .def_readwrite("device_id", &XpuConfig::device_id)
+      .def_readwrite("l3_ptr", &XpuConfig::l3_ptr)
+      .def_readwrite("l3_size", &XpuConfig::l3_size)
+      .def_readwrite("l3_autotune_size", &XpuConfig::l3_autotune_size)
+      .def_readwrite("stream", &XpuConfig::stream)
.def_readwrite("conv_autotune_level", &XpuConfig::conv_autotune_level) + .def_readwrite("conv_autotune_file", &XpuConfig::conv_autotune_file) + .def_readwrite("conv_autotune_file_writeback", + &XpuConfig::conv_autotune_file_writeback) + .def_readwrite("fc_autotune_level", &XpuConfig::fc_autotune_level) + .def_readwrite("fc_autotune_file", &XpuConfig::fc_autotune_file) + .def_readwrite("fc_autotune_file_writeback", + &XpuConfig::fc_autotune_file_writeback) + .def_readwrite("gemm_compute_precision", + &XpuConfig::gemm_compute_precision) + .def_readwrite("transformer_softmax_optimize_level", + &XpuConfig::transformer_softmax_optimize_level) + .def_readwrite("transformer_encoder_adaptive_seqlen", + &XpuConfig::transformer_encoder_adaptive_seqlen) + .def_readwrite("quant_post_static_gelu_out_threshold", + &XpuConfig::quant_post_static_gelu_out_threshold) + .def_readwrite("quant_post_dynamic_activation_method", + &XpuConfig::quant_post_dynamic_activation_method) + .def_readwrite("quant_post_dynamic_weight_precision", + &XpuConfig::quant_post_dynamic_weight_precision) + .def_readwrite("quant_post_dynamic_op_types", + &XpuConfig::quant_post_dynamic_op_types); +} + #ifdef PADDLE_WITH_MKLDNN void BindMkldnnQuantizerConfig(py::module *m) { py::class_ quantizer_config(*m, diff --git a/python/paddle/inference/__init__.py b/python/paddle/inference/__init__.py index 22747d94f2a..e7120bdf7a4 100644 --- a/python/paddle/inference/__init__.py +++ b/python/paddle/inference/__init__.py @@ -30,6 +30,7 @@ from paddle.fluid.core import ( get_trt_runtime_version, get_num_bytes_of_data_type, PredictorPool, + XpuConfig, ) __all__ = [ # noqa @@ -47,4 +48,5 @@ __all__ = [ # noqa 'get_trt_runtime_version', 'get_num_bytes_of_data_type', 'PredictorPool', + 'XpuConfig', ] diff --git a/test/cpp/inference/api/CMakeLists.txt b/test/cpp/inference/api/CMakeLists.txt index 9fdcc74c9a9..f2299fdd44c 100644 --- a/test/cpp/inference/api/CMakeLists.txt +++ b/test/cpp/inference/api/CMakeLists.txt @@ -1476,6 +1476,15 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) endif() if(WITH_XPU) + inference_analysis_test( + xpu_config_resnet50_test + SRCS + xpu_config_resnet50_test.cc + EXTRA_DEPS + paddle_inference_shared + python + ARGS + --infer_model=${RESNET50_MODEL_DIR}) inference_analysis_test( xpu_runtime_config_resnet50_test SRCS diff --git a/test/cpp/inference/api/xpu_config_resnet50_test.cc b/test/cpp/inference/api/xpu_config_resnet50_test.cc new file mode 100644 index 00000000000..ce3796e4209 --- /dev/null +++ b/test/cpp/inference/api/xpu_config_resnet50_test.cc @@ -0,0 +1,103 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include "gflags/gflags.h" +#include "test/cpp/inference/api/tester_helper.h" + +namespace paddle_infer { + +static const std::vector TRUTH_VALUES = { + 127.779f, 738.165f, 1013.22f, -438.17f, 366.401f, 927.659f, 736.222f, + -633.684f, -329.927f, -430.155f, -633.062f, -146.548f, -1324.28f, -1349.36f, + -242.675f, 117.448f, -801.723f, -391.514f, -404.818f, 454.16f, 515.48f, + -133.031f, 69.293f, 590.096f, -1434.69f, -1070.89f, 307.074f, 400.525f, + -316.12f, -587.125f, -161.056f, 800.363f, -96.4708f, 748.706f, 868.174f, + -447.938f, 112.737f, 1127.2f, 47.4355f, 677.72f, 593.186f, -336.4f, + 551.362f, 397.823f, 78.3979f, -715.398f, 405.969f, 404.256f, 246.019f, + -8.42969f, 131.365f, -648.051f}; + +void PrepareInput(std::shared_ptr predictor) { + const int batch = 1; + const int channel = 3; + const int height = 318; + const int width = 318; + const int input_num = batch * channel * height * width; + std::vector input(input_num, 1); + auto input_names = predictor->GetInputNames(); + auto input_t = predictor->GetInputHandle(input_names[0]); + input_t->Reshape({batch, channel, height, width}); + input_t->CopyFromCpu(input.data()); +} + +void CompareOutput(std::shared_ptr predictor) { + auto output_names = predictor->GetOutputNames(); + auto output_t = predictor->GetOutputHandle(output_names[0]); + std::vector output_shape = output_t->shape(); + size_t out_num = std::accumulate( + output_shape.begin(), output_shape.end(), 1, std::multiplies()); + + std::vector out_data; + out_data.resize(out_num); + output_t->CopyToCpu(out_data.data()); + + float* data_o = out_data.data(); + for (size_t j = 0; j < out_num; j += 10) { + EXPECT_NEAR( + (data_o[j] - TRUTH_VALUES[j / 10]) / TRUTH_VALUES[j / 10], 0., 10e-3); + } +} + +TEST(xpu_config, inference) { + size_t l3_size = 10 * 1024 * 1024; + XpuConfig xpu_config; + xpu_config.l3_size = l3_size; + std::string model_dir = FLAGS_infer_model + "/" + "model"; + Config config; + config.SetModel(model_dir + "/model", model_dir + "/params"); + config.EnableXpu(); + config.SetXpuConfig(xpu_config); + + XpuConfig xpu_config_test = config.xpu_config(); + CHECK_EQ(xpu_config_test.l3_size, l3_size); + + auto predictor = CreatePredictor(config); + PrepareInput(predictor); + predictor->Run(); + CompareOutput(predictor); +} + +TEST(xpu_config, lite) { + size_t l3_size = 10 * 1024 * 1024; + XpuConfig xpu_config; + xpu_config.l3_size = l3_size; + std::string model_dir = FLAGS_infer_model + "/" + "model"; + Config config; + config.SetModel(model_dir + "/model", model_dir + "/params"); + config.EnableXpu(); + config.SetXpuConfig(xpu_config); + config.EnableLiteEngine(); + + XpuConfig xpu_config_test = config.xpu_config(); + CHECK_EQ(xpu_config_test.l3_size, l3_size); + + auto predictor = CreatePredictor(config); + PrepareInput(predictor); + predictor->Run(); + CompareOutput(predictor); +} + +} // namespace paddle_infer diff --git a/test/cpp/inference/api/xpu_runtime_config_resnet50_test.cc b/test/cpp/inference/api/xpu_runtime_config_resnet50_test.cc index 88989847411..f1a0cef01cb 100644 --- a/test/cpp/inference/api/xpu_runtime_config_resnet50_test.cc +++ b/test/cpp/inference/api/xpu_runtime_config_resnet50_test.cc @@ -63,7 +63,7 @@ void CompareOutput(std::shared_ptr predictor) { } } -Config XpuConfig() { +Config InferXpuConfig() { std::string model_dir = FLAGS_infer_model + "/" + "model"; Config config; config.SetModel(model_dir + "/model", model_dir + "/params"); @@ -72,7 +72,7 @@ Config XpuConfig() { } TEST(resnet50_xpu, basic) { - 
-  Config config = XpuConfig();
+  Config config = InferXpuConfig();
   auto predictor = CreatePredictor(config);
   PrepareInput(predictor);
   predictor->Run();
@@ -80,7 +80,7 @@
 }

 #define RUN_WITH_RUNTIME_CONFIG(idx_, config_)                             \
-  Config config##idx_ = XpuConfig();                                       \
+  Config config##idx_ = InferXpuConfig();                                  \
   auto predictor##idx_ = CreatePredictor(config##idx_);                    \
   PrepareInput(predictor##idx_);                                           \
   experimental::InternalUtils::RunWithRuntimeConfig(predictor##idx_.get(), \
-- 
GitLab
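
Usage sketch (not part of the patch): the patch adds a C++ test for the refined API but no Python example, even though it exports `XpuConfig` from `paddle.inference`. The snippet below shows how the new bindings fit together, based only on the bindings added above; it assumes a Paddle build with WITH_XPU, and the model paths are placeholders.

    import paddle.inference as paddle_infer

    # Describe the XPU runtime with the new XpuConfig structure.
    xpu_config = paddle_infer.XpuConfig()
    xpu_config.device_id = 0
    xpu_config.l3_size = 16 * 1024 * 1024          # bytes of L3 reserved for inference
    xpu_config.l3_autotune_size = 4 * 1024 * 1024  # must be <= l3_size
    xpu_config.conv_autotune_level = 1             # enable conv autotune
    xpu_config.gemm_compute_precision = 1          # 0: int8, 1: int16, 2: int31

    # Placeholder model paths; replace with a real inference model.
    config = paddle_infer.Config("model/model", "model/params")
    config.enable_xpu()                # must precede set_xpu_config
    config.set_xpu_config(xpu_config)  # errors if l3_autotune_size > l3_size

    predictor = paddle_infer.create_predictor(config)

Note the ordering constraint enforced in AnalysisConfig::SetXpuConfig above: EnableXpu must be called first, and l3_autotune_size may not exceed l3_size, mirroring the check that ExpRunWithRuntimeConfig applies to XpuRuntimeConfig.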