Unverified · Commit b62b384b authored by zhupengyang, committed by GitHub

refine xpu inference api (#54342)

Parent 8f65f72e
@@ -367,8 +367,10 @@ int FusedMultiTransformerXPUPass::FusedMultiTransformerXPUQuant(
       with_time_step,
       with_seq_lengths,
       with_src_mask);
-  int quant_weight_bits =
-      Has("quant_weight_bits") ? Get<int>("quant_weight_bits") : -1;
+  int quant_post_dynamic_weight_precision =
+      Has("quant_post_dynamic_weight_precision")
+          ? Get<int>("quant_post_dynamic_weight_precision")
+          : -1;

   int found_subgraph_count = 0;
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
@@ -421,7 +423,7 @@ int FusedMultiTransformerXPUPass::FusedMultiTransformerXPUQuant(
           w_node,
           nullptr,
           platform::errors::Fatal("w node should not be nullptr"));
-      if (quant_weight_bits == 8) {
+      if (quant_post_dynamic_weight_precision == 0) {
         PrepareWeight<int8_t>(
             graph, scope, block, w_node, &w_intx, &w_max, need_transpose);
       } else {
......
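Note that the renamed flag changes semantics as well as name: the old attribute carried a bit width (-1/8/16), the new one carries a precision code. A hedged standalone sketch of the correspondence, assuming the code values documented in the XpuConfig struct later in this commit (0 = int8, 1 = int16, 2 = float); the helper name is hypothetical and not part of the commit:

// Hypothetical helper: map the old bit-width flag onto the new codes.
inline int WeightBitsToPrecisionCode(int quant_weight_bits) {
  if (quant_weight_bits == 8) return 0;   // old "8" selected int8 weights
  if (quant_weight_bits == 16) return 1;  // old "16" selected int16 weights
  return 1;  // old "-1" (recommended) matches the new int16 default
}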
@@ -93,6 +93,25 @@ struct Argument {
  private:                                                                  \
   type__ field__##_;

+#define DECL_POINTER_ARGUMENT_FIELD(field__, Field, type__)                \
+ public:                                                                   \
+  type__& field__() {                                                      \
+    PADDLE_ENFORCE_EQ(                                                     \
+        Has(#field__),                                                     \
+        true,                                                              \
+        platform::errors::PreconditionNotMet("There is no such field"));   \
+    return field__##_;                                                     \
+  }                                                                        \
+  void Set##Field(type__ x) {                                              \
+    field__##_ = x;                                                        \
+    valid_fields_.insert(#field__);                                        \
+  }                                                                        \
+  DECL_ARGUMENT_FIELD_VALID(field__);                                      \
+  type__* field__##_ptr() { return &field__##_; }                          \
+                                                                           \
+ private:                                                                  \
+  type__ field__##_;
+
 #define DECL_ARGUMENT_FIELD_VALID(field__) \
   bool field__##_valid() { return Has(#field__); }
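For readers unfamiliar with these macros, below is a self-contained illustration of what the new pointer variant generates, using a hypothetical MiniArgument type with assert standing in for PADDLE_ENFORCE_EQ; the members mirror the expansion of DECL_POINTER_ARGUMENT_FIELD(xpu_l3_ptr, XpuL3Ptr, void*):

#include <cassert>
#include <set>
#include <string>

struct MiniArgument {  // hypothetical stand-in, not Paddle's Argument
  bool Has(const std::string& k) const { return valid_fields_.count(k) > 0; }

  // Hand-expanded DECL_POINTER_ARGUMENT_FIELD(xpu_l3_ptr, XpuL3Ptr, void*):
  void*& xpu_l3_ptr() {        // guarded reference accessor
    assert(Has("xpu_l3_ptr"));
    return xpu_l3_ptr_;
  }
  void SetXpuL3Ptr(void* x) {  // setter also records the field as valid
    xpu_l3_ptr_ = x;
    valid_fields_.insert("xpu_l3_ptr");
  }
  bool xpu_l3_ptr_valid() const { return Has("xpu_l3_ptr"); }
  void** xpu_l3_ptr_ptr() { return &xpu_l3_ptr_; }

 private:
  void* xpu_l3_ptr_{nullptr};
  std::set<std::string> valid_fields_;
};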
@@ -276,20 +295,48 @@ struct Argument {
   DECL_ARGUMENT_FIELD(lite_zero_copy, LiteZeroCopy, bool);

   DECL_ARGUMENT_FIELD(use_xpu, UseXpu, bool);
-  DECL_ARGUMENT_FIELD(xpu_l3_workspace_size, XpuL3WorkspaceSize, int);
   DECL_ARGUMENT_FIELD(xpu_locked, XpuLocked, bool);
-  DECL_ARGUMENT_FIELD(xpu_autotune, XpuAutotune, bool);
-  DECL_ARGUMENT_FIELD(xpu_autotune_file, XpuAutotuneFile, std::string);
   DECL_ARGUMENT_FIELD(xpu_precision, XpuPrecision, std::string);
-  DECL_ARGUMENT_FIELD(xpu_adaptive_seqlen, XpuAdaptiveSeqlen, bool);
-  DECL_ARGUMENT_FIELD(xpu_device_id, XpuDeviceId, int);
   DECL_ARGUMENT_FIELD(xpu_enable_multi_stream, XpuEnableMultiStream, bool);
-  DECL_ARGUMENT_FIELD(xpu_quant_post_dynamic_weight_bits,
-                      XpuQuantPostDynamicWeightBits,
+  // XpuConfig
+  DECL_ARGUMENT_FIELD(xpu_device_id, XpuDeviceId, int);
+  DECL_ARGUMENT_FIELD(xpu_l3_size, XpuL3Size, size_t);
+  DECL_POINTER_ARGUMENT_FIELD(xpu_l3_ptr, XpuL3Ptr, void*);
+  DECL_ARGUMENT_FIELD(xpu_l3_autotune_size, XpuL3AutotuneSize, size_t);
+  DECL_POINTER_ARGUMENT_FIELD(xpu_stream, XpuStream, void*);
+  DECL_ARGUMENT_FIELD(xpu_conv_autotune_level, XpuConvAutotuneLevel, int);
+  DECL_ARGUMENT_FIELD(xpu_conv_autotune_file, XpuConvAutotuneFile, std::string);
+  DECL_ARGUMENT_FIELD(xpu_conv_autotune_file_writeback,
+                      XpuConvAutotuneFileWriteback,
+                      bool);
+  DECL_ARGUMENT_FIELD(xpu_fc_autotune_level, XpuFcAutotuneLevel, int);
+  DECL_ARGUMENT_FIELD(xpu_fc_autotune_file, XpuFcAutotuneFile, std::string);
+  DECL_ARGUMENT_FIELD(xpu_fc_autotune_file_writeback,
+                      XpuFcAutotuneFileWriteback,
+                      bool);
+  DECL_ARGUMENT_FIELD(xpu_gemm_compute_precision, XpuGemmComputePrecision, int);
+  DECL_ARGUMENT_FIELD(xpu_transformer_softmax_optimize_level,
+                      XpuTransformerSoftmaxOptimizeLevel,
+                      int);
+  DECL_ARGUMENT_FIELD(xpu_transformer_encoder_adaptive_seqlen,
+                      XpuTransformerEncoderAdaptiveSeqlen,
+                      bool);
+  DECL_ARGUMENT_FIELD(xpu_quant_post_static_gelu_out_threshold,
+                      XpuQuantPostStaticGeluOutThreshold,
+                      float);
+  DECL_ARGUMENT_FIELD(xpu_quant_post_dynamic_activation_method,
+                      XpuQuantPostDynamicActivationMethod,
+                      int);
+  DECL_ARGUMENT_FIELD(xpu_quant_post_dynamic_weight_precision,
+                      XpuQuantPostDynamicWeightPrecision,
                       int);
   DECL_ARGUMENT_FIELD(xpu_quant_post_dynamic_op_types,
                       XpuQuantPostDynamicOpTypes,
                       std::vector<std::string>);
+  DECL_ARGUMENT_FIELD(xpu_lite_l3_locked, XpuLiteL3Locked, bool);
+  DECL_ARGUMENT_FIELD(xpu_lite_enable_multi_stream,
+                      XpuLiteEnableMultiStream,
+                      bool);

   DECL_ARGUMENT_FIELD(use_opencl, UseOpenCL, bool);
......
@@ -267,20 +267,41 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("enable_int8", new bool(lite_enable_int8));
       pass->Set("use_gpu", new bool(argument->use_gpu()));
       pass->Set("zero_copy", new bool(argument->lite_zero_copy()));
-      pass->Set("xpu_l3_workspace_size",
-                new int(argument->xpu_l3_workspace_size()));
+      pass->Set("xpu_device_id", new int(argument->xpu_device_id()));
+      pass->Set("xpu_l3_size", new size_t(argument->xpu_l3_size()));
+      pass->Set("xpu_l3_ptr", new void *(argument->xpu_l3_ptr()));
+      pass->Set("xpu_l3_autotune_size",
+                new size_t(argument->xpu_l3_autotune_size()));
+      pass->Set("xpu_stream", new void *(argument->xpu_stream()));
+      pass->Set("xpu_conv_autotune_level",
+                new int(argument->xpu_conv_autotune_level()));
+      pass->Set("xpu_conv_autotune_file",
+                new std::string(argument->xpu_conv_autotune_file()));
+      pass->Set("xpu_conv_autotune_file_writeback",
+                new bool(argument->xpu_conv_autotune_file_writeback()));
+      pass->Set("xpu_fc_autotune_level",
+                new int(argument->xpu_fc_autotune_level()));
+      pass->Set("xpu_fc_autotune_file",
+                new std::string(argument->xpu_fc_autotune_file()));
+      pass->Set("xpu_fc_autotune_file_writeback",
+                new bool(argument->xpu_fc_autotune_file_writeback()));
+      pass->Set("xpu_gemm_compute_precision",
+                new int(argument->xpu_gemm_compute_precision()));
+      pass->Set("xpu_transformer_softmax_optimize_level",
+                new int(argument->xpu_transformer_softmax_optimize_level()));
+      pass->Set("xpu_transformer_encoder_adaptive_seqlen",
+                new bool(argument->xpu_transformer_encoder_adaptive_seqlen()));
+      pass->Set(
+          "xpu_quant_post_static_gelu_out_threshold",
+          new float(argument->xpu_quant_post_static_gelu_out_threshold()));
+      pass->Set("xpu_quant_post_dynamic_activation_method",
+                new int(argument->xpu_quant_post_dynamic_activation_method()));
+      pass->Set("xpu_l3_locked", new bool(argument->xpu_lite_l3_locked()));
+      pass->Set("xpu_enable_multi_stream",
+                new bool(argument->xpu_lite_enable_multi_stream()));
       pass->Set("use_opencl", new bool(argument->use_opencl()));
       pass->Set("cpu_math_library_num_threads",
                 new int(argument->cpu_math_library_num_threads()));
-      pass->Set("locked", new bool(argument->xpu_locked()));
-      pass->Set("autotune", new bool(argument->xpu_autotune()));
-      pass->Set("autotune_file",
-                new std::string(argument->xpu_autotune_file()));
-      pass->Set("precision", new std::string(argument->xpu_precision()));
-      pass->Set("adaptive_seqlen", new bool(argument->xpu_adaptive_seqlen()));
-      pass->Set("xpu_device_id", new int(argument->xpu_device_id()));
-      pass->Set("enable_multi_stream",
-                new bool(argument->xpu_enable_multi_stream()));
       // NNAdapter Related
       pass->Set("use_nnadapter", new bool(argument->use_nnadapter()));
       pass->Set("nnadapter_model_cache_dir",
@@ -313,12 +334,10 @@ void IRPassManager::CreatePasses(Argument *argument,
       bool use_fc_padding = !fc_mkldnn_pass && argument->use_fc_padding();
       pass->Set("use_fc_padding", new bool(use_fc_padding));
     } else if (pass_name == "fused_multi_transformer_xpu_pass") {
-      auto op_types = argument->xpu_quant_post_dynamic_op_types();
-      if (std::count(op_types.begin(),
-                     op_types.end(),
-                     "fused_multi_transformer") > 0) {
-        pass->Set("quant_weight_bits",
-                  new int(argument->xpu_quant_post_dynamic_weight_bits()));
+      int quant_post_dynamic_weight_precision =
+          argument->xpu_quant_post_dynamic_weight_precision();
+      if (quant_post_dynamic_weight_precision == 0) {
+        pass->Set("quant_post_dynamic_weight_precision", new int(0));
       }
     }
     pre_pass = pass_name;
......
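One contract worth noting: an attribute stored with pass->Set("key", new T(...)) must be read back with the identical T. A sketch of the matching reads on the consumer side, as the lite subgraph pass below does (the snippet assumes it runs inside a Pass member function, so Get<T> is available):

// Inside a pass (sketch): the template argument must mirror the Set call,
// e.g. size_t for "xpu_l3_size" and int for the autotune levels.
size_t xpu_l3_size = Get<size_t>("xpu_l3_size");
int xpu_conv_autotune_level = Get<int>("xpu_conv_autotune_level");
bool writeback = Get<bool>("xpu_conv_autotune_file_writeback");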
@@ -249,17 +249,27 @@ void LiteSubgraphPass::SetUpEngine(
   bool use_gpu = Get<bool>("use_gpu");
   bool enable_int8 = Get<bool>("enable_int8");
-  bool use_xpu = Get<bool>("use_xpu");
-  int xpu_device_id = Get<int>("xpu_device_id");
-  int xpu_l3_workspace_size = Get<int>("xpu_l3_workspace_size");
   bool use_opencl = Get<bool>("use_opencl");
   int cpu_math_library_num_threads = Get<int>("cpu_math_library_num_threads");
-  bool locked = Get<bool>("locked");
-  bool autotune = Get<bool>("autotune");
-  std::string autotune_file = Get<std::string>("autotune_file");
-  std::string precision = Get<std::string>("precision");
-  bool adaptive_seqlen = Get<bool>("adaptive_seqlen");
-  bool enable_multi_stream = Get<bool>("enable_multi_stream");
+  bool use_xpu = Get<bool>("use_xpu");
+  int xpu_device_id = Get<int>("xpu_device_id");
+  size_t xpu_l3_size = Get<size_t>("xpu_l3_size");
+  bool xpu_l3_locked = Get<bool>("xpu_l3_locked");
+  bool xpu_conv_autotune = Get<int>("xpu_conv_autotune_level") > 0;
+  std::string xpu_conv_autotune_file =
+      Get<std::string>("xpu_conv_autotune_file");
+  int xpu_gemm_compute_precision = Get<int>("xpu_gemm_compute_precision");
+  std::string xpu_transformer_encoder_precision{"int16"};
+  if (xpu_gemm_compute_precision == 0) {
+    xpu_transformer_encoder_precision = "int8";
+  } else if (xpu_gemm_compute_precision == 1) {
+    xpu_transformer_encoder_precision = "int16";
+  } else if (xpu_gemm_compute_precision == 2) {
+    xpu_transformer_encoder_precision = "int31";
+  }
+  bool xpu_transformer_encoder_adaptive_seqlen =
+      Get<bool>("xpu_transformer_encoder_adaptive_seqlen");
+  bool xpu_enable_multi_stream = Get<bool>("xpu_enable_multi_stream");

   // NNAdapter Related
   bool use_nnadapter = Get<bool>("use_nnadapter");
   std::string nnadapter_model_cache_dir =
@@ -344,14 +354,15 @@ void LiteSubgraphPass::SetUpEngine(
   }
   config.cpu_math_library_num_threads = cpu_math_library_num_threads;

-  config.xpu_l3_workspace_size = xpu_l3_workspace_size;
+  config.xpu_l3_size = xpu_l3_size;
   config.device_id = xpu_device_id;
-  config.locked = locked;
-  config.autotune = autotune;
-  config.autotune_file = autotune_file;
-  config.precision = precision;
-  config.adaptive_seqlen = adaptive_seqlen;
-  config.enable_multi_stream = enable_multi_stream;
+  config.xpu_l3_locked = xpu_l3_locked;
+  config.xpu_conv_autotune = xpu_conv_autotune;
+  config.xpu_conv_autotune_file = xpu_conv_autotune_file;
+  config.xpu_transformer_encoder_precision = xpu_transformer_encoder_precision;
+  config.xpu_transformer_encoder_adaptive_seqlen =
+      xpu_transformer_encoder_adaptive_seqlen;
+  config.xpu_enable_multi_stream = xpu_enable_multi_stream;

   // NNAdapter Related
   config.nnadapter_model_cache_dir = nnadapter_model_cache_dir;
   config.nnadapter_device_names = nnadapter_device_names;
......
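The int-to-string block above bridges the new integer gemm_compute_precision to Paddle-Lite's string-typed multi_encoder precision. Restated as a standalone helper (hypothetical name, same mapping, "int16" as the fallback for any other value):

#include <string>

// Hypothetical helper mirroring the logic above:
// 0 -> "int8", 1 -> "int16", 2 -> "int31", otherwise "int16".
static std::string EncoderPrecisionFromGemm(int gemm_compute_precision) {
  switch (gemm_compute_precision) {
    case 0:
      return "int8";
    case 2:
      return "int31";
    default:
      return "int16";
  }
}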
@@ -172,22 +172,34 @@ void AnalysisConfig::DisableFCPadding() {
   Update();
 }

-void AnalysisConfig::EnableXpu(int l3_workspace_size,
-                               bool locked,
-                               bool autotune,
-                               const std::string &autotune_file,
-                               const std::string &precision,
-                               bool adaptive_seqlen,
+void AnalysisConfig::EnableXpu(int l3_size,
+                               bool l3_locked,
+                               bool conv_autotune,
+                               const std::string &conv_autotune_file,
+                               const std::string &transformer_encoder_precision,
+                               bool transformer_encoder_adaptive_seqlen,
                                bool enable_multi_stream) {
+#ifdef PADDLE_WITH_XPU
   use_xpu_ = true;
-  xpu_l3_workspace_size_ = l3_workspace_size;
-  xpu_locked_ = locked;
-  xpu_autotune_ = autotune;
-  xpu_autotune_file_ = autotune_file;
-  xpu_precision_ = precision;
-  xpu_adaptive_seqlen_ = adaptive_seqlen;
-  xpu_enable_multi_stream_ = enable_multi_stream;
+  xpu_config_.l3_size = l3_size;
+  xpu_config_.conv_autotune_level = conv_autotune;
+  xpu_config_.conv_autotune_file = conv_autotune_file;
+  if (transformer_encoder_precision == "int8") {
+    xpu_config_.gemm_compute_precision = 0;
+  } else if (transformer_encoder_precision == "int16") {
+    xpu_config_.gemm_compute_precision = 1;
+  } else if (transformer_encoder_precision == "int31") {
+    xpu_config_.gemm_compute_precision = 2;
+  }
+  xpu_config_.transformer_encoder_adaptive_seqlen =
+      transformer_encoder_adaptive_seqlen;
+  xpu_lite_l3_locked_ = l3_locked;
+  xpu_lite_enable_multi_stream_ = enable_multi_stream;
   Update();
+#else
+  PADDLE_THROW(platform::errors::PreconditionNotMet(
+      "To use XPU inference, please compile with option 'WITH_XPU' first."));
+#endif
 }

 void AnalysisConfig::SetXpuDeviceId(int device_id) {
@@ -195,15 +207,22 @@ void AnalysisConfig::SetXpuDeviceId(int device_id) {
       true,
       platform::errors::PreconditionNotMet(
           "Should call EnableXpu before SetXpuDeviceId."));
-  xpu_device_id_ = device_id;
+  xpu_config_.device_id = device_id;
   Update();
 }

-void AnalysisConfig::SetXpuConfig(
-    int quant_post_dynamic_weight_bits,
-    const std::vector<std::string> &quant_post_dynamic_op_types) {
-  xpu_quant_post_dynamic_weight_bits_ = quant_post_dynamic_weight_bits;
-  xpu_quant_post_dynamic_op_types_ = quant_post_dynamic_op_types;
+void AnalysisConfig::SetXpuConfig(const XpuConfig &config) {
+  PADDLE_ENFORCE(use_xpu_,
+                 platform::errors::PreconditionNotMet(
+                     "Should call EnableXpu before SetXpuConfig."));
+  PADDLE_ENFORCE_LE(
+      config.l3_autotune_size,
+      config.l3_size,
+      phi::errors::InvalidArgument(
+          "l3_autotune_size(%zu) should be less than or equal to "
+          "l3_size(%zu).",
+          config.l3_autotune_size,
+          config.l3_size));
+  xpu_config_ = config;
   Update();
 }
@@ -494,16 +513,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   // XPU related.
   CP_MEMBER(use_xpu_);
-  CP_MEMBER(xpu_device_id_);
-  CP_MEMBER(xpu_l3_workspace_size_);
-  CP_MEMBER(xpu_locked_);
-  CP_MEMBER(xpu_autotune_);
-  CP_MEMBER(xpu_autotune_file_);
-  CP_MEMBER(xpu_precision_);
-  CP_MEMBER(xpu_adaptive_seqlen_);
-  CP_MEMBER(xpu_enable_multi_stream_);
-  CP_MEMBER(xpu_quant_post_dynamic_weight_bits_);
-  CP_MEMBER(xpu_quant_post_dynamic_op_types_);
+  CP_MEMBER(xpu_config_);
+  CP_MEMBER(xpu_lite_l3_locked_);
+  CP_MEMBER(xpu_lite_enable_multi_stream_);

   // Lite OpenCL Related
   CP_MEMBER(use_opencl_);
@@ -1033,7 +1045,6 @@ std::string AnalysisConfig::SerializeInfoCache() {
   ss << exec_stream_;
   ss << use_fc_padding_;
   ss << gpu_device_id_;
-  ss << xpu_device_id_;
   ss << memory_pool_init_size_mb_;

   ss << use_tensorrt_;
@@ -1080,17 +1091,26 @@ std::string AnalysisConfig::SerializeInfoCache() {
   ss << use_lite_;
   ss << use_xpu_;
-  ss << xpu_l3_workspace_size_;
-  ss << xpu_locked_;
-  ss << xpu_autotune_;
-  ss << xpu_autotune_file_;
-  ss << xpu_precision_;
-  ss << xpu_adaptive_seqlen_;
-  ss << xpu_enable_multi_stream_;
-  ss << xpu_quant_post_dynamic_weight_bits_;
-  for (auto op_type : xpu_quant_post_dynamic_op_types_) {
-    ss << op_type;
-  }
+  ss << xpu_config_.device_id;
+  ss << xpu_config_.l3_size;
+  ss << xpu_config_.l3_ptr;
+  ss << xpu_config_.l3_autotune_size;
+  ss << xpu_config_.stream;
+  ss << xpu_config_.conv_autotune_level;
+  ss << xpu_config_.conv_autotune_file;
+  ss << xpu_config_.conv_autotune_file_writeback;
+  ss << xpu_config_.fc_autotune_level;
+  ss << xpu_config_.fc_autotune_file;
+  ss << xpu_config_.fc_autotune_file_writeback;
+  ss << xpu_config_.gemm_compute_precision;
+  ss << xpu_config_.transformer_softmax_optimize_level;
+  ss << xpu_config_.transformer_encoder_adaptive_seqlen;
+  ss << xpu_config_.quant_post_static_gelu_out_threshold;
+  ss << xpu_config_.quant_post_dynamic_activation_method;
+  ss << xpu_config_.quant_post_dynamic_weight_precision;
+  for (auto type : xpu_config_.quant_post_dynamic_op_types) ss << type;
+  ss << xpu_lite_l3_locked_;
+  ss << xpu_lite_enable_multi_stream_;

   ss << thread_local_stream_;
@@ -1318,16 +1338,49 @@ std::string AnalysisConfig::Summary() {
   // xpu info
   os.InsertRow({"use_xpu", use_xpu_ ? "true" : "false"});
   if (use_xpu_) {
-    os.InsertRow({"xpu_device_id", std::to_string(xpu_device_id_)});
-    os.InsertRow(
-        {"xpu_l3_workspace_size", std::to_string(xpu_l3_workspace_size_)});
-    os.InsertRow({"xpu_quant_post_dynamic_weight_bits",
-                  std::to_string(xpu_quant_post_dynamic_weight_bits_)});
-    std::vector<std::string> op_types{"xpu_quant_post_dynamic_op_types"};
-    for (auto op_type : xpu_quant_post_dynamic_op_types_) {
-      op_types.push_back(op_type);
-    }
-    os.InsertRow(op_types);
+    os.InsertRow({"xpu_device_id", std::to_string(xpu_config_.device_id)});
+    os.InsertRow({"xpu_l3_size", std::to_string(xpu_config_.l3_size)});
+    os.InsertRow(
+        {"xpu_l3_ptr",
+         std::to_string(reinterpret_cast<int64_t>(xpu_config_.l3_ptr))});
+    os.InsertRow({"xpu_l3_autotune_size",
+                  std::to_string(xpu_config_.l3_autotune_size)});
+    os.InsertRow(
+        {"xpu_stream",
+         std::to_string(reinterpret_cast<int64_t>(xpu_config_.stream))});
+    os.InsertRow({"xpu_conv_autotune_level",
+                  std::to_string(xpu_config_.conv_autotune_level)});
+    os.InsertRow({"xpu_conv_autotune_file", xpu_config_.conv_autotune_file});
+    os.InsertRow({"xpu_conv_autotune_file_writeback",
+                  std::to_string(xpu_config_.conv_autotune_file_writeback)});
+    os.InsertRow({"xpu_fc_autotune_level",
+                  std::to_string(xpu_config_.fc_autotune_level)});
+    os.InsertRow({"xpu_fc_autotune_file", xpu_config_.fc_autotune_file});
+    os.InsertRow({"xpu_fc_autotune_file_writeback",
+                  std::to_string(xpu_config_.fc_autotune_file_writeback)});
+    os.InsertRow({"xpu_gemm_compute_precision",
+                  std::to_string(xpu_config_.gemm_compute_precision)});
+    os.InsertRow(
+        {"xpu_transformer_softmax_optimize_level",
+         std::to_string(xpu_config_.transformer_softmax_optimize_level)});
+    os.InsertRow(
+        {"xpu_transformer_encoder_adaptive_seqlen",
+         std::to_string(xpu_config_.transformer_encoder_adaptive_seqlen)});
+    os.InsertRow(
+        {"xpu_quant_post_static_gelu_out_threshold",
+         std::to_string(xpu_config_.quant_post_static_gelu_out_threshold)});
+    os.InsertRow(
+        {"xpu_quant_post_dynamic_activation_method",
+         std::to_string(xpu_config_.quant_post_dynamic_activation_method)});
+    os.InsertRow(
+        {"xpu_quant_post_dynamic_weight_precision",
+         std::to_string(xpu_config_.quant_post_dynamic_weight_precision)});
+    std::vector<std::string> quant_post_dynamic_op_types_info =
+        xpu_config_.quant_post_dynamic_op_types;
+    quant_post_dynamic_op_types_info.insert(
+        quant_post_dynamic_op_types_info.begin(),
+        "xpu_quant_post_dynamic_op_types");
+    os.InsertRow(quant_post_dynamic_op_types_info);
   }
   os.InsetDivider();
......
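The two preconditions enforced above drive the call order for users: EnableXpu() must come first, and any autotune slice must fit inside the configured L3 budget. A minimal usage sketch (model paths hypothetical; header name assumed from the standard inference package layout). Note that SetXpuConfig() replaces the whole stored struct, so device_id is best set on the XpuConfig itself rather than via an earlier SetXpuDeviceId() call:

#include "paddle_inference_api.h"  // assumption: standard Paddle Inference header

int main() {
  paddle_infer::Config config;
  config.SetModel("model_dir/model", "model_dir/params");  // hypothetical paths
  config.EnableXpu();  // required before SetXpuConfig / SetXpuDeviceId

  paddle_infer::XpuConfig xpu_config;
  xpu_config.device_id = 0;
  xpu_config.l3_size = 16 * 1024 * 1024;
  xpu_config.l3_autotune_size = 4 * 1024 * 1024;  // must satisfy <= l3_size
  config.SetXpuConfig(xpu_config);                // throws if the check fails

  auto predictor = paddle_infer::CreatePredictor(config);
  return predictor ? 0 : 1;
}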
@@ -389,7 +389,7 @@ bool AnalysisPredictor::Init(
   }
 #endif

 #if defined(PADDLE_WITH_XPU)
-  if (config_.use_xpu_) {
+  if (config_.use_xpu_ && !config_.use_lite_) {
     private_context_ = true;
     if (!status_is_cloned_ && config_.external_stream_enabled()) {
       predictor_stream_ = config_.GetExecStream();
@@ -1418,14 +1418,8 @@ void AnalysisPredictor::PrepareArgument() {
     argument_->SetLitePassesFilter(config_.lite_passes_filter_);
     argument_->SetLiteOpsFilter(config_.lite_ops_filter_);
     argument_->SetLiteZeroCopy(config_.lite_zero_copy_);
-    argument_->SetXpuL3WorkspaceSize(config_.xpu_l3_workspace_size_);
-    argument_->SetXpuLocked(config_.xpu_locked_);
-    argument_->SetXpuAutotune(config_.xpu_autotune_);
-    argument_->SetXpuAutotuneFile(config_.xpu_autotune_file_);
-    argument_->SetXpuPrecision(config_.xpu_precision_);
-    argument_->SetXpuAdaptiveSeqlen(config_.xpu_adaptive_seqlen_);
-    argument_->SetXpuDeviceId(config_.xpu_device_id_);
-    argument_->SetXpuEnableMultiStream(config_.xpu_enable_multi_stream_);
+    argument_->SetXpuLocked(config_.xpu_lite_l3_locked_);
+    argument_->SetXpuEnableMultiStream(config_.xpu_lite_enable_multi_stream_);
     argument_->SetUseOpenCL(config_.use_opencl_);
     // NNAdapter related
     argument_->SetUseNNAdapter(config_.NNAdapter().use_nnadapter);
@@ -1506,21 +1500,36 @@ void AnalysisPredictor::PrepareArgument() {
   }
 #endif

-#ifdef PADDLE_WITH_XPU
   argument_->SetUseXpu(config_.use_xpu_);
-  argument_->SetXpuL3WorkspaceSize(config_.xpu_l3_workspace_size_);
-  argument_->SetXpuLocked(config_.xpu_locked_);
-  argument_->SetXpuAutotune(config_.xpu_autotune_);
-  argument_->SetXpuAutotuneFile(config_.xpu_autotune_file_);
-  argument_->SetXpuPrecision(config_.xpu_precision_);
-  argument_->SetXpuAdaptiveSeqlen(config_.xpu_adaptive_seqlen_);
-  argument_->SetXpuDeviceId(config_.xpu_device_id_);
-  argument_->SetXpuEnableMultiStream(config_.xpu_enable_multi_stream_);
-  argument_->SetXpuQuantPostDynamicWeightBits(
-      config_.xpu_quant_post_dynamic_weight_bits_);
+  argument_->SetXpuDeviceId(config_.xpu_config_.device_id);
+  argument_->SetXpuL3Size(config_.xpu_config_.l3_size);
+  argument_->SetXpuL3Ptr(config_.xpu_config_.l3_ptr);
+  argument_->SetXpuL3AutotuneSize(config_.xpu_config_.l3_autotune_size);
+  argument_->SetXpuStream(config_.xpu_config_.stream);
+  argument_->SetXpuConvAutotuneLevel(config_.xpu_config_.conv_autotune_level);
+  argument_->SetXpuConvAutotuneFile(config_.xpu_config_.conv_autotune_file);
+  argument_->SetXpuConvAutotuneFileWriteback(
+      config_.xpu_config_.conv_autotune_file_writeback);
+  argument_->SetXpuFcAutotuneLevel(config_.xpu_config_.fc_autotune_level);
+  argument_->SetXpuFcAutotuneFile(config_.xpu_config_.fc_autotune_file);
+  argument_->SetXpuFcAutotuneFileWriteback(
+      config_.xpu_config_.fc_autotune_file_writeback);
+  argument_->SetXpuGemmComputePrecision(
+      config_.xpu_config_.gemm_compute_precision);
+  argument_->SetXpuTransformerSoftmaxOptimizeLevel(
+      config_.xpu_config_.transformer_softmax_optimize_level);
+  argument_->SetXpuTransformerEncoderAdaptiveSeqlen(
+      config_.xpu_config_.transformer_encoder_adaptive_seqlen);
+  argument_->SetXpuQuantPostStaticGeluOutThreshold(
+      config_.xpu_config_.quant_post_static_gelu_out_threshold);
+  argument_->SetXpuQuantPostDynamicActivationMethod(
+      config_.xpu_config_.quant_post_dynamic_activation_method);
+  argument_->SetXpuQuantPostDynamicWeightPrecision(
+      config_.xpu_config_.quant_post_dynamic_weight_precision);
   argument_->SetXpuQuantPostDynamicOpTypes(
-      config_.xpu_quant_post_dynamic_op_types_);
-#endif
+      config_.xpu_config_.quant_post_dynamic_op_types);
+  argument_->SetXpuLiteL3Locked(config_.xpu_lite_l3_locked_);
+  argument_->SetXpuLiteEnableMultiStream(config_.xpu_lite_enable_multi_stream_);

   auto *pass_builder = config_.pass_builder();
   // TODO(inference): Need to reconstruct the pass_builder, pass should be
@@ -2076,9 +2085,36 @@ bool AnalysisPredictor::ZeroCopyRun() {
   }
 #endif

+#ifdef PADDLE_WITH_XPU
+  InferXPUContext *infer_xpu_ctx = nullptr;
+  if (config_.use_xpu_ && !config_.use_lite_) {
+    PADDLE_ENFORCE(
+        private_context_,
+        paddle::platform::errors::Fatal(
+            "Must use private context if run predictor on xpu place."));
+    auto *dev_ctxs = reinterpret_cast<const std::map<
+        phi::Place,
+        std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>(
+        this->GetDeviceContexts());
+    infer_xpu_ctx =
+        static_cast<InferXPUContext *>(dev_ctxs->at(place_).get().get());
+    infer_xpu_ctx->SetStream(predictor_stream_);
+    infer_xpu_ctx->SetL3Info(config_.xpu_config_.l3_size,
+                             config_.xpu_config_.l3_ptr,
+                             config_.xpu_config_.l3_autotune_size,
+                             place_);
+  }
+#endif
+
   executor_->Run();
   inference::DisplayMemoryInfo(place_, "after run");

+#ifdef PADDLE_WITH_XPU
+  if (config_.use_xpu_ && !config_.use_lite_ && infer_xpu_ctx != nullptr) {
+    infer_xpu_ctx->L3CacheAutotune();
+  }
+#endif
+
   if (config_.shape_range_info_collected()) {
     CollectShapeRangeInfo();
   }
@@ -2148,18 +2184,6 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) {
 bool AnalysisPredictor::ExpRunWithRuntimeConfig(void *config) {
 #ifdef PADDLE_WITH_XPU
-  PADDLE_ENFORCE(
-      private_context_,
-      paddle::platform::errors::Fatal(
-          "Must use private context if run predictor with external config."));
-  auto *dev_ctxs = reinterpret_cast<const std::map<
-      phi::Place,
-      std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>(
-      this->GetDeviceContexts());
-  auto *dev_ctx =
-      static_cast<InferXPUContext *>(dev_ctxs->at(place_).get().get());
-
   auto xpu_runtime_config =
       reinterpret_cast<paddle_infer::experimental::XpuRuntimeConfig *>(config);
   auto *stream = xpu_runtime_config->stream;
@@ -2167,12 +2191,10 @@ bool AnalysisPredictor::ExpRunWithRuntimeConfig(void *config) {
     paddle::platform::XPUStreamSync(
         static_cast<paddle::xpuStream>(predictor_stream_));
     predictor_stream_ = stream;
-    dev_ctx->SetStream(stream);
   }

-  size_t l3_size = xpu_runtime_config->l3_size;
-  void *l3_ptr = xpu_runtime_config->l3_ptr;
-  size_t l3_autotune_size = xpu_runtime_config->l3_autotune_size;
+  auto l3_size = xpu_runtime_config->l3_size;
+  auto l3_autotune_size = xpu_runtime_config->l3_autotune_size;
   PADDLE_ENFORCE_LE(
       l3_autotune_size,
       l3_size,
@@ -2180,11 +2202,11 @@ bool AnalysisPredictor::ExpRunWithRuntimeConfig(void *config) {
          "l3_autotune_size(%zu) should be less than or equal to l3_size(%zu).",
           l3_autotune_size,
           l3_size));
-  dev_ctx->SetL3Info(l3_size, l3_ptr, l3_autotune_size, place_);
+  config_.xpu_config_.l3_size = l3_size;
+  config_.xpu_config_.l3_ptr = xpu_runtime_config->l3_ptr;
+  config_.xpu_config_.l3_autotune_size = l3_autotune_size;

-  bool ret = ZeroCopyRun();
-  dev_ctx->L3CacheAutotune();
-  return ret;
+  return ZeroCopyRun();
 #endif
   return false;
 }
......
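With the context bookkeeping moved into ZeroCopyRun(), ExpRunWithRuntimeConfig() now only folds the per-run overrides back into xpu_config_. A hedged sketch of the caller side, following the RUN_WITH_RUNTIME_CONFIG macro in the test at the end of this commit; the field set is taken from the code above, and the exact XpuRuntimeConfig declaration is not part of this diff:

// Assumes a predictor created as in the SetXpuConfig sketch earlier.
paddle_infer::experimental::XpuRuntimeConfig runtime_config;
runtime_config.stream = nullptr;      // nullptr: keep the current stream
runtime_config.l3_size = 10 * 1024 * 1024;
runtime_config.l3_ptr = nullptr;      // nullptr: let the runtime allocate L3
runtime_config.l3_autotune_size = 0;  // 0 disables L3 autotune
paddle_infer::experimental::InternalUtils::RunWithRuntimeConfig(
    predictor.get(), &runtime_config);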
@@ -76,6 +76,77 @@ struct LiteNNAdapterConfig {
   LiteNNAdapterConfig& Disable();
 };

+struct PD_INFER_DECL XpuConfig {
+  // Select which xpu device to run model.
+  int device_id{0};
+
+  // Available l3 size (Byte)
+  // For kunlun1, max l3_size is 16773120 Byte
+  // For kunlun2, max l3_size is 67104768 Byte
+  size_t l3_size{0};
+  // If l3_ptr is not nullptr, it is used as l3 buffer.
+  // If l3_ptr is nullptr, a new l3 buffer will be created.
+  void* l3_ptr{nullptr};
+  // Available l3 size for autotune.
+  // If l3_autotune_size is 0, autotune is closed.
+  // Note: The remaining l3 size (l3_size - l3_autotune_size) is for
+  // kernels (both paddle/xdnn kernels).
+  size_t l3_autotune_size{0};
+
+  // Stream for execution.
+  // If stream is nullptr, the default stream will be used.
+  void* stream{nullptr};
+
+  // Conv autotune level. Default 0 means no autotune.
+  // Note: Paddle-Lite only.
+  int conv_autotune_level{0};
+  // Base conv autotune info is read from conv_autotune_file.
+  // Note: Paddle-Lite only.
+  std::string conv_autotune_file;
+  // Whether to write new conv autotune info to conv_autotune_file.
+  // Note: Paddle-Lite only.
+  bool conv_autotune_file_writeback{false};
+
+  // Fc autotune level. Optional values are 0-9. Default 0 means no autotune.
+  // Note: Paddle-Lite only.
+  int fc_autotune_level{0};
+  // Base fc autotune info is read from fc_autotune_file.
+  // Note: Paddle-Lite only.
+  std::string fc_autotune_file;
+  // Whether to write new fc autotune info to fc_autotune_file.
+  // Note: Paddle-Lite only.
+  bool fc_autotune_file_writeback{false};
+
+  // Gemm compute precision. Optional values are 0(int8), 1(int16), 2(int31).
+  // Note: gemm_compute_precision has no effect on quanted ops of quant model.
+  // Note: Paddle-Lite only.
+  int gemm_compute_precision{1};
+  // Which method to optimize softmax in transformer structure. Optional
+  // values are 0, 1, 2.
+  // Note: Paddle-Lite only.
+  int transformer_softmax_optimize_level{0};
+  // Whether to enable the adaptive_seqlen optimization on transformer
+  // encoders.
+  // Note: Paddle-Lite only.
+  bool transformer_encoder_adaptive_seqlen{true};
+
+  // Gelu out max threshold is limited to quant_post_static_gelu_out_threshold
+  // if use static post-quantization.
+  // Note: Paddle-Lite only.
+  float quant_post_static_gelu_out_threshold{10.f};
+  // Activation method if use dynamic post-quantization.
+  // For kunlun1, optional values are 0(per_tensor), 1(per_batch), 2(per_head).
+  // For kunlun2, optional values are 0(per_tensor) or non-zero(every_16).
+  // Note: Paddle-Lite only.
+  int quant_post_dynamic_activation_method{0};
+  // Preprocess weights to quant_post_dynamic_weight_precision if use dynamic
+  // post-quantization. Optional values are 0, 1, 2.
+  // * If 0, preprocess weights to int8.
+  // * If 1, preprocess weights to int16.
+  // * If 2, preprocess weights to float.
+  // Note: PaddleInference only.
+  int quant_post_dynamic_weight_precision{1};
+  // Op types to apply dynamic post-quantization to.
+  std::vector<std::string> quant_post_dynamic_op_types;
+};
+
 struct DistConfig {
   bool use_dist_model() const { return use_dist_model_; }
   void EnableDistModel(bool use_dist_model) {
@@ -271,42 +342,46 @@ struct PD_INFER_DECL AnalysisConfig {
   /// \brief Turn on XPU.
   ///
-  /// \param l3_workspace_size The size of the video memory allocated by the l3
+  /// \param l3_size The size of the video memory allocated by the l3
   /// cache, the maximum is 16M.
-  /// \param locked Whether the allocated L3 cache can be locked. If false,
+  /// \param l3_locked Whether the allocated L3 cache can be locked. If false,
   /// it means that the L3 cache is not locked, and the allocated L3
   /// cache can be shared by multiple models, and multiple models
   /// sharing the L3 cache will be executed sequentially on the card.
-  /// \param autotune Whether to autotune the conv operator in the model. If
-  /// true, when the conv operator of a certain dimension is executed
+  /// \param conv_autotune Whether to autotune the conv operator in the model.
+  /// If true, when the conv operator of a certain dimension is executed
   /// for the first time, it will automatically search for a better
   /// algorithm to improve the performance of subsequent conv operators
   /// of the same dimension.
-  /// \param autotune_file Specify the path of the autotune file. If
-  /// autotune_file is specified, the algorithm specified in the
+  /// \param conv_autotune_file Specify the path of the autotune file. If
+  /// conv_autotune_file is specified, the algorithm specified in the
   /// file will be used and autotune will not be performed again.
-  /// \param precision Calculation accuracy of multi_encoder
-  /// \param adaptive_seqlen Is the input of multi_encoder variable length
+  /// \param transformer_encoder_precision Calculation accuracy of
+  /// multi_encoder
+  /// \param transformer_encoder_adaptive_seqlen Whether the input of
+  /// multi_encoder is variable length
   /// \param enable_multi_stream Whether to enable the multi stream of xpu.
   ///
-  void EnableXpu(int l3_workspace_size = 0xfffc00,
-                 bool locked = false,
-                 bool autotune = true,
-                 const std::string& autotune_file = "",
-                 const std::string& precision = "int16",
-                 bool adaptive_seqlen = false,
+  void EnableXpu(int l3_size = 0xfffc00,
+                 bool l3_locked = false,
+                 bool conv_autotune = true,
+                 const std::string& conv_autotune_file = "",
+                 const std::string& transformer_encoder_precision = "int16",
+                 bool transformer_encoder_adaptive_seqlen = false,
                  bool enable_multi_stream = false);

   ///
   /// \brief configs of XPU
   ///
-  /// \param quant_post_dynamic_weight_bits Weight bits used in dynamic post
-  /// quantization. Optional value: -1, 8, 16. Default value is -1, means using
-  /// the recommended way.
-  /// \param quant_post_dynamic_op_types Ops used in dynamic post quantization.
+  /// \param config Configs for xpu. See XpuConfig for more details.
   ///
-  void SetXpuConfig(
-      int quant_post_dynamic_weight_bits = -1,
-      const std::vector<std::string>& quant_post_dynamic_op_types = {});
+  void SetXpuConfig(const XpuConfig& config);
+
+  ///
+  /// \brief Get configs of xpu
+  ///
+  /// \return XpuConfig The configs of xpu.
+  ///
+  XpuConfig xpu_config() { return xpu_config_; }

   ///
   /// \brief configs of IPU
@@ -462,7 +537,7 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   /// \return int The XPU device id.
   ///
-  int xpu_device_id() const { return xpu_device_id_; }
+  int xpu_device_id() const { return xpu_config_.device_id; }

   /// \brief Get the number of IPU device .
   ///
   /// \return int The number of IPU device.
@@ -1191,16 +1266,9 @@ struct PD_INFER_DECL AnalysisConfig {
   // XPU related.
   bool use_xpu_{false};
-  int xpu_device_id_{0};
-  int xpu_l3_workspace_size_{0};
-  bool xpu_locked_;
-  bool xpu_autotune_;
-  std::string xpu_autotune_file_;
-  std::string xpu_precision_;
-  bool xpu_adaptive_seqlen_;
-  bool xpu_enable_multi_stream_;
-  int xpu_quant_post_dynamic_weight_bits_{-1};
-  std::vector<std::string> xpu_quant_post_dynamic_op_types_;
+  XpuConfig xpu_config_;
+  bool xpu_lite_l3_locked_{false};
+  bool xpu_lite_enable_multi_stream_{false};

   // LITE OPENCL SETTINGS
   bool use_opencl_{false};
......
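Of the new fields, only the quant_post_dynamic_* group is consumed by Paddle Inference itself; the rest are forwarded to Paddle-Lite, per the comments above. A hedged sketch selecting int8 dynamic post-quantized weights; the op type string comes from the old pass-side check removed earlier in this commit, and whether the new pass still consults quant_post_dynamic_op_types is not shown in this diff:

// Hedged sketch: request int8 dynamic post-quantized weights.
paddle_infer::XpuConfig xpu_config;
xpu_config.quant_post_dynamic_weight_precision = 0;  // 0: int8, 1: int16, 2: float
xpu_config.quant_post_dynamic_op_types = {"fused_multi_transformer"};
// config.SetXpuConfig(xpu_config);  // after config.EnableXpu(), as above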
@@ -47,6 +47,7 @@ namespace paddle_infer {
 using PrecisionType = paddle::AnalysisConfig::Precision;
 using Config = paddle::AnalysisConfig;
 using DistConfig = paddle::DistConfig;
+using XpuConfig = paddle::XpuConfig;

 ///
 /// \class Predictor
......
@@ -154,20 +154,20 @@ void PD_ConfigEnableORTOptimization(__pd_keep PD_Config* pd_config) {
 }

 void PD_ConfigEnableXpu(__pd_keep PD_Config* pd_config,
-                        int32_t l3_workspace_size,
-                        PD_Bool locked,
-                        PD_Bool autotune,
-                        const char* autotune_file,
-                        const char* precision,
-                        PD_Bool adaptive_seqlen,
+                        int32_t l3_size,
+                        PD_Bool l3_locked,
+                        PD_Bool conv_autotune,
+                        const char* conv_autotune_file,
+                        const char* transformer_encoder_precision,
+                        PD_Bool transformer_encoder_adaptive_seqlen,
                         PD_Bool enable_multi_stream) {
   CHECK_AND_CONVERT_PD_CONFIG;
-  config->EnableXpu(l3_workspace_size,
-                    locked,
-                    autotune,
-                    autotune_file,
-                    precision,
-                    adaptive_seqlen,
+  config->EnableXpu(l3_size,
+                    l3_locked,
+                    conv_autotune,
+                    conv_autotune_file,
+                    transformer_encoder_precision,
+                    transformer_encoder_adaptive_seqlen,
                     enable_multi_stream);
 }
......
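A hedged sketch of the same call through the C API. Only PD_ConfigEnableXpu's new signature is shown in this diff; PD_ConfigCreate and PD_ConfigDestroy are assumed from the existing C API surface:

// Assumption: PD_ConfigCreate/PD_ConfigDestroy exist as in the current C API.
PD_Config* config = PD_ConfigCreate();
PD_ConfigEnableXpu(config,
                   /*l3_size=*/16 * 1024 * 1024,
                   /*l3_locked=*/0,
                   /*conv_autotune=*/1,
                   /*conv_autotune_file=*/"",
                   /*transformer_encoder_precision=*/"int16",
                   /*transformer_encoder_adaptive_seqlen=*/0,
                   /*enable_multi_stream=*/0);
PD_ConfigDestroy(config);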
@@ -193,23 +193,22 @@ func (config *Config) EnableORTOptimization() {
 ///
 /// \brief Turn on XPU.
 ///
-/// \param l3_workspace_size The size of the video memory allocated by the l3 cache, the maximum is 16M.
-/// \param locked Whether the allocated L3 cache can be locked. If false, it means that the L3 cache is not locked, and the allocated L3 cache can be shared by multiple models, and multiple models sharing the L3 cache will be executed sequentially on the card.
-/// \param autotune Whether to autotune the conv operator in the model. If true, when the conv operator of a certain dimension is executed for the first time, it will automatically search for a better algorithm to improve the performance of subsequent conv operators of the same dimension.
-/// \param autotune_file Specify the path of the autotune file. If autotune_file is specified, the algorithm specified in the file will be used and autotune will not be performed again.
-/// \param precision Calculation accuracy of multi_encoder
-/// \param adaptive_seqlen Is the input of multi_encoder variable length
+/// \param l3Size The size of the video memory allocated by the l3 cache, the maximum is 16M.
+/// \param l3Locked Whether the allocated L3 cache can be locked. If false, it means that the L3 cache is not locked, and the allocated L3 cache can be shared by multiple models, and multiple models sharing the L3 cache will be executed sequentially on the card.
+/// \param convAutotune Whether to autotune the conv operator in the model. If true, when the conv operator of a certain dimension is executed for the first time, it will automatically search for a better algorithm to improve the performance of subsequent conv operators of the same dimension.
+/// \param convAutotuneFile Specify the path of the autotune file. If convAutotuneFile is specified, the algorithm specified in the file will be used and autotune will not be performed again.
+/// \param transformerEncoderPrecision Calculation accuracy of multi_encoder
+/// \param transformerEncoderAdaptiveSeqlen Whether the input of multi_encoder is variable length
 /// \param enable_multi_stream Whether to enable the multi stream of xpu
 ///
-func (config *Config) EnableXpu(l3WorkspaceSize int32, locked bool, autotune bool, autotuneFile string, precision string, adaptiveSeqlen bool, enableMultiStream bool) {
-	cAutotuneFile := C.CString(autotuneFile)
-	cPrecision := C.CString(precision)
+func (config *Config) EnableXpu(l3Size int32, l3Locked bool, convAutotune bool, convAutotuneFile string, transformerEncoderPrecision string, transformerEncoderAdaptiveSeqlen bool, enableMultiStream bool) {
+	cConvAutotuneFile := C.CString(convAutotuneFile)
+	cTransformerEncoderPrecision := C.CString(transformerEncoderPrecision)
 	defer func() {
-		C.free(unsafe.Pointer(cAutotuneFile))
-		C.free(unsafe.Pointer(cPrecision))
+		C.free(unsafe.Pointer(cConvAutotuneFile))
+		C.free(unsafe.Pointer(cTransformerEncoderPrecision))
 	}()
-	C.PD_ConfigEnableXpu(config.c, C.int32_t(l3WorkspaceSize), cvtGoBoolToPD(locked), cvtGoBoolToPD(autotune),
-		cAutotuneFile, cPrecision, cvtGoBoolToPD(adaptiveSeqlen), cvtGoBoolToPD(enableMultiStream))
+	C.PD_ConfigEnableXpu(config.c, C.int32_t(l3Size), cvtGoBoolToPD(l3Locked), cvtGoBoolToPD(convAutotune), cConvAutotuneFile, cTransformerEncoderPrecision, cvtGoBoolToPD(transformerEncoderAdaptiveSeqlen), cvtGoBoolToPD(enableMultiStream))
 }
......
@@ -56,16 +56,14 @@ paddle::lite_api::PaddlePredictor* EngineManager::Create(
 #endif

 #ifdef LITE_SUBGRAPH_WITH_XPU
-  // Deprecated in Paddle-Lite release/v2.8
-  lite_cxx_config.set_xpu_workspace_l3_size_per_thread(
-      cfg.xpu_l3_workspace_size);
-  lite_cxx_config.set_xpu_l3_cache_method(cfg.xpu_l3_workspace_size,
-                                          cfg.locked);
-  lite_cxx_config.set_xpu_conv_autotune(cfg.autotune, cfg.autotune_file);
-  lite_cxx_config.set_xpu_multi_encoder_method(cfg.precision,
-                                               cfg.adaptive_seqlen);
+  lite_cxx_config.set_xpu_l3_cache_method(cfg.xpu_l3_size, cfg.xpu_l3_locked);
+  lite_cxx_config.set_xpu_conv_autotune(cfg.xpu_conv_autotune,
+                                        cfg.xpu_conv_autotune_file);
+  lite_cxx_config.set_xpu_multi_encoder_method(
+      cfg.xpu_transformer_encoder_precision,
+      cfg.xpu_transformer_encoder_adaptive_seqlen);
   lite_cxx_config.set_xpu_dev_per_thread(cfg.device_id);
-  if (cfg.enable_multi_stream) {
+  if (cfg.xpu_enable_multi_stream) {
     lite_cxx_config.enable_xpu_multi_stream();
   }
 #endif
......
@@ -41,13 +41,13 @@ struct EngineConfig {
   int device_id = 0;

   // for xpu
-  size_t xpu_l3_workspace_size;
-  bool locked = false;
-  bool autotune = true;
-  std::string autotune_file = "";
-  std::string precision = "int16";
-  bool adaptive_seqlen = false;
-  bool enable_multi_stream = false;
+  size_t xpu_l3_size{0};
+  bool xpu_l3_locked = false;
+  bool xpu_conv_autotune = true;
+  std::string xpu_conv_autotune_file = "";
+  std::string xpu_transformer_encoder_precision = "int16";
+  bool xpu_transformer_encoder_adaptive_seqlen = false;
+  bool xpu_enable_multi_stream = false;

   // for x86 or arm
   int cpu_math_library_num_threads{1};
......
@@ -21,6 +21,7 @@
     *paddle::internal*;
     *paddle::get_version*;
     *paddle::LiteNNAdapterConfig*;
+    *paddle::XpuConfig*;
     *paddle::AnalysisConfig::*;
     *paddle::PaddlePredictor::*;
     *paddle::CreatePaddlePredictor*;
......
@@ -108,6 +108,7 @@ void BindPaddlePredictor(py::module *m);
 void BindNativeConfig(py::module *m);
 void BindNativePredictor(py::module *m);
 void BindLiteNNAdapterConfig(py::module *m);
+void BindXpuConfig(py::module *m);
 void BindAnalysisConfig(py::module *m);
 void BindAnalysisPredictor(py::module *m);
 void BindZeroCopyTensor(py::module *m);
@@ -476,6 +477,7 @@ void BindInferenceApi(py::module *m) {
   BindNativeConfig(m);
   BindNativePredictor(m);
   BindLiteNNAdapterConfig(m);
+  BindXpuConfig(m);
   BindAnalysisConfig(m);
   BindAnalysisPredictor(m);
   BindPaddleInferPredictor(m);
@@ -756,21 +758,21 @@ void BindAnalysisConfig(py::module *m) {
 #endif
       .def("enable_xpu",
            &AnalysisConfig::EnableXpu,
-           py::arg("l3_workspace_size") = 16 * 1024 * 1024,
-           py::arg("locked") = false,
-           py::arg("autotune") = true,
-           py::arg("autotune_file") = "",
-           py::arg("precision") = "int16",
-           py::arg("adaptive_seqlen") = false,
+           py::arg("l3_size") = 16 * 1024 * 1024,
+           py::arg("l3_locked") = false,
+           py::arg("conv_autotune") = true,
+           py::arg("conv_autotune_file") = "",
+           py::arg("transformer_encoder_precision") = "int16",
+           py::arg("transformer_encoder_adaptive_seqlen") = false,
            py::arg("enable_multi_stream") = false)
       .def("set_xpu_device_id",
            &AnalysisConfig::SetXpuDeviceId,
            py::arg("device_id") = 0)
-      .def(
-          "set_xpu_config",
-          &AnalysisConfig::SetXpuConfig,
-          py::arg("quant_post_dynamic_weight_bits") = -1,
-          py::arg("quant_post_dynamic_op_types") = std::vector<std::string>({}))
+      .def("set_xpu_config",
+           [](AnalysisConfig &self, const paddle_infer::XpuConfig &xpu_config) {
+             self.SetXpuConfig(xpu_config);
+           })
+      .def("xpu_config", &AnalysisConfig::xpu_config)
       .def("enable_custom_device",
            &AnalysisConfig::EnableCustomDevice,
            py::arg("device_type"),
@@ -1000,6 +1002,38 @@ void BindLiteNNAdapterConfig(py::module *m) {
       .def("disable", &LiteNNAdapterConfig::Disable);
 }

+void BindXpuConfig(py::module *m) {
+  py::class_<XpuConfig>(*m, "XpuConfig")
+      .def(py::init<>())
+      .def_readwrite("device_id", &XpuConfig::device_id)
+      .def_readwrite("l3_ptr", &XpuConfig::l3_ptr)
+      .def_readwrite("l3_size", &XpuConfig::l3_size)
+      .def_readwrite("l3_autotune_size", &XpuConfig::l3_autotune_size)
+      .def_readwrite("stream", &XpuConfig::stream)
+      .def_readwrite("conv_autotune_level", &XpuConfig::conv_autotune_level)
+      .def_readwrite("conv_autotune_file", &XpuConfig::conv_autotune_file)
+      .def_readwrite("conv_autotune_file_writeback",
+                     &XpuConfig::conv_autotune_file_writeback)
+      .def_readwrite("fc_autotune_level", &XpuConfig::fc_autotune_level)
+      .def_readwrite("fc_autotune_file", &XpuConfig::fc_autotune_file)
+      .def_readwrite("fc_autotune_file_writeback",
+                     &XpuConfig::fc_autotune_file_writeback)
+      .def_readwrite("gemm_compute_precision",
+                     &XpuConfig::gemm_compute_precision)
+      .def_readwrite("transformer_softmax_optimize_level",
+                     &XpuConfig::transformer_softmax_optimize_level)
+      .def_readwrite("transformer_encoder_adaptive_seqlen",
+                     &XpuConfig::transformer_encoder_adaptive_seqlen)
+      .def_readwrite("quant_post_static_gelu_out_threshold",
+                     &XpuConfig::quant_post_static_gelu_out_threshold)
+      .def_readwrite("quant_post_dynamic_activation_method",
+                     &XpuConfig::quant_post_dynamic_activation_method)
+      .def_readwrite("quant_post_dynamic_weight_precision",
+                     &XpuConfig::quant_post_dynamic_weight_precision)
+      .def_readwrite("quant_post_dynamic_op_types",
+                     &XpuConfig::quant_post_dynamic_op_types);
+}
+
 #ifdef PADDLE_WITH_MKLDNN
 void BindMkldnnQuantizerConfig(py::module *m) {
   py::class_<MkldnnQuantizerConfig> quantizer_config(*m,
......
@@ -30,6 +30,7 @@ from paddle.fluid.core import (
     get_trt_runtime_version,
     get_num_bytes_of_data_type,
     PredictorPool,
+    XpuConfig,
 )

 __all__ = [  # noqa
@@ -47,4 +48,5 @@ __all__ = [  # noqa
     'get_trt_runtime_version',
     'get_num_bytes_of_data_type',
     'PredictorPool',
+    'XpuConfig',
 ]
@@ -1476,6 +1476,15 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST)
   endif()

   if(WITH_XPU)
+    inference_analysis_test(
+      xpu_config_resnet50_test
+      SRCS
+      xpu_config_resnet50_test.cc
+      EXTRA_DEPS
+      paddle_inference_shared
+      python
+      ARGS
+      --infer_model=${RESNET50_MODEL_DIR})
     inference_analysis_test(
       xpu_runtime_config_resnet50_test
       SRCS
......
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <cmath>
#include "gflags/gflags.h"
#include "test/cpp/inference/api/tester_helper.h"
namespace paddle_infer {
static const std::vector<float> TRUTH_VALUES = {
127.779f, 738.165f, 1013.22f, -438.17f, 366.401f, 927.659f, 736.222f,
-633.684f, -329.927f, -430.155f, -633.062f, -146.548f, -1324.28f, -1349.36f,
-242.675f, 117.448f, -801.723f, -391.514f, -404.818f, 454.16f, 515.48f,
-133.031f, 69.293f, 590.096f, -1434.69f, -1070.89f, 307.074f, 400.525f,
-316.12f, -587.125f, -161.056f, 800.363f, -96.4708f, 748.706f, 868.174f,
-447.938f, 112.737f, 1127.2f, 47.4355f, 677.72f, 593.186f, -336.4f,
551.362f, 397.823f, 78.3979f, -715.398f, 405.969f, 404.256f, 246.019f,
-8.42969f, 131.365f, -648.051f};
void PrepareInput(std::shared_ptr<Predictor> predictor) {
const int batch = 1;
const int channel = 3;
const int height = 318;
const int width = 318;
const int input_num = batch * channel * height * width;
std::vector<float> input(input_num, 1);
auto input_names = predictor->GetInputNames();
auto input_t = predictor->GetInputHandle(input_names[0]);
input_t->Reshape({batch, channel, height, width});
input_t->CopyFromCpu(input.data());
}
void CompareOutput(std::shared_ptr<Predictor> predictor) {
auto output_names = predictor->GetOutputNames();
auto output_t = predictor->GetOutputHandle(output_names[0]);
std::vector<int> output_shape = output_t->shape();
size_t out_num = std::accumulate(
output_shape.begin(), output_shape.end(), 1, std::multiplies<int>());
std::vector<float> out_data;
out_data.resize(out_num);
output_t->CopyToCpu(out_data.data());
float* data_o = out_data.data();
for (size_t j = 0; j < out_num; j += 10) {
EXPECT_NEAR(
(data_o[j] - TRUTH_VALUES[j / 10]) / TRUTH_VALUES[j / 10], 0., 10e-3);
}
}
TEST(xpu_config, inference) {
size_t l3_size = 10 * 1024 * 1024;
XpuConfig xpu_config;
xpu_config.l3_size = l3_size;
std::string model_dir = FLAGS_infer_model + "/" + "model";
Config config;
config.SetModel(model_dir + "/model", model_dir + "/params");
config.EnableXpu();
config.SetXpuConfig(xpu_config);
XpuConfig xpu_config_test = config.xpu_config();
CHECK_EQ(xpu_config_test.l3_size, l3_size);
auto predictor = CreatePredictor(config);
PrepareInput(predictor);
predictor->Run();
CompareOutput(predictor);
}
TEST(xpu_config, lite) {
size_t l3_size = 10 * 1024 * 1024;
XpuConfig xpu_config;
xpu_config.l3_size = l3_size;
std::string model_dir = FLAGS_infer_model + "/" + "model";
Config config;
config.SetModel(model_dir + "/model", model_dir + "/params");
config.EnableXpu();
config.SetXpuConfig(xpu_config);
config.EnableLiteEngine();
XpuConfig xpu_config_test = config.xpu_config();
CHECK_EQ(xpu_config_test.l3_size, l3_size);
auto predictor = CreatePredictor(config);
PrepareInput(predictor);
predictor->Run();
CompareOutput(predictor);
}
} // namespace paddle_infer
@@ -63,7 +63,7 @@ void CompareOutput(std::shared_ptr<Predictor> predictor) {
   }
 }

-Config XpuConfig() {
+Config InferXpuConfig() {
   std::string model_dir = FLAGS_infer_model + "/" + "model";
   Config config;
   config.SetModel(model_dir + "/model", model_dir + "/params");
@@ -72,7 +72,7 @@ Config XpuConfig() {
 }

 TEST(resnet50_xpu, basic) {
-  Config config = XpuConfig();
+  Config config = InferXpuConfig();
   auto predictor = CreatePredictor(config);
   PrepareInput(predictor);
   predictor->Run();
@@ -80,7 +80,7 @@ TEST(resnet50_xpu, basic) {
 }

 #define RUN_WITH_RUNTIME_CONFIG(idx_, config_)                             \
-  Config config##idx_ = XpuConfig();                                       \
+  Config config##idx_ = InferXpuConfig();                                  \
   auto predictor##idx_ = CreatePredictor(config##idx_);                    \
   PrepareInput(predictor##idx_);                                           \
   experimental::InternalUtils::RunWithRuntimeConfig(predictor##idx_.get(), \
......