From 08c90086c2691a7bfd787aa57ba2c7c6b37624d1 Mon Sep 17 00:00:00 2001
From: zhupengyang <zhupengyang@baidu.com>
Date: Fri, 16 Jun 2023 10:30:11 +0800
Subject: [PATCH] lite xpu api & clone (#54670)

---
 cmake/external/lite.cmake                      |  2 +-
 paddle/fluid/framework/naive_executor.cc       | 36 +++++++++++++
 paddle/fluid/framework/naive_executor.h        |  2 +
 .../analysis/ir_passes/lite_subgraph_pass.cc   | 52 +++++++++----------
 .../fluid/inference/api/analysis_predictor.cc  |  8 +++
 paddle/fluid/inference/lite/engine.cc          | 36 ++++++++++---
 paddle/fluid/inference/lite/engine.h           | 25 ++++++---
 paddle/fluid/operators/lite/lite_engine_op.h   |  4 ++
 8 files changed, 123 insertions(+), 42 deletions(-)

diff --git a/cmake/external/lite.cmake b/cmake/external/lite.cmake
index ae97b3f061b..515952eae88 100644
--- a/cmake/external/lite.cmake
+++ b/cmake/external/lite.cmake
@@ -73,7 +73,7 @@ if(NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
   )
 
   if(NOT LITE_GIT_TAG)
-    set(LITE_GIT_TAG 81ef66554099800c143a0feff6e0a491b3b0d12e)
+    set(LITE_GIT_TAG d06a1f36ec564fb618d555b342ca1076623d8b94)
   endif()
 
   if(NOT CUDA_ARCH_NAME)
diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index 28cabf54ee4..2f886a7eb27 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -31,6 +31,9 @@
 #ifdef PADDLE_WITH_INFERENCE_NVTX
 #include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h"
 #endif
+#ifdef PADDLE_WITH_LITE
+#include "paddle/fluid/operators/lite/lite_engine_op.h"
+#endif
 
 namespace paddle {
 namespace framework {
@@ -271,5 +274,38 @@ void NaiveExecutor::ResetTrtOps(int num) {
 #endif
 }
 
+void NaiveExecutor::CloneLiteEnigne(int num, void *stream) {
+#ifdef PADDLE_WITH_LITE
+  for (auto &op : ops_) {
+    if (op->Type() == "lite_engine") {
+      operators::LiteEngineOp *lite_op =
+          dynamic_cast<operators::LiteEngineOp *>(op.get());
+      PADDLE_ENFORCE_NOT_NULL(
+          lite_op,
+          phi::errors::InvalidArgument(
+              "lite_op(type: lite_engine) should be created."));
+      std::string engine_key = lite_op->Attr<std::string>("engine_key");
+      std::string new_engine_key = engine_key + "_" + std::to_string(num);
+      PADDLE_ENFORCE(
+          paddle::inference::Singleton<inference::lite::EngineManager>::Global()
+              .Has(engine_key),
+          phi::errors::InvalidArgument(
+              "lite_engine(key: %s) should be created.", engine_key));
+      auto *lite_engine =
+          paddle::inference::Singleton<inference::lite::EngineManager>::Global()
+              .Get(engine_key);
+      auto new_lite_engine = lite_engine->Clone();
+#ifdef LITE_SUBGRAPH_WITH_XPU
+      new_lite_engine->SetStream(TARGET(kXPU), stream);
+#endif
+      paddle::inference::Singleton<inference::lite::EngineManager>::Global()
+          .Set(new_engine_key, new_lite_engine);
+      lite_op->SetAttr("engine_key", new_engine_key);
+      lite_op->SetEngine(new_lite_engine.get());
+    }
+  }
+#endif
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h
index 8361d79fd18..0c0eb493835 100644
--- a/paddle/fluid/framework/naive_executor.h
+++ b/paddle/fluid/framework/naive_executor.h
@@ -73,6 +73,8 @@ class NaiveExecutor {
 
   void ResetTrtOps(int num);
 
+  void CloneLiteEnigne(int num, void* stream);
+
   void RegisterOutputHook(const HookFunc& hookfunc);
 
  private:
diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
index 70130f67056..ad95fe3091c 100644
--- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
@@ -252,24 +252,7 @@ void LiteSubgraphPass::SetUpEngine(
   bool use_opencl = Get<bool>("use_opencl");
   int cpu_math_library_num_threads = Get<int>("cpu_math_library_num_threads");
   bool use_xpu = Get<bool>("use_xpu");
-  int xpu_device_id = Get<int>("xpu_device_id");
-  size_t xpu_l3_size = Get<size_t>("xpu_l3_size");
-  bool xpu_l3_locked = Get<bool>("xpu_l3_locked");
-  bool xpu_conv_autotune = Get<int>("xpu_conv_autotune_level") > 0;
-  std::string xpu_conv_autotune_file =
-      Get<std::string>("xpu_conv_autotune_file");
-  int xpu_gemm_compute_precision = Get<int>("xpu_gemm_compute_precision");
-  std::string xpu_transformer_encoder_precision{"int16"};
-  if (xpu_gemm_compute_precision == 0) {
-    xpu_transformer_encoder_precision = "int8";
-  } else if (xpu_gemm_compute_precision == 1) {
-    xpu_transformer_encoder_precision = "int16";
-  } else if (xpu_gemm_compute_precision == 2) {
-    xpu_transformer_encoder_precision = "int31";
-  }
-  bool xpu_transformer_encoder_adaptive_seqlen =
-      Get<bool>("xpu_transformer_encoder_adaptive_seqlen");
-  bool xpu_enable_multi_stream = Get<bool>("xpu_enable_multi_stream");
+
   // NNAdapter Related
   bool use_nnadapter = Get<bool>("use_nnadapter");
   std::string nnadapter_model_cache_dir =
@@ -354,15 +337,32 @@ void LiteSubgraphPass::SetUpEngine(
   }
 
   config.cpu_math_library_num_threads = cpu_math_library_num_threads;
-  config.xpu_l3_size = xpu_l3_size;
-  config.device_id = xpu_device_id;
-  config.xpu_l3_locked = xpu_l3_locked;
-  config.xpu_conv_autotune = xpu_conv_autotune;
-  config.xpu_conv_autotune_file = xpu_conv_autotune_file;
-  config.xpu_transformer_encoder_precision = xpu_transformer_encoder_precision;
-  config.xpu_transformer_encoder_adaptive_seqlen =
-      xpu_transformer_encoder_adaptive_seqlen;
-  config.xpu_enable_multi_stream = xpu_enable_multi_stream;
+
+  // xpu related
+  config.xpu_device_id = Get<int>("xpu_device_id");
+  config.xpu_l3_size = Get<size_t>("xpu_l3_size");
+  config.xpu_l3_ptr = Get<void *>("xpu_l3_ptr");
+  config.xpu_l3_autotune_size = Get<size_t>("xpu_l3_autotune_size");
+  config.xpu_stream = Get<void *>("xpu_stream");
+  config.xpu_conv_autotune_level = Get<int>("xpu_conv_autotune_level");
+  config.xpu_conv_autotune_file = Get<std::string>("xpu_conv_autotune_file");
+  config.xpu_conv_autotune_file_writeback =
+      Get<bool>("xpu_conv_autotune_file_writeback");
+  config.xpu_fc_autotune_level = Get<int>("xpu_fc_autotune_level");
+  config.xpu_fc_autotune_file = Get<std::string>("xpu_fc_autotune_file");
+  config.xpu_fc_autotune_file_writeback =
+      Get<bool>("xpu_fc_autotune_file_writeback");
+  config.xpu_gemm_compute_precision = Get<int>("xpu_gemm_compute_precision");
+  config.xpu_transformer_softmax_optimize_level =
+      Get<int>("xpu_transformer_softmax_optimize_level");
+  config.xpu_transformer_encoder_adaptive_seqlen =
+      Get<bool>("xpu_transformer_encoder_adaptive_seqlen");
+  config.xpu_quant_post_static_gelu_out_threshold =
+      Get<float>("xpu_quant_post_static_gelu_out_threshold");
+  config.xpu_quant_post_dynamic_activation_method =
+      Get<int>("xpu_quant_post_dynamic_activation_method");
+  config.xpu_enable_multi_stream = Get<bool>("xpu_enable_multi_stream");
+
   // NNAdapter Related
   config.nnadapter_model_cache_dir = nnadapter_model_cache_dir;
   config.nnadapter_device_names = nnadapter_device_names;
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 373a9b5ffb3..221b358b1a7 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -2615,6 +2615,14 @@ std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone(void *stream) {
   x->Init(scope_, inference_program_);
 #ifdef PADDLE_WITH_TENSORRT
   x->executor_->ResetTrtOps(++AnalysisPredictor::clone_num_);
+#endif
+#ifdef PADDLE_WITH_LITE
+#ifdef LITE_SUBGRAPH_WITH_XPU
+  x->executor_->CloneLiteEnigne(++AnalysisPredictor::clone_num_,
+                                config_.xpu_config_.stream);
+#else
+  x->executor_->CloneLiteEnigne(++AnalysisPredictor::clone_num_, nullptr);
+#endif
 #endif
   return std::unique_ptr<PaddlePredictor>(x);
 }
diff --git a/paddle/fluid/inference/lite/engine.cc b/paddle/fluid/inference/lite/engine.cc
index d64888d2887..0895a9bff19 100644
--- a/paddle/fluid/inference/lite/engine.cc
+++ b/paddle/fluid/inference/lite/engine.cc
@@ -24,6 +24,8 @@
 
 #include <utility>
 
+#include "glog/logging.h"
+
 namespace paddle {
 namespace inference {
 namespace lite {
@@ -56,13 +58,28 @@ paddle::lite_api::PaddlePredictor* EngineManager::Create(
 #endif
 
 #ifdef LITE_SUBGRAPH_WITH_XPU
-  lite_cxx_config.set_xpu_l3_cache_method(cfg.xpu_l3_size, cfg.xpu_l3_locked);
-  lite_cxx_config.set_xpu_conv_autotune(cfg.xpu_conv_autotune,
-                                        cfg.xpu_conv_autotune_file);
-  lite_cxx_config.set_xpu_multi_encoder_method(
-      cfg.xpu_transformer_encoder_precision,
-      cfg.xpu_transformer_encoder_adaptive_seqlen);
-  lite_cxx_config.set_xpu_dev_per_thread(cfg.device_id);
+  paddle::lite_api::XpuConfig lite_xpu_config;
+  lite_xpu_config.device_id = cfg.xpu_device_id;
+  lite_xpu_config.l3_size = cfg.xpu_l3_size;
+  lite_xpu_config.l3_ptr = cfg.xpu_l3_ptr;
+  lite_xpu_config.l3_autotune_size = cfg.xpu_l3_size;
+  lite_xpu_config.conv_autotune_level = cfg.xpu_conv_autotune_level;
+  lite_xpu_config.conv_autotune_file = cfg.xpu_conv_autotune_file;
+  lite_xpu_config.conv_autotune_file_writeback =
+      cfg.xpu_conv_autotune_file_writeback;
+  lite_xpu_config.fc_autotune_level = cfg.xpu_fc_autotune_level;
+  lite_xpu_config.fc_autotune_file = cfg.xpu_fc_autotune_file;
+  lite_xpu_config.fc_autotune_file_writeback =
+      cfg.xpu_fc_autotune_file_writeback;
+  lite_xpu_config.gemm_compute_precision = cfg.xpu_gemm_compute_precision;
+  lite_xpu_config.transformer_softmax_optimize_level =
+      cfg.xpu_transformer_softmax_optimize_level;
+  lite_xpu_config.transformer_encoder_adaptive_seqlen =
+      cfg.xpu_transformer_encoder_adaptive_seqlen;
+  lite_xpu_config.quant_post_static_gelu_out_threshold =
+      cfg.xpu_quant_post_static_gelu_out_threshold;
+  lite_xpu_config.quant_post_dynamic_activation_method =
+      cfg.xpu_quant_post_dynamic_activation_method;
   if (cfg.xpu_enable_multi_stream) {
     lite_cxx_config.enable_xpu_multi_stream();
   }
@@ -102,6 +119,11 @@ paddle::lite_api::PaddlePredictor* EngineManager::Create(
   return engines_[name].get();
 }
 
+void EngineManager::Set(const std::string& name,
+                        std::shared_ptr<paddle::lite_api::PaddlePredictor> p) {
+  engines_[name] = p;
+}
+
 void EngineManager::DeleteAll() {
   for (auto& item : engines_) {
     item.second.reset();
diff --git a/paddle/fluid/inference/lite/engine.h b/paddle/fluid/inference/lite/engine.h
index 4dfe32b2bc1..aa5e2d72b12 100644
--- a/paddle/fluid/inference/lite/engine.h
+++ b/paddle/fluid/inference/lite/engine.h
@@ -36,17 +36,24 @@ struct EngineConfig {
   std::vector<std::string> neglected_passes;
   lite_api::LiteModelType model_type{lite_api::LiteModelType::kProtobuf};
   bool model_from_memory{true};
-  // TODO(wilber): now only works for xpu, lite gpu can support device_id or
-  // not?
-  int device_id = 0;
 
   // for xpu
+  int xpu_device_id{0};
   size_t xpu_l3_size{0};
-  bool xpu_l3_locked = false;
-  bool xpu_conv_autotune = true;
-  std::string xpu_conv_autotune_file = "";
-  std::string xpu_transformer_encoder_precision = "int16";
-  bool xpu_transformer_encoder_adaptive_seqlen = false;
+  void* xpu_l3_ptr{nullptr};
+  size_t xpu_l3_autotune_size{0};
+  void* xpu_stream{nullptr};
+  int xpu_conv_autotune_level{0};
+  std::string xpu_conv_autotune_file;
+  bool xpu_conv_autotune_file_writeback{false};
+  int xpu_fc_autotune_level{0};
+  std::string xpu_fc_autotune_file;
+  bool xpu_fc_autotune_file_writeback{false};
+  int xpu_gemm_compute_precision{1};
+  int xpu_transformer_softmax_optimize_level{0};
+  bool xpu_transformer_encoder_adaptive_seqlen{true};
+  float xpu_quant_post_static_gelu_out_threshold{10.f};
+  int xpu_quant_post_dynamic_activation_method{0};
   bool xpu_enable_multi_stream = false;
 
   // for x86 or arm
@@ -78,6 +85,8 @@ class EngineManager {
   paddle::lite_api::PaddlePredictor* Get(const std::string& name) const;
   paddle::lite_api::PaddlePredictor* Create(const std::string& name,
                                             const EngineConfig& cfg);
+  void Set(const std::string& name,
+           std::shared_ptr<paddle::lite_api::PaddlePredictor> p);
   void DeleteAll();
 
  private:
diff --git a/paddle/fluid/operators/lite/lite_engine_op.h b/paddle/fluid/operators/lite/lite_engine_op.h
index 14ee7bc238b..756fec24d98 100644
--- a/paddle/fluid/operators/lite/lite_engine_op.h
+++ b/paddle/fluid/operators/lite/lite_engine_op.h
@@ -63,6 +63,10 @@ class LiteEngineOp : public framework::OperatorBase {
     zero_copy_ = Attr<bool>("zero_copy");
   }
 
+  void SetEngine(paddle::lite_api::PaddlePredictor *engine) {
+    engine_ = engine;
+  }
+
  protected:
   void RunImpl(const framework::Scope &scope,
                const platform::Place &dev_place) const override {
-- 
GitLab