From 08c90086c2691a7bfd787aa57ba2c7c6b37624d1 Mon Sep 17 00:00:00 2001
From: zhupengyang <zhupengyang@baidu.com>
Date: Fri, 16 Jun 2023 10:30:11 +0800
Subject: [PATCH] lite xpu api & clone (#54670)

---
 cmake/external/lite.cmake                      |  2 +-
 paddle/fluid/framework/naive_executor.cc       | 36 +++++++++++++
 paddle/fluid/framework/naive_executor.h        |  2 +
 .../analysis/ir_passes/lite_subgraph_pass.cc   | 52 +++++++++----------
 .../fluid/inference/api/analysis_predictor.cc  |  8 +++
 paddle/fluid/inference/lite/engine.cc          | 36 ++++++++++---
 paddle/fluid/inference/lite/engine.h           | 25 ++++++---
 paddle/fluid/operators/lite/lite_engine_op.h   |  4 ++
 8 files changed, 123 insertions(+), 42 deletions(-)

diff --git a/cmake/external/lite.cmake b/cmake/external/lite.cmake
index ae97b3f061b..515952eae88 100644
--- a/cmake/external/lite.cmake
+++ b/cmake/external/lite.cmake
@@ -73,7 +73,7 @@ if(NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
   )
 
   if(NOT LITE_GIT_TAG)
-    set(LITE_GIT_TAG 81ef66554099800c143a0feff6e0a491b3b0d12e)
+    set(LITE_GIT_TAG d06a1f36ec564fb618d555b342ca1076623d8b94)
   endif()
 
   if(NOT CUDA_ARCH_NAME)
diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index 28cabf54ee4..2f886a7eb27 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -31,6 +31,9 @@
 #ifdef PADDLE_WITH_INFERENCE_NVTX
 #include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h"
 #endif
+#ifdef PADDLE_WITH_LITE
+#include "paddle/fluid/operators/lite/lite_engine_op.h"
+#endif
 
 namespace paddle {
 namespace framework {
@@ -271,5 +274,38 @@ void NaiveExecutor::ResetTrtOps(int num) {
 #endif
 }
 
+void NaiveExecutor::CloneLiteEnigne(int num, void *stream) {
+#ifdef PADDLE_WITH_LITE
+  for (auto &op : ops_) {
+    if (op->Type() == "lite_engine") {
+      operators::LiteEngineOp *lite_op =
+          dynamic_cast<operators::LiteEngineOp *>(op.get());
+      PADDLE_ENFORCE_NOT_NULL(
+          lite_op,
+          phi::errors::InvalidArgument(
+              "lite_op(type: lite_engine) should be created."));
+      std::string engine_key = lite_op->Attr<std::string>("engine_key");
+      std::string new_engine_key = engine_key + "_" + std::to_string(num);
+      PADDLE_ENFORCE(
+          paddle::inference::Singleton<inference::lite::EngineManager>::Global()
+              .Has(engine_key),
+          phi::errors::InvalidArgument(
+              "lite_engine(key: %s) should be created.", engine_key));
+      auto *lite_engine =
+          paddle::inference::Singleton<inference::lite::EngineManager>::Global()
+              .Get(engine_key);
+      auto new_lite_engine = lite_engine->Clone();
+#ifdef LITE_SUBGRAPH_WITH_XPU
+      new_lite_engine->SetStream(TARGET(kXPU), stream);
+#endif
+      paddle::inference::Singleton<inference::lite::EngineManager>::Global()
+          .Set(new_engine_key, new_lite_engine);
+      lite_op->SetAttr("engine_key", new_engine_key);
+      lite_op->SetEngine(new_lite_engine.get());
+    }
+  }
+#endif
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h
index 8361d79fd18..0c0eb493835 100644
--- a/paddle/fluid/framework/naive_executor.h
+++ b/paddle/fluid/framework/naive_executor.h
@@ -73,6 +73,8 @@ class NaiveExecutor {
 
   void ResetTrtOps(int num);
 
+  void CloneLiteEnigne(int num, void* stream);
+
   void RegisterOutputHook(const HookFunc& hookfunc);
 
  private:
diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
index 70130f67056..ad95fe3091c 100644
--- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
@@ -252,24 +252,7 @@ void LiteSubgraphPass::SetUpEngine(
   bool use_opencl = Get<bool>("use_opencl");
   int cpu_math_library_num_threads = Get<int>("cpu_math_library_num_threads");
   bool use_xpu = Get<bool>("use_xpu");
-  int xpu_device_id = Get<int>("xpu_device_id");
-  size_t xpu_l3_size = Get<size_t>("xpu_l3_size");
-  bool xpu_l3_locked = Get<bool>("xpu_l3_locked");
-  bool xpu_conv_autotune = Get<int>("xpu_conv_autotune_level") > 0;
-  std::string xpu_conv_autotune_file =
-      Get<std::string>("xpu_conv_autotune_file");
-  int xpu_gemm_compute_precision = Get<int>("xpu_gemm_compute_precision");
-  std::string xpu_transformer_encoder_precision{"int16"};
-  if (xpu_gemm_compute_precision == 0) {
-    xpu_transformer_encoder_precision = "int8";
-  } else if (xpu_gemm_compute_precision == 1) {
-    xpu_transformer_encoder_precision = "int16";
-  } else if (xpu_gemm_compute_precision == 2) {
-    xpu_transformer_encoder_precision = "int31";
-  }
-  bool xpu_transformer_encoder_adaptive_seqlen =
-      Get<bool>("xpu_transformer_encoder_adaptive_seqlen");
-  bool xpu_enable_multi_stream = Get<bool>("xpu_enable_multi_stream");
+
   // NNAdapter Related
   bool use_nnadapter = Get<bool>("use_nnadapter");
   std::string nnadapter_model_cache_dir =
@@ -354,15 +337,32 @@ void LiteSubgraphPass::SetUpEngine(
   }
 
   config.cpu_math_library_num_threads = cpu_math_library_num_threads;
-  config.xpu_l3_size = xpu_l3_size;
-  config.device_id = xpu_device_id;
-  config.xpu_l3_locked = xpu_l3_locked;
-  config.xpu_conv_autotune = xpu_conv_autotune;
-  config.xpu_conv_autotune_file = xpu_conv_autotune_file;
-  config.xpu_transformer_encoder_precision = xpu_transformer_encoder_precision;
-  config.xpu_transformer_encoder_adaptive_seqlen =
-      xpu_transformer_encoder_adaptive_seqlen;
-  config.xpu_enable_multi_stream = xpu_enable_multi_stream;
+
+  // xpu related
+  config.xpu_device_id = Get<int>("xpu_device_id");
+  config.xpu_l3_size = Get<size_t>("xpu_l3_size");
+  config.xpu_l3_ptr = Get<void *>("xpu_l3_ptr");
+  config.xpu_l3_autotune_size = Get<size_t>("xpu_l3_autotune_size");
+  config.xpu_stream = Get<void *>("xpu_stream");
+  config.xpu_conv_autotune_level = Get<int>("xpu_conv_autotune_level");
+  config.xpu_conv_autotune_file = Get<std::string>("xpu_conv_autotune_file");
+  config.xpu_conv_autotune_file_writeback =
+      Get<bool>("xpu_conv_autotune_file_writeback");
+  config.xpu_fc_autotune_level = Get<int>("xpu_fc_autotune_level");
+  config.xpu_fc_autotune_file = Get<std::string>("xpu_fc_autotune_file");
+  config.xpu_fc_autotune_file_writeback =
+      Get<bool>("xpu_fc_autotune_file_writeback");
+  config.xpu_gemm_compute_precision = Get<int>("xpu_gemm_compute_precision");
+  config.xpu_transformer_softmax_optimize_level =
+      Get<int>("xpu_transformer_softmax_optimize_level");
+  config.xpu_transformer_encoder_adaptive_seqlen =
+      Get<bool>("xpu_transformer_encoder_adaptive_seqlen");
+  config.xpu_quant_post_static_gelu_out_threshold =
+      Get<float>("xpu_quant_post_static_gelu_out_threshold");
+  config.xpu_quant_post_dynamic_activation_method =
+      Get<int>("xpu_quant_post_dynamic_activation_method");
+  config.xpu_enable_multi_stream = Get<bool>("xpu_enable_multi_stream");
+
   // NNAdapter Related
   config.nnadapter_model_cache_dir = nnadapter_model_cache_dir;
   config.nnadapter_device_names = nnadapter_device_names;
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 373a9b5ffb3..221b358b1a7 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -2615,6 +2615,14 @@ std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone(void *stream) {
   x->Init(scope_, inference_program_);
 #ifdef PADDLE_WITH_TENSORRT
   x->executor_->ResetTrtOps(++AnalysisPredictor::clone_num_);
+#endif
+#ifdef PADDLE_WITH_LITE
+#ifdef LITE_SUBGRAPH_WITH_XPU
+  x->executor_->CloneLiteEnigne(++AnalysisPredictor::clone_num_,
+                                config_.xpu_config_.stream);
+#else
+  x->executor_->CloneLiteEnigne(++AnalysisPredictor::clone_num_, nullptr);
+#endif
 #endif
   return std::unique_ptr<PaddlePredictor>(x);
 }
diff --git a/paddle/fluid/inference/lite/engine.cc b/paddle/fluid/inference/lite/engine.cc
index d64888d2887..0895a9bff19 100644
--- a/paddle/fluid/inference/lite/engine.cc
+++ b/paddle/fluid/inference/lite/engine.cc
@@ -24,6 +24,8 @@
 
 #include <utility>
 
+#include "glog/logging.h"
+
 namespace paddle {
 namespace inference {
 namespace lite {
@@ -56,13 +58,28 @@ paddle::lite_api::PaddlePredictor* EngineManager::Create(
 #endif
 
 #ifdef LITE_SUBGRAPH_WITH_XPU
-  lite_cxx_config.set_xpu_l3_cache_method(cfg.xpu_l3_size, cfg.xpu_l3_locked);
-  lite_cxx_config.set_xpu_conv_autotune(cfg.xpu_conv_autotune,
-                                        cfg.xpu_conv_autotune_file);
-  lite_cxx_config.set_xpu_multi_encoder_method(
-      cfg.xpu_transformer_encoder_precision,
-      cfg.xpu_transformer_encoder_adaptive_seqlen);
-  lite_cxx_config.set_xpu_dev_per_thread(cfg.device_id);
+  paddle::lite_api::XpuConfig lite_xpu_config;
+  lite_xpu_config.device_id = cfg.xpu_device_id;
+  lite_xpu_config.l3_size = cfg.xpu_l3_size;
+  lite_xpu_config.l3_ptr = cfg.xpu_l3_ptr;
+  lite_xpu_config.l3_autotune_size = cfg.xpu_l3_size;
+  lite_xpu_config.conv_autotune_level = cfg.xpu_conv_autotune_level;
+  lite_xpu_config.conv_autotune_file = cfg.xpu_conv_autotune_file;
+  lite_xpu_config.conv_autotune_file_writeback =
+      cfg.xpu_conv_autotune_file_writeback;
+  lite_xpu_config.fc_autotune_level = cfg.xpu_fc_autotune_level;
+  lite_xpu_config.fc_autotune_file = cfg.xpu_fc_autotune_file;
+  lite_xpu_config.fc_autotune_file_writeback =
+      cfg.xpu_fc_autotune_file_writeback;
+  lite_xpu_config.gemm_compute_precision = cfg.xpu_gemm_compute_precision;
+  lite_xpu_config.transformer_softmax_optimize_level =
+      cfg.xpu_transformer_softmax_optimize_level;
+  lite_xpu_config.transformer_encoder_adaptive_seqlen =
+      cfg.xpu_transformer_encoder_adaptive_seqlen;
+  lite_xpu_config.quant_post_static_gelu_out_threshold =
+      cfg.xpu_quant_post_static_gelu_out_threshold;
+  lite_xpu_config.quant_post_dynamic_activation_method =
+      cfg.xpu_quant_post_dynamic_activation_method;
   if (cfg.xpu_enable_multi_stream) {
     lite_cxx_config.enable_xpu_multi_stream();
   }
@@ -102,6 +119,11 @@ paddle::lite_api::PaddlePredictor* EngineManager::Create(
   return engines_[name].get();
 }
 
+void EngineManager::Set(const std::string& name,
+                        std::shared_ptr<paddle::lite_api::PaddlePredictor> p) {
+  engines_[name] = p;
+}
+
 void EngineManager::DeleteAll() {
   for (auto& item : engines_) {
     item.second.reset();
diff --git a/paddle/fluid/inference/lite/engine.h b/paddle/fluid/inference/lite/engine.h
index 4dfe32b2bc1..aa5e2d72b12 100644
--- a/paddle/fluid/inference/lite/engine.h
+++ b/paddle/fluid/inference/lite/engine.h
@@ -36,17 +36,24 @@ struct EngineConfig {
   std::vector<std::string> neglected_passes;
   lite_api::LiteModelType model_type{lite_api::LiteModelType::kProtobuf};
   bool model_from_memory{true};
-  // TODO(wilber): now only works for xpu, lite gpu can support device_id or
-  // not?
-  int device_id = 0;
 
   // for xpu
+  int xpu_device_id{0};
   size_t xpu_l3_size{0};
-  bool xpu_l3_locked = false;
-  bool xpu_conv_autotune = true;
-  std::string xpu_conv_autotune_file = "";
-  std::string xpu_transformer_encoder_precision = "int16";
-  bool xpu_transformer_encoder_adaptive_seqlen = false;
+  void* xpu_l3_ptr{nullptr};
+  size_t xpu_l3_autotune_size{0};
+  void* xpu_stream{nullptr};
+  int xpu_conv_autotune_level{0};
+  std::string xpu_conv_autotune_file;
+  bool xpu_conv_autotune_file_writeback{false};
+  int xpu_fc_autotune_level{0};
+  std::string xpu_fc_autotune_file;
+  bool xpu_fc_autotune_file_writeback{false};
+  int xpu_gemm_compute_precision{1};
+  int xpu_transformer_softmax_optimize_level{0};
+  bool xpu_transformer_encoder_adaptive_seqlen{true};
+  float xpu_quant_post_static_gelu_out_threshold{10.f};
+  int xpu_quant_post_dynamic_activation_method{0};
   bool xpu_enable_multi_stream = false;
 
   // for x86 or arm
@@ -78,6 +85,8 @@ class EngineManager {
   paddle::lite_api::PaddlePredictor* Get(const std::string& name) const;
   paddle::lite_api::PaddlePredictor* Create(const std::string& name,
                                             const EngineConfig& cfg);
+  void Set(const std::string& name,
+           std::shared_ptr<paddle::lite_api::PaddlePredictor> p);
   void DeleteAll();
 
  private:
diff --git a/paddle/fluid/operators/lite/lite_engine_op.h b/paddle/fluid/operators/lite/lite_engine_op.h
index 14ee7bc238b..756fec24d98 100644
--- a/paddle/fluid/operators/lite/lite_engine_op.h
+++ b/paddle/fluid/operators/lite/lite_engine_op.h
@@ -63,6 +63,10 @@ class LiteEngineOp : public framework::OperatorBase {
     zero_copy_ = Attr<bool>("zero_copy");
   }
 
+  void SetEngine(paddle::lite_api::PaddlePredictor *engine) {
+    engine_ = engine;
+  }
+
  protected:
   void RunImpl(const framework::Scope &scope,
                const platform::Place &dev_place) const override {
-- 
GitLab