diff --git a/cmake/external/lite.cmake b/cmake/external/lite.cmake
index ae97b3f061bee7ba064eb76b32a829e348dc7dc0..515952eae88cd9dad980070f4b23c472dbf1defc 100644
--- a/cmake/external/lite.cmake
+++ b/cmake/external/lite.cmake
@@ -73,7 +73,7 @@ if(NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
   )
 
   if(NOT LITE_GIT_TAG)
-    set(LITE_GIT_TAG 81ef66554099800c143a0feff6e0a491b3b0d12e)
+    set(LITE_GIT_TAG d06a1f36ec564fb618d555b342ca1076623d8b94)
   endif()
 
   if(NOT CUDA_ARCH_NAME)
diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index 28cabf54ee4de0800436fe034aa09d22ea5a9636..2f886a7eb2797cad14ef1f04d10ba981577d493f 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -31,6 +31,9 @@
 #ifdef PADDLE_WITH_INFERENCE_NVTX
 #include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h"
 #endif
+#ifdef PADDLE_WITH_LITE
+#include "paddle/fluid/operators/lite/lite_engine_op.h"
+#endif
 
 namespace paddle {
 namespace framework {
@@ -271,5 +274,38 @@ void NaiveExecutor::ResetTrtOps(int num) {
 #endif
 }
 
+void NaiveExecutor::CloneLiteEngine(int num, void *stream) {
+#ifdef PADDLE_WITH_LITE
+  for (auto &op : ops_) {
+    if (op->Type() == "lite_engine") {
+      operators::LiteEngineOp *lite_op =
+          dynamic_cast<operators::LiteEngineOp *>(op.get());
+      PADDLE_ENFORCE_NOT_NULL(
+          lite_op,
+          phi::errors::InvalidArgument(
+              "lite_op(type: lite_engine) should be created."));
+      std::string engine_key = lite_op->Attr<std::string>("engine_key");
+      std::string new_engine_key = engine_key + "_" + std::to_string(num);
+      PADDLE_ENFORCE(
+          paddle::inference::Singleton<inference::lite::EngineManager>::Global()
+              .Has(engine_key),
+          phi::errors::InvalidArgument(
+              "lite_engine(key: %s) should be created.", engine_key));
+      auto *lite_engine =
+          paddle::inference::Singleton<inference::lite::EngineManager>::Global()
+              .Get(engine_key);
+      auto new_lite_engine = lite_engine->Clone();
+#ifdef LITE_SUBGRAPH_WITH_XPU
+      new_lite_engine->SetStream(TARGET(kXPU), stream);
+#endif
+      paddle::inference::Singleton<inference::lite::EngineManager>::Global()
+          .Set(new_engine_key, new_lite_engine);
+      lite_op->SetAttr("engine_key", new_engine_key);
+      lite_op->SetEngine(new_lite_engine.get());
+    }
+  }
+#endif
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h
index 8361d79fd18f12b63e4e7416afaa86be138a358c..0c0eb4938358fbeacdaa77021473bc23c8017878 100644
--- a/paddle/fluid/framework/naive_executor.h
+++ b/paddle/fluid/framework/naive_executor.h
@@ -73,6 +73,8 @@ class NaiveExecutor {
 
   void ResetTrtOps(int num);
 
+  void CloneLiteEngine(int num, void* stream);
+
   void RegisterOutputHook(const HookFunc& hookfunc);
 
  private:
diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
index 70130f67056b19ba126149140aca85f9098f207b..ad95fe3091ce18df88fafdaad2aa90ff86aeebe3 100644
--- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
@@ -252,24 +252,7 @@ void LiteSubgraphPass::SetUpEngine(
   bool use_opencl = Get<bool>("use_opencl");
   int cpu_math_library_num_threads = Get<int>("cpu_math_library_num_threads");
   bool use_xpu = Get<bool>("use_xpu");
-  int xpu_device_id = Get<int>("xpu_device_id");
-  size_t xpu_l3_size = Get<size_t>("xpu_l3_size");
-  bool xpu_l3_locked = Get<bool>("xpu_l3_locked");
-  bool xpu_conv_autotune = Get<int>("xpu_conv_autotune_level") > 0;
-  std::string xpu_conv_autotune_file =
-      Get<std::string>("xpu_conv_autotune_file");
-  int xpu_gemm_compute_precision = Get<int>("xpu_gemm_compute_precision");
-  std::string xpu_transformer_encoder_precision{"int16"};
-  if (xpu_gemm_compute_precision == 0) {
-    xpu_transformer_encoder_precision = "int8";
-  } else if (xpu_gemm_compute_precision == 1) {
-    xpu_transformer_encoder_precision = "int16";
-  } else if (xpu_gemm_compute_precision == 2) {
-    xpu_transformer_encoder_precision = "int31";
-  }
-  bool xpu_transformer_encoder_adaptive_seqlen =
-      Get<bool>("xpu_transformer_encoder_adaptive_seqlen");
-  bool xpu_enable_multi_stream = Get<bool>("xpu_enable_multi_stream");
+
   // NNAdapter Related
   bool use_nnadapter = Get<bool>("use_nnadapter");
   std::string nnadapter_model_cache_dir =
@@ -354,15 +337,32 @@
   }
   config.cpu_math_library_num_threads = cpu_math_library_num_threads;
 
-  config.xpu_l3_size = xpu_l3_size;
-  config.device_id = xpu_device_id;
-  config.xpu_l3_locked = xpu_l3_locked;
-  config.xpu_conv_autotune = xpu_conv_autotune;
-  config.xpu_conv_autotune_file = xpu_conv_autotune_file;
-  config.xpu_transformer_encoder_precision = xpu_transformer_encoder_precision;
+
+  // xpu related
+  config.xpu_device_id = Get<int>("xpu_device_id");
+  config.xpu_l3_size = Get<size_t>("xpu_l3_size");
+  config.xpu_l3_ptr = Get<void *>("xpu_l3_ptr");
+  config.xpu_l3_autotune_size = Get<size_t>("xpu_l3_autotune_size");
+  config.xpu_stream = Get<void *>("xpu_stream");
+  config.xpu_conv_autotune_level = Get<int>("xpu_conv_autotune_level");
+  config.xpu_conv_autotune_file = Get<std::string>("xpu_conv_autotune_file");
+  config.xpu_conv_autotune_file_writeback =
+      Get<bool>("xpu_conv_autotune_file_writeback");
+  config.xpu_fc_autotune_level = Get<int>("xpu_fc_autotune_level");
+  config.xpu_fc_autotune_file = Get<std::string>("xpu_fc_autotune_file");
+  config.xpu_fc_autotune_file_writeback =
+      Get<bool>("xpu_fc_autotune_file_writeback");
+  config.xpu_gemm_compute_precision = Get<int>("xpu_gemm_compute_precision");
+  config.xpu_transformer_softmax_optimize_level =
+      Get<int>("xpu_transformer_softmax_optimize_level");
   config.xpu_transformer_encoder_adaptive_seqlen =
-      xpu_transformer_encoder_adaptive_seqlen;
-  config.xpu_enable_multi_stream = xpu_enable_multi_stream;
+      Get<bool>("xpu_transformer_encoder_adaptive_seqlen");
+  config.xpu_quant_post_static_gelu_out_threshold =
+      Get<float>("xpu_quant_post_static_gelu_out_threshold");
+  config.xpu_quant_post_dynamic_activation_method =
+      Get<int>("xpu_quant_post_dynamic_activation_method");
+  config.xpu_enable_multi_stream = Get<bool>("xpu_enable_multi_stream");
+
   // NNAdapter Related
   config.nnadapter_model_cache_dir = nnadapter_model_cache_dir;
   config.nnadapter_device_names = nnadapter_device_names;
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 373a9b5ffb3499a22524f197660f1e497748dfe1..221b358b1a770bf9b4ab25bd367716fb27f8e295 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -2615,6 +2615,14 @@ std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone(void *stream) {
   x->Init(scope_, inference_program_);
 #ifdef PADDLE_WITH_TENSORRT
   x->executor_->ResetTrtOps(++AnalysisPredictor::clone_num_);
+#endif
+#ifdef PADDLE_WITH_LITE
+#ifdef LITE_SUBGRAPH_WITH_XPU
+  x->executor_->CloneLiteEngine(++AnalysisPredictor::clone_num_,
+                                config_.xpu_config_.stream);
+#else
+  x->executor_->CloneLiteEngine(++AnalysisPredictor::clone_num_, nullptr);
+#endif
 #endif
   return std::unique_ptr<PaddlePredictor>(x);
 }
diff --git a/paddle/fluid/inference/lite/engine.cc b/paddle/fluid/inference/lite/engine.cc
index d64888d28873a009069d0c12a8a9d2dcca0bd253..0895a9bff1915d0787bbc17738a2d5225ef9d6d1 100644
--- a/paddle/fluid/inference/lite/engine.cc
+++ b/paddle/fluid/inference/lite/engine.cc
@@ -24,6 +24,8 @@
 
 #include <utility>
 
+#include "glog/logging.h"
+
 namespace paddle {
 namespace inference {
 namespace lite {
@@ -56,13 +58,29 @@ paddle::lite_api::PaddlePredictor* EngineManager::Create(
 #endif
 
 #ifdef LITE_SUBGRAPH_WITH_XPU
-  lite_cxx_config.set_xpu_l3_cache_method(cfg.xpu_l3_size, cfg.xpu_l3_locked);
-  lite_cxx_config.set_xpu_conv_autotune(cfg.xpu_conv_autotune,
-                                        cfg.xpu_conv_autotune_file);
-  lite_cxx_config.set_xpu_multi_encoder_method(
-      cfg.xpu_transformer_encoder_precision,
-      cfg.xpu_transformer_encoder_adaptive_seqlen);
-  lite_cxx_config.set_xpu_dev_per_thread(cfg.device_id);
+  paddle::lite_api::XpuConfig lite_xpu_config;
+  lite_xpu_config.device_id = cfg.xpu_device_id;
+  lite_xpu_config.l3_size = cfg.xpu_l3_size;
+  lite_xpu_config.l3_ptr = cfg.xpu_l3_ptr;
+  lite_xpu_config.l3_autotune_size = cfg.xpu_l3_autotune_size;
+  lite_xpu_config.conv_autotune_level = cfg.xpu_conv_autotune_level;
+  lite_xpu_config.conv_autotune_file = cfg.xpu_conv_autotune_file;
+  lite_xpu_config.conv_autotune_file_writeback =
+      cfg.xpu_conv_autotune_file_writeback;
+  lite_xpu_config.fc_autotune_level = cfg.xpu_fc_autotune_level;
+  lite_xpu_config.fc_autotune_file = cfg.xpu_fc_autotune_file;
+  lite_xpu_config.fc_autotune_file_writeback =
+      cfg.xpu_fc_autotune_file_writeback;
+  lite_xpu_config.gemm_compute_precision = cfg.xpu_gemm_compute_precision;
+  lite_xpu_config.transformer_softmax_optimize_level =
+      cfg.xpu_transformer_softmax_optimize_level;
+  lite_xpu_config.transformer_encoder_adaptive_seqlen =
+      cfg.xpu_transformer_encoder_adaptive_seqlen;
+  lite_xpu_config.quant_post_static_gelu_out_threshold =
+      cfg.xpu_quant_post_static_gelu_out_threshold;
+  lite_xpu_config.quant_post_dynamic_activation_method =
+      cfg.xpu_quant_post_dynamic_activation_method;
+  lite_cxx_config.set_xpu_config(lite_xpu_config);
   if (cfg.xpu_enable_multi_stream) {
     lite_cxx_config.enable_xpu_multi_stream();
   }
@@ -102,6 +119,11 @@ paddle::lite_api::PaddlePredictor* EngineManager::Create(
   return engines_[name].get();
 }
 
+void EngineManager::Set(const std::string& name,
+                        std::shared_ptr<paddle::lite_api::PaddlePredictor> p) {
+  engines_[name] = p;
+}
+
 void EngineManager::DeleteAll() {
   for (auto& item : engines_) {
     item.second.reset();
diff --git a/paddle/fluid/inference/lite/engine.h b/paddle/fluid/inference/lite/engine.h
index 4dfe32b2bc1893525dd7942e2b8bfd542806f95f..aa5e2d72b12fb0e53cc56b293bd71a0f820221e1 100644
--- a/paddle/fluid/inference/lite/engine.h
+++ b/paddle/fluid/inference/lite/engine.h
@@ -36,17 +36,24 @@ struct EngineConfig {
   std::vector<std::string> neglected_passes;
   lite_api::LiteModelType model_type{lite_api::LiteModelType::kProtobuf};
   bool model_from_memory{true};
 
-  // TODO(wilber): now only works for xpu, lite gpu can support device_id or
-  // not?
-  int device_id = 0;
   // for xpu
+  int xpu_device_id{0};
   size_t xpu_l3_size{0};
-  bool xpu_l3_locked = false;
-  bool xpu_conv_autotune = true;
-  std::string xpu_conv_autotune_file = "";
-  std::string xpu_transformer_encoder_precision = "int16";
-  bool xpu_transformer_encoder_adaptive_seqlen = false;
+  void* xpu_l3_ptr{nullptr};
+  size_t xpu_l3_autotune_size{0};
+  void* xpu_stream{nullptr};
+  int xpu_conv_autotune_level{0};
+  std::string xpu_conv_autotune_file;
+  bool xpu_conv_autotune_file_writeback{false};
+  int xpu_fc_autotune_level{0};
+  std::string xpu_fc_autotune_file;
+  bool xpu_fc_autotune_file_writeback{false};
+  int xpu_gemm_compute_precision{1};
+  int xpu_transformer_softmax_optimize_level{0};
+  bool xpu_transformer_encoder_adaptive_seqlen{true};
+  float xpu_quant_post_static_gelu_out_threshold{10.f};
+  int xpu_quant_post_dynamic_activation_method{0};
   bool xpu_enable_multi_stream = false;
 
   // for x86 or arm
@@ -78,6 +85,8 @@ class EngineManager {
   paddle::lite_api::PaddlePredictor* Get(const std::string& name) const;
   paddle::lite_api::PaddlePredictor* Create(const std::string& name,
                                             const EngineConfig& cfg);
+  void Set(const std::string& name,
+           std::shared_ptr<paddle::lite_api::PaddlePredictor> p);
   void DeleteAll();
 
  private:
diff --git a/paddle/fluid/operators/lite/lite_engine_op.h b/paddle/fluid/operators/lite/lite_engine_op.h
index 14ee7bc238bb72fe892f749c3b2dd846e9368e8f..756fec24d9874c3fdd4b4a75dd79c4ab2d712f6c 100644
--- a/paddle/fluid/operators/lite/lite_engine_op.h
+++ b/paddle/fluid/operators/lite/lite_engine_op.h
@@ -63,6 +63,10 @@ class LiteEngineOp : public framework::OperatorBase {
     zero_copy_ = Attr<bool>("zero_copy");
   }
 
+  void SetEngine(paddle::lite_api::PaddlePredictor *engine) {
+    engine_ = engine;
+  }
+
  protected:
   void RunImpl(const framework::Scope &scope,
                const platform::Place &dev_place) const override {
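
Reviewer note: below is a minimal, standalone sketch of the cloning scheme that NaiveExecutor::CloneLiteEngine implements above. FakePredictor, FakeEngineManager, and CloneEngine are illustrative stand-ins, not Paddle APIs; what it models is the registry contract: clone number num re-registers an independent engine under the key "<engine_key>_<num>".

#include <cassert>
#include <map>
#include <memory>
#include <string>

// Stand-in for paddle::lite_api::PaddlePredictor (hypothetical, simplified).
struct FakePredictor {
  std::shared_ptr<FakePredictor> Clone() {
    return std::make_shared<FakePredictor>();
  }
};

// Stand-in for inference::lite::EngineManager: a name -> engine registry.
class FakeEngineManager {
 public:
  bool Has(const std::string& name) const { return engines_.count(name) > 0; }
  FakePredictor* Get(const std::string& name) const {
    return engines_.at(name).get();
  }
  void Set(const std::string& name, std::shared_ptr<FakePredictor> p) {
    engines_[name] = p;
  }

 private:
  std::map<std::string, std::shared_ptr<FakePredictor>> engines_;
};

// Mirrors the patch's logic: look up the engine by key, clone it, and
// register the clone under "<key>_<num>" so each cloned predictor runs
// its own engine instance.
std::string CloneEngine(FakeEngineManager* mgr,
                        const std::string& engine_key, int num) {
  assert(mgr->Has(engine_key));
  auto clone = mgr->Get(engine_key)->Clone();
  std::string new_key = engine_key + "_" + std::to_string(num);
  mgr->Set(new_key, clone);
  return new_key;
}

int main() {
  FakeEngineManager mgr;
  mgr.Set("subgraph_0", std::make_shared<FakePredictor>());
  std::string k1 = CloneEngine(&mgr, "subgraph_0", 1);  // "subgraph_0_1"
  assert(mgr.Has(k1) && mgr.Get(k1) != mgr.Get("subgraph_0"));
  return 0;
}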
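
From the caller's side the change surfaces only through predictor cloning. A hedged usage sketch, assuming the public paddle_infer API of recent releases (the include path, "model_dir", and the stream parameter of Clone are assumptions that vary by version and install):

#include "paddle_inference_api.h"  // header path may differ per install

int main() {
  paddle_infer::Config config;
  config.SetModel("model_dir");  // assumed model location
  config.EnableXpu();            // run on XPU
  config.EnableLiteEngine();     // offload subgraphs to the Lite engine
  auto predictor = paddle_infer::CreatePredictor(config);

  // With this patch, each clone re-registers its Lite engine under
  // "<engine_key>_<clone_num>" and, on XPU builds, binds it to the
  // clone's stream instead of sharing one engine across clones.
  auto clone = predictor->Clone(/*stream=*/nullptr);
  return 0;
}

Previously all clones shared the lite_engine registered under the original engine_key; with this patch each clone owns its engine and, when built with LITE_SUBGRAPH_WITH_XPU, has it bound to the clone's XPU stream.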