未验证 提交 08c90086 编写于 作者: Z zhupengyang 提交者: GitHub

lite xpu api & clone (#54670)

上级 1a941b71
......@@ -73,7 +73,7 @@ if(NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
)
if(NOT LITE_GIT_TAG)
set(LITE_GIT_TAG 81ef66554099800c143a0feff6e0a491b3b0d12e)
set(LITE_GIT_TAG d06a1f36ec564fb618d555b342ca1076623d8b94)
endif()
if(NOT CUDA_ARCH_NAME)
......
......@@ -31,6 +31,9 @@
#ifdef PADDLE_WITH_INFERENCE_NVTX
#include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h"
#endif
#ifdef PADDLE_WITH_LITE
#include "paddle/fluid/operators/lite/lite_engine_op.h"
#endif
namespace paddle {
namespace framework {
......@@ -271,5 +274,38 @@ void NaiveExecutor::ResetTrtOps(int num) {
#endif
}
// Clones the underlying Lite predictor of every `lite_engine` op so that a
// cloned predictor (see AnalysisPredictor::Clone) does not share mutable
// engine state with the original.
//
// @param num     Clone ordinal; appended to the engine key so each clone is
//                registered under a unique key in the EngineManager.
// @param stream  XPU stream bound to the cloned engine; only used when built
//                with LITE_SUBGRAPH_WITH_XPU, otherwise ignored.
//
// TODO(review): the method name misspells "Engine" as "Enigne"; it matches
// the declaration, so keep both in sync if it is ever renamed.
void NaiveExecutor::CloneLiteEnigne(int num, void *stream) {
#ifdef PADDLE_WITH_LITE
  auto &engine_manager =
      paddle::inference::Singleton<inference::lite::EngineManager>::Global();
  for (auto &op : ops_) {
    if (op->Type() != "lite_engine") {
      continue;
    }
    auto *lite_op = dynamic_cast<operators::LiteEngineOp *>(op.get());
    PADDLE_ENFORCE_NOT_NULL(
        lite_op,
        phi::errors::InvalidArgument(
            "lite_op(type: lite_engine) should be created."));
    const std::string engine_key = lite_op->Attr<std::string>("engine_key");
    const std::string new_engine_key = engine_key + "_" + std::to_string(num);
    // The source engine must already be registered before it can be cloned.
    PADDLE_ENFORCE_EQ(
        engine_manager.Has(engine_key),
        true,
        phi::errors::InvalidArgument(
            "lite_engine(key: %s) should be created.", engine_key));
    auto *lite_engine = engine_manager.Get(engine_key);
    auto new_lite_engine = lite_engine->Clone();
#ifdef LITE_SUBGRAPH_WITH_XPU
    // Bind the clone to its own stream so clones can run concurrently.
    new_lite_engine->SetStream(TARGET(kXPU), stream);
#endif
    // Register the clone under its unique key and repoint this op at it: the
    // EngineManager keeps the shared_ptr alive, the op stores a raw pointer.
    engine_manager.Set(new_engine_key, new_lite_engine);
    lite_op->SetAttr("engine_key", new_engine_key);
    lite_op->SetEngine(new_lite_engine.get());
  }
#endif
}
} // namespace framework
} // namespace paddle
......@@ -73,6 +73,8 @@ class NaiveExecutor {
void ResetTrtOps(int num);
void CloneLiteEnigne(int num, void* stream);
void RegisterOutputHook(const HookFunc& hookfunc);
private:
......
......@@ -252,24 +252,7 @@ void LiteSubgraphPass::SetUpEngine(
bool use_opencl = Get<bool>("use_opencl");
int cpu_math_library_num_threads = Get<int>("cpu_math_library_num_threads");
bool use_xpu = Get<bool>("use_xpu");
int xpu_device_id = Get<int>("xpu_device_id");
size_t xpu_l3_size = Get<size_t>("xpu_l3_size");
bool xpu_l3_locked = Get<bool>("xpu_l3_locked");
bool xpu_conv_autotune = Get<int>("xpu_conv_autotune_level") > 0;
std::string xpu_conv_autotune_file =
Get<std::string>("xpu_conv_autotune_file");
int xpu_gemm_compute_precision = Get<int>("xpu_gemm_compute_precision");
std::string xpu_transformer_encoder_precision{"int16"};
if (xpu_gemm_compute_precision == 0) {
xpu_transformer_encoder_precision = "int8";
} else if (xpu_gemm_compute_precision == 1) {
xpu_transformer_encoder_precision = "int16";
} else if (xpu_gemm_compute_precision == 2) {
xpu_transformer_encoder_precision = "int31";
}
bool xpu_transformer_encoder_adaptive_seqlen =
Get<bool>("xpu_transformer_encoder_adaptive_seqlen");
bool xpu_enable_multi_stream = Get<bool>("xpu_enable_multi_stream");
// NNAdapter Related
bool use_nnadapter = Get<bool>("use_nnadapter");
std::string nnadapter_model_cache_dir =
......@@ -354,15 +337,32 @@ void LiteSubgraphPass::SetUpEngine(
}
config.cpu_math_library_num_threads = cpu_math_library_num_threads;
config.xpu_l3_size = xpu_l3_size;
config.device_id = xpu_device_id;
config.xpu_l3_locked = xpu_l3_locked;
config.xpu_conv_autotune = xpu_conv_autotune;
config.xpu_conv_autotune_file = xpu_conv_autotune_file;
config.xpu_transformer_encoder_precision = xpu_transformer_encoder_precision;
// xpu related
config.xpu_device_id = Get<int>("xpu_device_id");
config.xpu_l3_size = Get<size_t>("xpu_l3_size");
config.xpu_l3_ptr = Get<void*>("xpu_l3_ptr");
config.xpu_l3_autotune_size = Get<size_t>("xpu_l3_autotune_size");
config.xpu_stream = Get<void*>("xpu_stream");
config.xpu_conv_autotune_level = Get<int>("xpu_conv_autotune_level");
config.xpu_conv_autotune_file = Get<std::string>("xpu_conv_autotune_file");
config.xpu_conv_autotune_file_writeback =
Get<bool>("xpu_conv_autotune_file_writeback");
config.xpu_fc_autotune_level = Get<int>("xpu_fc_autotune_level");
config.xpu_fc_autotune_file = Get<std::string>("xpu_fc_autotune_file");
config.xpu_fc_autotune_file_writeback =
Get<bool>("xpu_fc_autotune_file_writeback");
config.xpu_gemm_compute_precision = Get<int>("xpu_gemm_compute_precision");
config.xpu_transformer_softmax_optimize_level =
Get<int>("xpu_transformer_softmax_optimize_level");
config.xpu_transformer_encoder_adaptive_seqlen =
xpu_transformer_encoder_adaptive_seqlen;
config.xpu_enable_multi_stream = xpu_enable_multi_stream;
Get<bool>("xpu_transformer_encoder_adaptive_seqlen");
config.xpu_quant_post_static_gelu_out_threshold =
Get<float>("xpu_quant_post_static_gelu_out_threshold");
config.xpu_quant_post_dynamic_activation_method =
Get<int>("xpu_quant_post_dynamic_activation_method");
config.xpu_enable_multi_stream = Get<bool>("xpu_enable_multi_stream");
// NNAdapter Related
config.nnadapter_model_cache_dir = nnadapter_model_cache_dir;
config.nnadapter_device_names = nnadapter_device_names;
......
......@@ -2615,6 +2615,14 @@ std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone(void *stream) {
x->Init(scope_, inference_program_);
#ifdef PADDLE_WITH_TENSORRT
x->executor_->ResetTrtOps(++AnalysisPredictor::clone_num_);
#endif
#ifdef PADDLE_WITH_LITE
#ifdef LITE_SUBGRAPH_WITH_XPU
x->executor_->CloneLiteEnigne(++AnalysisPredictor::clone_num_,
config_.xpu_config_.stream);
#else
x->executor_->CloneLiteEnigne(++AnalysisPredictor::clone_num_, nullptr);
#endif
#endif
return std::unique_ptr<PaddlePredictor>(x);
}
......
......@@ -24,6 +24,8 @@
#include <utility>
#include "glog/logging.h"
namespace paddle {
namespace inference {
namespace lite {
......@@ -56,13 +58,28 @@ paddle::lite_api::PaddlePredictor* EngineManager::Create(
#endif
#ifdef LITE_SUBGRAPH_WITH_XPU
lite_cxx_config.set_xpu_l3_cache_method(cfg.xpu_l3_size, cfg.xpu_l3_locked);
lite_cxx_config.set_xpu_conv_autotune(cfg.xpu_conv_autotune,
cfg.xpu_conv_autotune_file);
lite_cxx_config.set_xpu_multi_encoder_method(
cfg.xpu_transformer_encoder_precision,
cfg.xpu_transformer_encoder_adaptive_seqlen);
lite_cxx_config.set_xpu_dev_per_thread(cfg.device_id);
// Translate the inference-side EngineConfig XPU fields into Lite's XpuConfig.
paddle::lite_api::XpuConfig lite_xpu_config;
lite_xpu_config.device_id = cfg.xpu_device_id;
lite_xpu_config.l3_size = cfg.xpu_l3_size;
lite_xpu_config.l3_ptr = cfg.xpu_l3_ptr;
// BUGFIX: was `cfg.xpu_l3_size` (copy-paste); the L3 autotune budget has its
// own dedicated field in EngineConfig.
lite_xpu_config.l3_autotune_size = cfg.xpu_l3_autotune_size;
lite_xpu_config.conv_autotune_level = cfg.xpu_conv_autotune_level;
lite_xpu_config.conv_autotune_file = cfg.xpu_conv_autotune_file;
lite_xpu_config.conv_autotune_file_writeback =
    cfg.xpu_conv_autotune_file_writeback;
lite_xpu_config.fc_autotune_level = cfg.xpu_fc_autotune_level;
lite_xpu_config.fc_autotune_file = cfg.xpu_fc_autotune_file;
lite_xpu_config.fc_autotune_file_writeback =
    cfg.xpu_fc_autotune_file_writeback;
lite_xpu_config.gemm_compute_precision = cfg.xpu_gemm_compute_precision;
lite_xpu_config.transformer_softmax_optimize_level =
    cfg.xpu_transformer_softmax_optimize_level;
lite_xpu_config.transformer_encoder_adaptive_seqlen =
    cfg.xpu_transformer_encoder_adaptive_seqlen;
lite_xpu_config.quant_post_static_gelu_out_threshold =
    cfg.xpu_quant_post_static_gelu_out_threshold;
lite_xpu_config.quant_post_dynamic_activation_method =
    cfg.xpu_quant_post_dynamic_activation_method;
if (cfg.xpu_enable_multi_stream) {
  lite_cxx_config.enable_xpu_multi_stream();
}
......@@ -102,6 +119,11 @@ paddle::lite_api::PaddlePredictor* EngineManager::Create(
return engines_[name].get();
}
// Registers (or replaces) the predictor stored under `name`, sharing
// ownership with the caller. Used to publish cloned engines under new keys.
void EngineManager::Set(const std::string& name,
                        std::shared_ptr<paddle::lite_api::PaddlePredictor> p) {
  // `p` is a by-value sink parameter: move it into the map instead of
  // copying, avoiding a superfluous atomic refcount increment/decrement.
  engines_[name] = std::move(p);
}
void EngineManager::DeleteAll() {
for (auto& item : engines_) {
item.second.reset();
......
......@@ -36,17 +36,24 @@ struct EngineConfig {
std::vector<std::string> neglected_passes;
lite_api::LiteModelType model_type{lite_api::LiteModelType::kProtobuf};
bool model_from_memory{true};
// TODO(wilber): now only works for xpu, lite gpu can support device_id or
// not?
int device_id = 0;
// for xpu
int xpu_device_id{0};
size_t xpu_l3_size{0};
bool xpu_l3_locked = false;
bool xpu_conv_autotune = true;
std::string xpu_conv_autotune_file = "";
std::string xpu_transformer_encoder_precision = "int16";
bool xpu_transformer_encoder_adaptive_seqlen = false;
void* xpu_l3_ptr{nullptr};
size_t xpu_l3_autotune_size{0};
void* xpu_stream{nullptr};
int xpu_conv_autotune_level{0};
std::string xpu_conv_autotune_file;
bool xpu_conv_autotune_file_writeback{false};
int xpu_fc_autotune_level{0};
std::string xpu_fc_autotune_file;
bool xpu_fc_autotune_file_writeback{false};
int xpu_gemm_compute_precision{1};
int xpu_transformer_softmax_optimize_level{0};
bool xpu_transformer_encoder_adaptive_seqlen{true};
float xpu_quant_post_static_gelu_out_threshold{10.f};
int xpu_quant_post_dynamic_activation_method{0};
bool xpu_enable_multi_stream = false;
// for x86 or arm
......@@ -78,6 +85,8 @@ class EngineManager {
paddle::lite_api::PaddlePredictor* Get(const std::string& name) const;
paddle::lite_api::PaddlePredictor* Create(const std::string& name,
const EngineConfig& cfg);
void Set(const std::string& name,
std::shared_ptr<paddle::lite_api::PaddlePredictor> p);
void DeleteAll();
private:
......
......@@ -63,6 +63,10 @@ class LiteEngineOp : public framework::OperatorBase {
zero_copy_ = Attr<bool>("zero_copy");
}
// Rebinds this op to a different Lite predictor (e.g. after the engine is
// cloned). The pointer is stored as-is and is non-owning — presumably the
// EngineManager's shared_ptr keeps it alive; confirm at call sites.
void SetEngine(paddle::lite_api::PaddlePredictor *engine) {
engine_ = engine;
}
protected:
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册