未验证 提交 8a1124b1 编写于 作者: S shentanyue 提交者: GitHub

[Lite][XPU] Upgrade lite subgraph api of xpu (#47373)

上级 ad251cb5
...@@ -288,6 +288,7 @@ struct Argument { ...@@ -288,6 +288,7 @@ struct Argument {
DECL_ARGUMENT_FIELD(xpu_precision, XpuPrecision, std::string); DECL_ARGUMENT_FIELD(xpu_precision, XpuPrecision, std::string);
DECL_ARGUMENT_FIELD(xpu_adaptive_seqlen, XpuAdaptiveSeqlen, bool); DECL_ARGUMENT_FIELD(xpu_adaptive_seqlen, XpuAdaptiveSeqlen, bool);
DECL_ARGUMENT_FIELD(xpu_device_id, XpuDeviceId, int); DECL_ARGUMENT_FIELD(xpu_device_id, XpuDeviceId, int);
DECL_ARGUMENT_FIELD(xpu_enable_multi_stream, XpuEnableMultiStream, bool);
DECL_ARGUMENT_FIELD(use_nnadapter, UseNNAdapter, bool); DECL_ARGUMENT_FIELD(use_nnadapter, UseNNAdapter, bool);
DECL_ARGUMENT_FIELD(nnadapter_model_cache_dir, DECL_ARGUMENT_FIELD(nnadapter_model_cache_dir,
......
...@@ -263,6 +263,8 @@ void IRPassManager::CreatePasses(Argument *argument, ...@@ -263,6 +263,8 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set("precision", new std::string(argument->xpu_precision())); pass->Set("precision", new std::string(argument->xpu_precision()));
pass->Set("adaptive_seqlen", new bool(argument->xpu_adaptive_seqlen())); pass->Set("adaptive_seqlen", new bool(argument->xpu_adaptive_seqlen()));
pass->Set("xpu_device_id", new int(argument->xpu_device_id())); pass->Set("xpu_device_id", new int(argument->xpu_device_id()));
pass->Set("enable_multi_stream",
new bool(argument->xpu_enable_multi_stream()));
// NNAdapter Related // NNAdapter Related
pass->Set("use_nnadapter", new bool(argument->use_nnadapter())); pass->Set("use_nnadapter", new bool(argument->use_nnadapter()));
pass->Set("nnadapter_model_cache_dir", pass->Set("nnadapter_model_cache_dir",
......
...@@ -258,6 +258,7 @@ void LiteSubgraphPass::SetUpEngine( ...@@ -258,6 +258,7 @@ void LiteSubgraphPass::SetUpEngine(
std::string autotune_file = Get<std::string>("autotune_file"); std::string autotune_file = Get<std::string>("autotune_file");
std::string precision = Get<std::string>("precision"); std::string precision = Get<std::string>("precision");
bool adaptive_seqlen = Get<bool>("adaptive_seqlen"); bool adaptive_seqlen = Get<bool>("adaptive_seqlen");
bool enable_multi_stream = Get<bool>("enable_multi_stream");
// NNAdapter Related // NNAdapter Related
bool use_nnadapter = Get<bool>("use_nnadapter"); bool use_nnadapter = Get<bool>("use_nnadapter");
std::string nnadapter_model_cache_dir = std::string nnadapter_model_cache_dir =
...@@ -302,7 +303,6 @@ void LiteSubgraphPass::SetUpEngine( ...@@ -302,7 +303,6 @@ void LiteSubgraphPass::SetUpEngine(
// input tensor of the Lite engine is located, and then affects // input tensor of the Lite engine is located, and then affects
// whether tensor sharing is feasible. // whether tensor sharing is feasible.
paddle::lite_api::Place({target_type, precision_type}), paddle::lite_api::Place({target_type, precision_type}),
paddle::lite_api::Place({target_type, PRECISION(kInt64)}),
paddle::lite_api::Place({target_type, PRECISION(kFloat)}), paddle::lite_api::Place({target_type, PRECISION(kFloat)}),
#ifdef PADDLE_WITH_ARM #ifdef PADDLE_WITH_ARM
paddle::lite_api::Place({TARGET(kARM), precision_type}), paddle::lite_api::Place({TARGET(kARM), precision_type}),
...@@ -321,6 +321,7 @@ void LiteSubgraphPass::SetUpEngine( ...@@ -321,6 +321,7 @@ void LiteSubgraphPass::SetUpEngine(
config.autotune_file = autotune_file; config.autotune_file = autotune_file;
config.precision = precision; config.precision = precision;
config.adaptive_seqlen = adaptive_seqlen; config.adaptive_seqlen = adaptive_seqlen;
config.enable_multi_stream = enable_multi_stream;
// NNAdapter Related // NNAdapter Related
config.nnadapter_model_cache_dir = nnadapter_model_cache_dir; config.nnadapter_model_cache_dir = nnadapter_model_cache_dir;
config.nnadapter_device_names = nnadapter_device_names; config.nnadapter_device_names = nnadapter_device_names;
......
...@@ -137,7 +137,8 @@ void AnalysisConfig::EnableXpu(int l3_workspace_size, ...@@ -137,7 +137,8 @@ void AnalysisConfig::EnableXpu(int l3_workspace_size,
bool autotune, bool autotune,
const std::string &autotune_file, const std::string &autotune_file,
const std::string &precision, const std::string &precision,
bool adaptive_seqlen) { bool adaptive_seqlen,
bool enable_multi_stream) {
use_xpu_ = true; use_xpu_ = true;
xpu_l3_workspace_size_ = l3_workspace_size; xpu_l3_workspace_size_ = l3_workspace_size;
xpu_locked_ = locked; xpu_locked_ = locked;
...@@ -145,6 +146,7 @@ void AnalysisConfig::EnableXpu(int l3_workspace_size, ...@@ -145,6 +146,7 @@ void AnalysisConfig::EnableXpu(int l3_workspace_size,
xpu_autotune_file_ = autotune_file; xpu_autotune_file_ = autotune_file;
xpu_precision_ = precision; xpu_precision_ = precision;
xpu_adaptive_seqlen_ = adaptive_seqlen; xpu_adaptive_seqlen_ = adaptive_seqlen;
xpu_enable_multi_stream_ = enable_multi_stream;
Update(); Update();
} }
...@@ -439,6 +441,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { ...@@ -439,6 +441,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(xpu_autotune_file_); CP_MEMBER(xpu_autotune_file_);
CP_MEMBER(xpu_precision_); CP_MEMBER(xpu_precision_);
CP_MEMBER(xpu_adaptive_seqlen_); CP_MEMBER(xpu_adaptive_seqlen_);
CP_MEMBER(xpu_enable_multi_stream_);
// NPU related. // NPU related.
CP_MEMBER(use_npu_); CP_MEMBER(use_npu_);
...@@ -1020,6 +1023,7 @@ std::string AnalysisConfig::SerializeInfoCache() { ...@@ -1020,6 +1023,7 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << xpu_autotune_file_; ss << xpu_autotune_file_;
ss << xpu_precision_; ss << xpu_precision_;
ss << xpu_adaptive_seqlen_; ss << xpu_adaptive_seqlen_;
ss << xpu_enable_multi_stream_;
ss << use_npu_; ss << use_npu_;
ss << npu_device_id_; ss << npu_device_id_;
......
...@@ -1148,6 +1148,7 @@ void AnalysisPredictor::PrepareArgument() { ...@@ -1148,6 +1148,7 @@ void AnalysisPredictor::PrepareArgument() {
argument_.SetXpuPrecision(config_.xpu_precision_); argument_.SetXpuPrecision(config_.xpu_precision_);
argument_.SetXpuAdaptiveSeqlen(config_.xpu_adaptive_seqlen_); argument_.SetXpuAdaptiveSeqlen(config_.xpu_adaptive_seqlen_);
argument_.SetXpuDeviceId(config_.xpu_device_id_); argument_.SetXpuDeviceId(config_.xpu_device_id_);
argument_.SetXpuEnableMultiStream(config_.xpu_enable_multi_stream_);
// NNAdapter related // NNAdapter related
argument_.SetUseNNAdapter(config_.NNAdapter().use_nnadapter); argument_.SetUseNNAdapter(config_.NNAdapter().use_nnadapter);
argument_.SetNNAdapterDeviceNames( argument_.SetNNAdapterDeviceNames(
......
...@@ -274,13 +274,15 @@ struct PD_INFER_DECL AnalysisConfig { ...@@ -274,13 +274,15 @@ struct PD_INFER_DECL AnalysisConfig {
/// file will be used and autotune will not be performed again. /// file will be used and autotune will not be performed again.
/// \param precision Calculation accuracy of multi_encoder /// \param precision Calculation accuracy of multi_encoder
/// \param adaptive_seqlen Is the input of multi_encoder variable length /// \param adaptive_seqlen Is the input of multi_encoder variable length
/// \param enable_multi_stream Whether to enable the multi stream of xpu.
/// ///
void EnableXpu(int l3_workspace_size = 0xfffc00, void EnableXpu(int l3_workspace_size = 0xfffc00,
bool locked = false, bool locked = false,
bool autotune = true, bool autotune = true,
const std::string& autotune_file = "", const std::string& autotune_file = "",
const std::string& precision = "int16", const std::string& precision = "int16",
bool adaptive_seqlen = false); bool adaptive_seqlen = false,
bool enable_multi_stream = false);
/// ///
/// \brief configs of IPU /// \brief configs of IPU
...@@ -1102,6 +1104,7 @@ struct PD_INFER_DECL AnalysisConfig { ...@@ -1102,6 +1104,7 @@ struct PD_INFER_DECL AnalysisConfig {
std::string xpu_autotune_file_; std::string xpu_autotune_file_;
std::string xpu_precision_; std::string xpu_precision_;
bool xpu_adaptive_seqlen_; bool xpu_adaptive_seqlen_;
bool xpu_enable_multi_stream_;
// NNAdapter related // NNAdapter related
LiteNNAdapterConfig nnadapter_config_; LiteNNAdapterConfig nnadapter_config_;
......
...@@ -155,14 +155,16 @@ void PD_ConfigEnableXpu(__pd_keep PD_Config* pd_config, ...@@ -155,14 +155,16 @@ void PD_ConfigEnableXpu(__pd_keep PD_Config* pd_config,
PD_Bool autotune, PD_Bool autotune,
const char* autotune_file, const char* autotune_file,
const char* precision, const char* precision,
PD_Bool adaptive_seqlen) { PD_Bool adaptive_seqlen,
PD_Bool enable_multi_stream) {
CHECK_AND_CONVERT_PD_CONFIG; CHECK_AND_CONVERT_PD_CONFIG;
config->EnableXpu(l3_workspace_size, config->EnableXpu(l3_workspace_size,
locked, locked,
autotune, autotune,
autotune_file, autotune_file,
precision, precision,
adaptive_seqlen); adaptive_seqlen,
enable_multi_stream);
} }
void PD_ConfigEnableNpu(__pd_keep PD_Config* pd_config, int32_t device_id) { void PD_ConfigEnableNpu(__pd_keep PD_Config* pd_config, int32_t device_id) {
......
...@@ -200,6 +200,7 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigEnableORTOptimization( ...@@ -200,6 +200,7 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigEnableORTOptimization(
/// file will be used and autotune will not be performed again. /// file will be used and autotune will not be performed again.
/// \param precision Calculation accuracy of multi_encoder /// \param precision Calculation accuracy of multi_encoder
/// \param adaptive_seqlen Is the input of multi_encoder variable length /// \param adaptive_seqlen Is the input of multi_encoder variable length
/// \param enable_multi_stream Whether to enable the multi stream of xpu.
/// ///
PADDLE_CAPI_EXPORT extern void PD_ConfigEnableXpu( PADDLE_CAPI_EXPORT extern void PD_ConfigEnableXpu(
__pd_keep PD_Config* pd_config, __pd_keep PD_Config* pd_config,
...@@ -208,7 +209,8 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigEnableXpu( ...@@ -208,7 +209,8 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigEnableXpu(
PD_Bool autotune, PD_Bool autotune,
const char* autotune_file, const char* autotune_file,
const char* precision, const char* precision,
PD_Bool adaptive_seqlen); PD_Bool adaptive_seqlen,
PD_Bool enable_multi_stream);
/// ///
/// \brief Turn on NPU. /// \brief Turn on NPU.
/// ///
......
...@@ -199,8 +199,9 @@ func (config *Config) EnableORTOptimization() { ...@@ -199,8 +199,9 @@ func (config *Config) EnableORTOptimization() {
/// \param autotune_file Specify the path of the autotune file. If autotune_file is specified, the algorithm specified in the file will be used and autotune will not be performed again. /// \param autotune_file Specify the path of the autotune file. If autotune_file is specified, the algorithm specified in the file will be used and autotune will not be performed again.
/// \param precision Calculation accuracy of multi_encoder /// \param precision Calculation accuracy of multi_encoder
/// \param adaptive_seqlen Is the input of multi_encoder variable length /// \param adaptive_seqlen Is the input of multi_encoder variable length
/// \param enable_multi_stream Whether to enable the multi stream of xpu.
/// ///
func (config *Config) EnableXpu(l3WorkspaceSize int32, locked bool, autotune bool, autotuneFile string, precision string, adaptiveSeqlen bool) { func (config *Config) EnableXpu(l3WorkspaceSize int32, locked bool, autotune bool, autotuneFile string, precision string, adaptiveSeqlen bool, enableMultiStream bool) {
cAutotuneFile := C.CString(autotuneFile) cAutotuneFile := C.CString(autotuneFile)
cPrecision := C.CString(precision) cPrecision := C.CString(precision)
defer func() { defer func() {
...@@ -208,7 +209,7 @@ func (config *Config) EnableXpu(l3WorkspaceSize int32, locked bool, autotune boo ...@@ -208,7 +209,7 @@ func (config *Config) EnableXpu(l3WorkspaceSize int32, locked bool, autotune boo
C.free(unsafe.Pointer(cPrecision)) C.free(unsafe.Pointer(cPrecision))
}() }()
C.PD_ConfigEnableXpu(config.c, C.int32_t(l3WorkspaceSize), cvtGoBoolToPD(locked), cvtGoBoolToPD(autotune), C.PD_ConfigEnableXpu(config.c, C.int32_t(l3WorkspaceSize), cvtGoBoolToPD(locked), cvtGoBoolToPD(autotune),
cAutotuneFile, cPrecision, cvtGoBoolToPD(adaptiveSeqlen)) cAutotuneFile, cPrecision, cvtGoBoolToPD(adaptiveSeqlen), cvtGoBoolToPD(enableMultiStream))
} }
/// ///
...@@ -332,9 +333,9 @@ func (config *Config) IrOptim() bool { ...@@ -332,9 +333,9 @@ func (config *Config) IrOptim() bool {
/// \param useCalibMode Use TRT int8 calibration(post training /// \param useCalibMode Use TRT int8 calibration(post training
/// quantization). /// quantization).
/// ///
func (config *Config) EnableTensorRtEngine(workspaceSize int32, maxBatchSize int32, minSubgraphSize int32, func (config *Config) EnableTensorRtEngine(workspaceSize int64, maxBatchSize int32, minSubgraphSize int32,
precision Precision, useStatic bool, useCalibMode bool) { precision Precision, useStatic bool, useCalibMode bool) {
C.PD_ConfigEnableTensorRtEngine(config.c, C.int32_t(workspaceSize), C.int32_t(maxBatchSize), C.int32_t(minSubgraphSize), C.int32_t(precision), cvtGoBoolToPD(useStatic), cvtGoBoolToPD(useCalibMode)) C.PD_ConfigEnableTensorRtEngine(config.c, C.int64_t(workspaceSize), C.int32_t(maxBatchSize), C.int32_t(minSubgraphSize), C.int32_t(precision), cvtGoBoolToPD(useStatic), cvtGoBoolToPD(useCalibMode))
} }
/// ///
......
...@@ -65,6 +65,7 @@ paddle::lite_api::PaddlePredictor* EngineManager::Create( ...@@ -65,6 +65,7 @@ paddle::lite_api::PaddlePredictor* EngineManager::Create(
lite_cxx_config.set_xpu_multi_encoder_method(cfg.precision, lite_cxx_config.set_xpu_multi_encoder_method(cfg.precision,
cfg.adaptive_seqlen); cfg.adaptive_seqlen);
lite_cxx_config.set_xpu_dev_per_thread(cfg.device_id); lite_cxx_config.set_xpu_dev_per_thread(cfg.device_id);
lite_cxx_config.enable_xpu_multi_stream(cfg.enable_multi_stream);
#endif #endif
#ifdef LITE_SUBGRAPH_WITH_NPU #ifdef LITE_SUBGRAPH_WITH_NPU
......
...@@ -50,6 +50,7 @@ struct EngineConfig { ...@@ -50,6 +50,7 @@ struct EngineConfig {
std::string autotune_file = ""; std::string autotune_file = "";
std::string precision = "int16"; std::string precision = "int16";
bool adaptive_seqlen = false; bool adaptive_seqlen = false;
bool enable_multi_stream = false;
// for x86 or arm // for x86 or arm
int cpu_math_library_num_threads{1}; int cpu_math_library_num_threads{1};
......
...@@ -656,7 +656,8 @@ void BindAnalysisConfig(py::module *m) { ...@@ -656,7 +656,8 @@ void BindAnalysisConfig(py::module *m) {
py::arg("autotune") = true, py::arg("autotune") = true,
py::arg("autotune_file") = "", py::arg("autotune_file") = "",
py::arg("precision") = "int16", py::arg("precision") = "int16",
py::arg("adaptive_seqlen") = false) py::arg("adaptive_seqlen") = false,
py::arg("enable_multi_stream") = false)
.def("set_xpu_device_id", .def("set_xpu_device_id",
&AnalysisConfig::SetXpuDeviceId, &AnalysisConfig::SetXpuDeviceId,
py::arg("device_id") = 0) py::arg("device_id") = 0)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册