diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
old mode 100755
new mode 100644
index d855dc999cab811c4f9ac77c8f49e7f108836c1d..b2cdc1a369c36ed5c605005aee73663d35968ccf
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -288,6 +288,7 @@ struct Argument {
   DECL_ARGUMENT_FIELD(xpu_precision, XpuPrecision, std::string);
   DECL_ARGUMENT_FIELD(xpu_adaptive_seqlen, XpuAdaptiveSeqlen, bool);
   DECL_ARGUMENT_FIELD(xpu_device_id, XpuDeviceId, int);
+  DECL_ARGUMENT_FIELD(xpu_enable_multi_stream, XpuEnableMultiStream, bool);
   DECL_ARGUMENT_FIELD(use_nnadapter, UseNNAdapter, bool);
   DECL_ARGUMENT_FIELD(nnadapter_model_cache_dir,
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index 97ca7c37c7f0e7525a258c39c8b1147e22f10250..4551d8dbf5225243868dc91d2d1d0537c55b3028 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -263,6 +263,8 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("precision", new std::string(argument->xpu_precision()));
       pass->Set("adaptive_seqlen", new bool(argument->xpu_adaptive_seqlen()));
       pass->Set("xpu_device_id", new int(argument->xpu_device_id()));
+      pass->Set("enable_multi_stream",
+                new bool(argument->xpu_enable_multi_stream()));
       // NNAdapter Related
       pass->Set("use_nnadapter", new bool(argument->use_nnadapter()));
       pass->Set("nnadapter_model_cache_dir",
diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
index 1c67923657029acd400026d67b7615c953c4eb37..577e3df2e68f8ef9973e80a33438679631a92e63 100644
--- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
@@ -258,6 +258,7 @@ void LiteSubgraphPass::SetUpEngine(
   std::string autotune_file = Get<std::string>("autotune_file");
   std::string precision = Get<std::string>("precision");
   bool adaptive_seqlen = Get<bool>("adaptive_seqlen");
+  bool enable_multi_stream = Get<bool>("enable_multi_stream");
   // NNAdapter Related
   bool use_nnadapter = Get<bool>("use_nnadapter");
   std::string nnadapter_model_cache_dir =
@@ -302,7 +303,6 @@ void LiteSubgraphPass::SetUpEngine(
       // input tensor of the Lite engine is located, and then affects
       // whether tensor sharing is feasible.
       paddle::lite_api::Place({target_type, precision_type}),
-      paddle::lite_api::Place({target_type, PRECISION(kInt64)}),
       paddle::lite_api::Place({target_type, PRECISION(kFloat)}),
 #ifdef PADDLE_WITH_ARM
       paddle::lite_api::Place({TARGET(kARM), precision_type}),
@@ -321,6 +321,7 @@ void LiteSubgraphPass::SetUpEngine(
   config.autotune_file = autotune_file;
   config.precision = precision;
   config.adaptive_seqlen = adaptive_seqlen;
+  config.enable_multi_stream = enable_multi_stream;
   // NNAdapter Related
   config.nnadapter_model_cache_dir = nnadapter_model_cache_dir;
   config.nnadapter_device_names = nnadapter_device_names;
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index be09976bc4d0ec8b5b395fbf1f3e11f6dbe28e96..8c9f02a4d37b3f985fac9b667ed915ca87fd2a7c 100755
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -137,7 +137,8 @@ void AnalysisConfig::EnableXpu(int l3_workspace_size,
                                bool autotune,
                                const std::string &autotune_file,
                                const std::string &precision,
-                               bool adaptive_seqlen) {
+                               bool adaptive_seqlen,
+                               bool enable_multi_stream) {
   use_xpu_ = true;
   xpu_l3_workspace_size_ = l3_workspace_size;
   xpu_locked_ = locked;
@@ -145,6 +146,7 @@ void AnalysisConfig::EnableXpu(int l3_workspace_size,
   xpu_autotune_file_ = autotune_file;
   xpu_precision_ = precision;
   xpu_adaptive_seqlen_ = adaptive_seqlen;
+  xpu_enable_multi_stream_ = enable_multi_stream;

   Update();
 }
@@ -439,6 +441,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(xpu_autotune_file_);
   CP_MEMBER(xpu_precision_);
   CP_MEMBER(xpu_adaptive_seqlen_);
+  CP_MEMBER(xpu_enable_multi_stream_);
   // NPU related.
   CP_MEMBER(use_npu_);
   CP_MEMBER(npu_device_id_);
@@ -1020,6 +1023,7 @@ std::string AnalysisConfig::SerializeInfoCache() {
   ss << xpu_autotune_file_;
   ss << xpu_precision_;
   ss << xpu_adaptive_seqlen_;
+  ss << xpu_enable_multi_stream_;

   ss << use_npu_;
   ss << npu_device_id_;
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 8663ec7d1f09be59888de0b39048a2d1eadfc669..9197efc2a5edbb95f399a204a9f7c1986368100c 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -1148,6 +1148,7 @@ void AnalysisPredictor::PrepareArgument() {
     argument_.SetXpuPrecision(config_.xpu_precision_);
     argument_.SetXpuAdaptiveSeqlen(config_.xpu_adaptive_seqlen_);
     argument_.SetXpuDeviceId(config_.xpu_device_id_);
+    argument_.SetXpuEnableMultiStream(config_.xpu_enable_multi_stream_);
     // NNAdapter related
     argument_.SetUseNNAdapter(config_.NNAdapter().use_nnadapter);
     argument_.SetNNAdapterDeviceNames(
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 5bc50515bf40a72a2eac664e00a94f6722512863..0ed5380e6755c3b4c0078896799e478a86dfaab0 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -274,13 +274,15 @@ struct PD_INFER_DECL AnalysisConfig {
   /// file will be used and autotune will not be performed again.
   /// \param precision Calculation accuracy of multi_encoder
   /// \param adaptive_seqlen Is the input of multi_encoder variable length
+  /// \param enable_multi_stream Whether to enable the multi stream of xpu.
   ///
   void EnableXpu(int l3_workspace_size = 0xfffc00,
                  bool locked = false,
                  bool autotune = true,
                  const std::string& autotune_file = "",
                  const std::string& precision = "int16",
-                 bool adaptive_seqlen = false);
+                 bool adaptive_seqlen = false,
+                 bool enable_multi_stream = false);

   ///
   /// \brief configs of IPU
@@ -1102,6 +1104,7 @@ struct PD_INFER_DECL AnalysisConfig {
   std::string xpu_autotune_file_;
   std::string xpu_precision_;
   bool xpu_adaptive_seqlen_;
+  bool xpu_enable_multi_stream_;

   // NNAdapter related
   LiteNNAdapterConfig nnadapter_config_;
diff --git a/paddle/fluid/inference/capi_exp/pd_config.cc b/paddle/fluid/inference/capi_exp/pd_config.cc
index b183ba8c63b25cf067976cd70c47c021cb27fd52..6ff88beb70225e6a002c70f1d0d1a79523a00698 100644
--- a/paddle/fluid/inference/capi_exp/pd_config.cc
+++ b/paddle/fluid/inference/capi_exp/pd_config.cc
@@ -155,14 +155,16 @@ void PD_ConfigEnableXpu(__pd_keep PD_Config* pd_config,
                         PD_Bool autotune,
                         const char* autotune_file,
                         const char* precision,
-                        PD_Bool adaptive_seqlen) {
+                        PD_Bool adaptive_seqlen,
+                        PD_Bool enable_multi_stream) {
   CHECK_AND_CONVERT_PD_CONFIG;
   config->EnableXpu(l3_workspace_size,
                     locked,
                     autotune,
                     autotune_file,
                     precision,
-                    adaptive_seqlen);
+                    adaptive_seqlen,
+                    enable_multi_stream);
 }

 void PD_ConfigEnableNpu(__pd_keep PD_Config* pd_config, int32_t device_id) {
diff --git a/paddle/fluid/inference/capi_exp/pd_config.h b/paddle/fluid/inference/capi_exp/pd_config.h
index a7054d53908389bf7102213bf118c874ec5aa05d..feb1d5724438aa18cea2e8a83f844935eec09e5b 100644
--- a/paddle/fluid/inference/capi_exp/pd_config.h
+++ b/paddle/fluid/inference/capi_exp/pd_config.h
@@ -200,6 +200,7 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigEnableORTOptimization(
 /// file will be used and autotune will not be performed again.
 /// \param precision Calculation accuracy of multi_encoder
 /// \param adaptive_seqlen Is the input of multi_encoder variable length
+/// \param enable_multi_stream Whether to enable the multi stream of xpu.
 ///
 PADDLE_CAPI_EXPORT extern void PD_ConfigEnableXpu(
     __pd_keep PD_Config* pd_config,
@@ -208,7 +209,8 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigEnableXpu(
     PD_Bool autotune,
     const char* autotune_file,
     const char* precision,
-    PD_Bool adaptive_seqlen);
+    PD_Bool adaptive_seqlen,
+    PD_Bool enable_multi_stream);
 ///
 /// \brief Turn on NPU.
 ///
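Usage sketch (illustrative, not part of the patch): with the C++ API change above, opting into XPU multi-stream only requires the new trailing argument to EnableXpu; omitting it keeps the default false, so existing callers are unaffected. The model path below is a placeholder, and the exact include path depends on how the inference library is installed.

  #include "paddle_inference_api.h"

  void ConfigureXpuMultiStream() {
    paddle::AnalysisConfig config;
    config.SetModel("./model_dir");  // placeholder model directory
    // Same values as the documented defaults; only the last argument is new.
    config.EnableXpu(/*l3_workspace_size=*/0xfffc00,
                     /*locked=*/false,
                     /*autotune=*/true,
                     /*autotune_file=*/"",
                     /*precision=*/"int16",
                     /*adaptive_seqlen=*/false,
                     /*enable_multi_stream=*/true);
    // Build the predictor from this config as usual.
  }
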
diff --git a/paddle/fluid/inference/goapi/config.go b/paddle/fluid/inference/goapi/config.go
index 0aca2a1075fd3f0c9720905b2decf6b837ddf683..508ac635295605c3c4b3b3f0c1f0437b90b3d22f 100644
--- a/paddle/fluid/inference/goapi/config.go
+++ b/paddle/fluid/inference/goapi/config.go
@@ -199,8 +199,9 @@ func (config *Config) EnableORTOptimization() {
 /// \param autotune_file Specify the path of the autotune file. If autotune_file is specified, the algorithm specified in the file will be used and autotune will not be performed again.
 /// \param precision Calculation accuracy of multi_encoder
 /// \param adaptive_seqlen Is the input of multi_encoder variable length
+/// \param enable_multi_stream Whether to enable the multi stream of xpu
 ///
-func (config *Config) EnableXpu(l3WorkspaceSize int32, locked bool, autotune bool, autotuneFile string, precision string, adaptiveSeqlen bool) {
+func (config *Config) EnableXpu(l3WorkspaceSize int32, locked bool, autotune bool, autotuneFile string, precision string, adaptiveSeqlen bool, enableMultiStream bool) {
 	cAutotuneFile := C.CString(autotuneFile)
 	cPrecision := C.CString(precision)
 	defer func() {
@@ -208,7 +209,7 @@ func (config *Config) EnableXpu(l3WorkspaceSize int32, locked bool, autotune boo
 		C.free(unsafe.Pointer(cPrecision))
 	}()
 	C.PD_ConfigEnableXpu(config.c, C.int32_t(l3WorkspaceSize), cvtGoBoolToPD(locked), cvtGoBoolToPD(autotune),
-		cAutotuneFile, cPrecision, cvtGoBoolToPD(adaptiveSeqlen))
+		cAutotuneFile, cPrecision, cvtGoBoolToPD(adaptiveSeqlen), cvtGoBoolToPD(enableMultiStream))
 }

 ///
@@ -332,9 +333,9 @@ func (config *Config) IrOptim() bool {
 /// \param useCalibMode Use TRT int8 calibration(post training
 /// quantization).
 ///
-func (config *Config) EnableTensorRtEngine(workspaceSize int32, maxBatchSize int32, minSubgraphSize int32,
+func (config *Config) EnableTensorRtEngine(workspaceSize int64, maxBatchSize int32, minSubgraphSize int32,
 	precision Precision, useStatic bool, useCalibMode bool) {
-	C.PD_ConfigEnableTensorRtEngine(config.c, C.int32_t(workspaceSize), C.int32_t(maxBatchSize), C.int32_t(minSubgraphSize), C.int32_t(precision), cvtGoBoolToPD(useStatic), cvtGoBoolToPD(useCalibMode))
+	C.PD_ConfigEnableTensorRtEngine(config.c, C.int64_t(workspaceSize), C.int32_t(maxBatchSize), C.int32_t(minSubgraphSize), C.int32_t(precision), cvtGoBoolToPD(useStatic), cvtGoBoolToPD(useCalibMode))
 }

 ///
diff --git a/paddle/fluid/inference/lite/engine.cc b/paddle/fluid/inference/lite/engine.cc
index 300ff99602ccfa6d8446592b065b83845437dc93..3a60077e9fa0b1dcd602d48b6edf358e35068c9b 100644
--- a/paddle/fluid/inference/lite/engine.cc
+++ b/paddle/fluid/inference/lite/engine.cc
@@ -65,6 +65,7 @@ paddle::lite_api::PaddlePredictor* EngineManager::Create(
   lite_cxx_config.set_xpu_multi_encoder_method(cfg.precision,
                                                cfg.adaptive_seqlen);
   lite_cxx_config.set_xpu_dev_per_thread(cfg.device_id);
+  lite_cxx_config.enable_xpu_multi_stream(cfg.enable_multi_stream);
 #endif

 #ifdef LITE_SUBGRAPH_WITH_NPU
diff --git a/paddle/fluid/inference/lite/engine.h b/paddle/fluid/inference/lite/engine.h
index adeaca7c1c3b7cad10b5cb76a2c35b3960342a5c..bc38b5efaeb87bc0ac6109b95508f442a82956d8 100644
--- a/paddle/fluid/inference/lite/engine.h
+++ b/paddle/fluid/inference/lite/engine.h
@@ -50,6 +50,7 @@ struct EngineConfig {
   std::string autotune_file = "";
   std::string precision = "int16";
   bool adaptive_seqlen = false;
+  bool enable_multi_stream = false;

   // for x86 or arm
   int cpu_math_library_num_threads{1};
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index d067f8e47fc48ee0c231d3b8617b6593ec1eab1d..5d2a5799078831b165259af7a3b7359ec7c51428 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -656,7 +656,8 @@ void BindAnalysisConfig(py::module *m) {
            py::arg("autotune") = true,
            py::arg("autotune_file") = "",
            py::arg("precision") = "int16",
-           py::arg("adaptive_seqlen") = false)
+           py::arg("adaptive_seqlen") = false,
+           py::arg("enable_multi_stream") = false)
       .def("set_xpu_device_id",
            &AnalysisConfig::SetXpuDeviceId,
            py::arg("device_id") = 0)
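Usage sketch (illustrative, not part of the patch): PD_ConfigEnableXpu now takes a seventh PD_Bool, so existing C API callers (and wrappers such as the Go binding above) have to pass the extra argument. A minimal sketch, assuming the usual capi_exp entry points PD_ConfigCreate/PD_ConfigDestroy and the same default-like values as the C++ API; the include path may differ per install layout.

  #include "pd_config.h"

  void ConfigureXpuMultiStreamC() {
    PD_Config* config = PD_ConfigCreate();
    // The trailing argument is the new enable_multi_stream switch; all
    // earlier arguments keep their previous meaning.
    PD_ConfigEnableXpu(config,
                       /*l3_workspace_size=*/0xfffc00,
                       /*locked=*/0,
                       /*autotune=*/1,
                       /*autotune_file=*/"",
                       /*precision=*/"int16",
                       /*adaptive_seqlen=*/0,
                       /*enable_multi_stream=*/1);
    // No predictor is built in this sketch, so release the config directly.
    PD_ConfigDestroy(config);
  }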