From ae0f88a988a1b1e53168a6108484d0cfdcb58003 Mon Sep 17 00:00:00 2001
From: Shang Zhizhou
Date: Mon, 25 Jan 2021 11:40:47 +0800
Subject: [PATCH] =?UTF-8?q?add=20DLA=20support=EF=BC=9AC++&&Python=20api?=
 =?UTF-8?q?=20(#30165)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* add dla

* add dla done

* add python api

Co-authored-by: shangzhizhou
---
 paddle/fluid/inference/analysis/argument.h     |  2 ++
 .../inference/analysis/ir_pass_manager.cc      |  2 ++
 .../ir_passes/tensorrt_subgraph_pass.cc        |  2 ++
 paddle/fluid/inference/api/analysis_config.cc  | 10 ++++++
 .../fluid/inference/api/analysis_predictor.cc  |  2 ++
 .../inference/api/paddle_analysis_config.h     | 17 ++++++++++
 paddle/fluid/inference/tensorrt/engine.cc      | 23 ++++++++++++++
 paddle/fluid/inference/tensorrt/engine.h       | 31 +++++++++++++++++--
 .../inference/tests/api/trt_mobilenet_test.cc  |  1 +
 paddle/fluid/pybind/inference_api.cc           |  3 ++
 10 files changed, 91 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index 1bf106ed7c1..bd27b1f5f34 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -199,6 +199,8 @@ struct Argument {
   DECL_ARGUMENT_FIELD(disable_trt_plugin_fp16, CloseTrtPluginFp16, bool);
 
   DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool);
+  DECL_ARGUMENT_FIELD(tensorrt_use_dla, TensorRtUseDLA, bool);
+  DECL_ARGUMENT_FIELD(tensorrt_dla_core, TensorRtDLACore, int);
   DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int);
   DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int);
   DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index a6466c32af8..048424e306e 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -143,6 +143,8 @@ void IRPassManager::CreatePasses(Argument *argument,
                           argument->optim_input_shape()));
       pass->Set("trt_disabled_ops", new std::vector<std::string>(
                                         argument->tensorrt_disabled_ops()));
+      pass->Set("trt_use_dla", new bool(argument->tensorrt_use_dla()));
+      pass->Set("trt_dla_core", new int(argument->tensorrt_dla_core()));
       // Setting the disable_trt_plugin_fp16 to true means that TRT plugin will
       // not
       // run fp16.
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index 61117cc6032..535f082dccd 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -320,6 +320,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
           min_input_shape, max_input_shape, opt_input_shape,
           disable_trt_plugin_fp16);
   trt_engine->SetUseOSS(Get<bool>("use_oss"));
+  trt_engine->SetUseDLA(Get<bool>("trt_use_dla"));
+  trt_engine->SetDLACore(Get<int>("trt_dla_core"));
   trt_engine->SetWithErnie(
       graph->Has(framework::ir::kEmbEltwiseLayernormPass) &&
       graph->Has(framework::ir::kMultiheadMatmulPass));
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 7c6ce00d5d6..3b422fe98c7 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -126,6 +126,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(tensorrt_min_subgraph_size_);
   CP_MEMBER(tensorrt_precision_mode_);
   CP_MEMBER(trt_disabled_ops_);
+  CP_MEMBER(trt_use_dla_);
+  CP_MEMBER(trt_dla_core_);
   CP_MEMBER(trt_use_static_engine_);
   CP_MEMBER(trt_use_calib_mode_);
   CP_MEMBER(trt_use_oss_);
@@ -305,6 +307,11 @@ void AnalysisConfig::SetTRTDynamicShapeInfo(
   disable_trt_plugin_fp16_ = disable_trt_plugin_fp16;
 }
 
+void AnalysisConfig::EnableTensorRtDLA(int dla_core) {
+  trt_use_dla_ = true;
+  trt_dla_core_ = dla_core;
+}
+
 void AnalysisConfig::Exp_DisableTensorRtOPs(
     const std::vector<std::string> &ops) {
   trt_disabled_ops_.insert(trt_disabled_ops_.end(), ops.begin(), ops.end());
@@ -452,6 +459,9 @@ std::string AnalysisConfig::SerializeInfoCache() {
   for (auto &op : trt_disabled_ops_) ss << op.c_str();
   ss << ";";
 
+  ss << trt_use_dla_;
+  ss << trt_dla_core_;
+
   ss << enable_memory_optim_;
 
   ss << use_mkldnn_;
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index d47a9536abc..2fe1b64fcc0 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -477,6 +477,8 @@ void AnalysisPredictor::PrepareArgument() {
     argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
     argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_);
     argument_.SetTensorRtDisabledOPs(config_.trt_disabled_ops_);
+    argument_.SetTensorRtUseDLA(config_.trt_use_dla_);
+    argument_.SetTensorRtDLACore(config_.trt_dla_core_);
     argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_);
     argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_);
     argument_.SetTensorRtUseCalibMode(config_.trt_use_calib_mode_);
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index ccc971f99bb..c02af5d9f8c 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -326,6 +326,7 @@ struct PD_INFER_DECL AnalysisConfig {
   /// V7.2.1 is needed.
   ///
   void EnableTensorRtOSS();
+
   ///
   /// \brief A boolean state telling whether to use the TensorRT OSS.
   ///
@@ -333,6 +334,20 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   bool tensorrt_oss_enabled() { return trt_use_oss_; }
 
+  ///
+  /// \brief Enable TensorRT DLA
+  /// \param dla_core ID of DLACore, which should be 0, 1,
+  ///        ..., IBuilder.getNbDLACores() - 1
+  ///
+  void EnableTensorRtDLA(int dla_core = 0);
+
+  ///
+  /// \brief A boolean state telling whether to use the TensorRT DLA.
+  ///
+  /// \return bool Whether to use the TensorRT DLA.
+  ///
+  bool tensorrt_dla_enabled() { return trt_use_dla_; }
+
   ///
   /// \brief Turn on the usage of Lite sub-graph engine.
   ///
@@ -591,6 +606,8 @@ struct PD_INFER_DECL AnalysisConfig {
   bool trt_use_static_engine_{false};
   bool trt_use_calib_mode_{true};
   bool trt_use_oss_{false};
+  bool trt_use_dla_{false};
+  int trt_dla_core_{0};
   std::map<std::string, std::vector<int>> min_input_shape_{};
   std::map<std::string, std::vector<int>> max_input_shape_{};
   std::map<std::string, std::vector<int>> optim_input_shape_{};
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index 90b3e2c0e97..7dc1472bbf0 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -176,6 +176,29 @@ void TensorRTEngine::FreezeNetwork() {
     }
   }
 
+  if (use_dla_) {
+    if (!enable_int8 && !enable_fp16) {
+      LOG(WARNING) << "TensorRT DLA must be used with int8 or fp16, but you "
+                      "set float32, so DLA is not used.";
+    } else if (infer_builder_->getNbDLACores() == 0) {
+      LOG(WARNING)
+          << "TensorRT DLA is set by config, but your device does not have "
+             "DLA, so DLA is not used.";
+    } else {
+      if (dla_core_ < 0 || dla_core_ >= infer_builder_->getNbDLACores()) {
+        dla_core_ = 0;
+        LOG(WARNING) << "Invalid DLACore, must be 0 <= DLACore < "
+                     << infer_builder_->getNbDLACores() << ", but got "
+                     << dla_core_ << ", so use 0 as default.";
+      }
+      infer_builder_->setDefaultDeviceType(nvinfer1::DeviceType::kDLA);
+      infer_builder_->setDLACore(dla_core_);
+      infer_builder_->allowGPUFallback(true);
+      LOG(INFO) << "TensorRT DLA enabled in FreezeNetwork(), DLACore "
+                << dla_core_;
+    }
+  }
+
   if (with_dynamic_shape_) {
 #if IS_TRT_VERSION_GE(6000)
     LOG(INFO) << "Run Paddle-TRT Dynamic Shape mode.";
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index cb3f3f94707..0a4cffbe7eb 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -220,6 +220,29 @@ class TensorRTEngine {
   void Deserialize(const std::string& engine_serialized_data) {
     freshDeviceId();
     infer_ptr<nvinfer1::IRuntime> runtime(createInferRuntime(&logger_));
+
+    if (use_dla_) {
+      if (precision_ != AnalysisConfig::Precision::kInt8 &&
+          precision_ != AnalysisConfig::Precision::kHalf) {
+        LOG(WARNING) << "TensorRT DLA must be used with int8 or fp16, but you "
+                        "set float32, so DLA is not used.";
+      } else if (runtime->getNbDLACores() == 0) {
+        LOG(WARNING)
+            << "TensorRT DLA is set by config, but your device does not have "
+               "DLA, so DLA is not used.";
+      } else {
+        if (dla_core_ < 0 || dla_core_ >= runtime->getNbDLACores()) {
+          dla_core_ = 0;
+          LOG(WARNING) << "Invalid DLACore, must be 0 <= DLACore < "
+                       << runtime->getNbDLACores() << ", but got " << dla_core_
+                       << ", so use 0 as default.";
+        }
+        runtime->setDLACore(dla_core_);
+        LOG(INFO) << "TensorRT DLA enabled in Deserialize(), DLACore "
+                  << dla_core_;
+      }
+    }
+
     if (with_dynamic_shape_) {
 #if IS_TRT_VERSION_GE(6000)
       infer_engine_.reset(runtime->deserializeCudaEngine(
@@ -287,6 +310,8 @@ class TensorRTEngine {
   }
 
   void SetUseOSS(bool use_oss) { use_oss_ = use_oss; }
+  void SetUseDLA(bool use_dla) { use_dla_ = use_dla; }
+  void SetDLACore(int dla_core) { dla_core_ = dla_core; }
   void SetWithErnie(bool with_ernie) { with_ernie_ = with_ernie; }
 
   void ClearWeights() {
@@ -316,8 +341,8 @@ class TensorRTEngine {
   ShapeMapType min_input_shape() { return min_input_shape_; }
   ShapeMapType max_input_shape() { return max_input_shape_; }
   ShapeMapType optim_input_shape() { return optim_input_shape_; }
-  bool use_oss() { return use_oss_; };
-  bool with_ernie() { return with_ernie_; };
+  bool use_oss() { return use_oss_; }
+  bool with_ernie() { return with_ernie_; }
   bool disable_trt_plugin_fp16() { return disable_trt_plugin_fp16_; }
   bool with_dynamic_shape() { return with_dynamic_shape_; }
 
@@ -354,6 +379,8 @@ class TensorRTEngine {
   ShapeMapType optim_input_shape_;
   bool disable_trt_plugin_fp16_{false};
   bool use_oss_{false};
+  bool use_dla_{false};
+  int dla_core_{0};
   bool with_ernie_{false};
 
   nvinfer1::ILogger& logger_;
diff --git a/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc b/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc
index 425b6727318..d5d60cc08ab 100644
--- a/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc
@@ -58,6 +58,7 @@ TEST(PredictorPool, use_gpu) {
   config.SetModel(model_dir);
   config.EnableTensorRtEngine();
   config.Exp_DisableTensorRtOPs({"fc"});
+  config.EnableTensorRtDLA(0);
   services::PredictorPool pred_pool(config, 1);
 
   auto predictor = pred_pool.Retrive(0);
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index 61b5c4899e7..0027181189c 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -504,6 +504,9 @@ void BindAnalysisConfig(py::module *m) {
            py::arg("disable_trt_plugin_fp16") = false)
       .def("enable_tensorrt_oss", &AnalysisConfig::EnableTensorRtOSS)
       .def("tensorrt_oss_enabled", &AnalysisConfig::tensorrt_oss_enabled)
+      .def("enable_tensorrt_dla", &AnalysisConfig::EnableTensorRtDLA,
+           py::arg("dla_core") = 0)
+      .def("tensorrt_dla_enabled", &AnalysisConfig::tensorrt_dla_enabled)
       .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled)
       .def("enable_lite_engine", &AnalysisConfig::EnableLiteEngine,
           py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32,
-- 
GitLab
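
Usage sketch: a minimal C++ example of how the new AnalysisConfig::EnableTensorRtDLA() API could be driven, modeled on the trt_mobilenet_test.cc change above. The include path, model directory, and GPU memory pool size are assumptions, not taken from this patch; DLA only takes effect with fp16 or int8 precision on hardware that exposes DLA cores, otherwise the engine falls back to the GPU with a warning, as the engine.cc logic above shows.

    #include <cassert>

    #include "paddle_inference_api.h"  // assumed include path; adjust to the install layout

    int main() {
      paddle::AnalysisConfig config;
      config.SetModel("./mobilenet_v1");     // placeholder model directory
      config.EnableUseGpu(100 /* MB */, 0);  // initial GPU memory pool, device id 0
      // DLA requires fp16 or int8; with float32 the engine warns and stays on the GPU.
      config.EnableTensorRtEngine(1 << 30, 1, 3,
                                  paddle::AnalysisConfig::Precision::kHalf,
                                  false, false);
      config.EnableTensorRtDLA(0);  // run the TensorRT subgraphs on DLA core 0
      assert(config.tensorrt_dla_enabled());

      paddle_infer::services::PredictorPool pred_pool(config, 1);
      auto predictor = pred_pool.Retrive(0);
      // ... bind input tensors and call predictor->Run() as usual ...
      return 0;
    }

From Python, the bindings added in inference_api.cc expose the same switches as config.enable_tensorrt_dla(dla_core=0) and config.tensorrt_dla_enabled().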