From 61238d31f7f78d0c30650085950acab44f60c85b Mon Sep 17 00:00:00 2001
From: Zhaolong Xing
Date: Wed, 31 Jul 2019 16:46:15 +0800
Subject: [PATCH] Trt fp16 support (#18860)

* Fix the Mask R-CNN predictor:
  1. refine the memory optimization algorithm to support models with the block op.
  2. fix an output diff by modifying the affine channel fuse.
  3. add the condition_block_infer op.
  Also add an interface for setting the TRT calibration table directory.
  test=develop

* add the missing files.
  test=develop

* add TRT FP16 support.
  test=develop
---
 .../inference/analysis/ir_pass_manager.cc     |  6 +++--
 .../ir_passes/tensorrt_subgraph_pass.cc       |  6 ++++-
 .../ir_passes/tensorrt_subgraph_pass.h        |  1 +
 .../inference/api/paddle_analysis_config.h    |  1 +
 .../tensorrt/convert/test_op_converter.cc     |  6 ++---
 .../inference/tensorrt/convert/ut_helper.h    |  3 +--
 paddle/fluid/inference/tensorrt/engine.cc     | 13 +++++++++-
 paddle/fluid/inference/tensorrt/engine.h      | 25 +++++++++++--------
 .../operators/tensorrt/tensorrt_engine_op.h   | 14 +++++++++--
 .../tensorrt/tensorrt_engine_op_test.cc       |  2 ++
 paddle/fluid/pybind/inference_api.cc          |  1 +
 11 files changed, 55 insertions(+), 23 deletions(-)

diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index 2dae5137196..2b7f1dfeaff 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -84,13 +84,15 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("program",
                 new framework::ProgramDesc *(&argument->main_program()));
 
-      bool enable_int8 = argument->tensorrt_precision_mode() ==
-                         AnalysisConfig::Precision::kInt8;
+      auto precision_mode = argument->tensorrt_precision_mode();
+      bool enable_int8 = precision_mode == AnalysisConfig::Precision::kInt8;
 
       pass->Set("predictor_id", new int(argument->predictor_id()));
       bool use_calib_mode = argument->tensorrt_use_calib_mode();
       pass->Set("enable_int8", new bool(enable_int8));
       pass->Set("use_calib_mode", new bool(use_calib_mode));
+      pass->Set("precision_mode",
+                new AnalysisConfig::Precision(precision_mode));
 
       bool use_static_engine = argument->tensorrt_use_static_engine();
       bool model_from_memory = argument->model_from_memory();
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index ce8f57c0f04..1d1e4570e1e 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -149,6 +149,9 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
       graph_var_map[node->Name()] = node;
     }
   }
+  auto precision_mode = Get<AnalysisConfig::Precision>("precision_mode");
+  bool enable_fp16 = false;
+  if (precision_mode == AnalysisConfig::Precision::kHalf) enable_fp16 = true;
   auto enable_int8 = Get<bool>("enable_int8");
   auto use_calib_mode = Get<bool>("use_calib_mode");
   auto &subgraph_nodes = *Agent(node).subgraph();
@@ -216,6 +219,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
 
   SetAttr(op_desc->Proto(), "calibration_data", calibration_data);
   SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
+  SetAttr(op_desc->Proto(), "enable_fp16", enable_fp16);
   SetAttr(op_desc->Proto(), "use_calib_mode", use_calib_mode);
   SetAttr(op_desc->Proto(), "engine_key", engine_key);
   SetAttr(op_desc->Proto(), "predictor_id", predictor_id);
@@ -244,7 +248,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
       inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
           .Create(engine_key + std::to_string(predictor_id),
                   Get<int>("max_batch_size"), Get<int>("workspace_size"),
-                  enable_int8, calibrator.get(), Get<int>("gpu_device_id"));
+                  precision_mode, calibrator.get(), Get<int>("gpu_device_id"));
 
   bool need_serialize = (use_static_engine && !load_from_memory);
   if (need_serialize) {
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h
index f530a5a0b33..b6b67ce8ece 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h
@@ -21,6 +21,7 @@
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h"
+#include "paddle/fluid/inference/api/paddle_analysis_config.h"
 
 namespace paddle {
 namespace inference {
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 83143be07a7..0e7673be786 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -46,6 +46,7 @@ struct AnalysisConfig {
   enum class Precision {
     kFloat32 = 0,
     kInt8,
+    kHalf,
   };
 
   /** Set model with a directory.
diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
index c5a413221eb..52655663706 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
-#include <gtest/gtest.h>
+#include <gtest/gtest.h>  // NOLINT
 #include "paddle/fluid/framework/program_desc.h"
 
 namespace paddle {
@@ -27,10 +27,8 @@ TEST(OpConverter, ConvertBlock) {
   auto* conv2d_op = block->AppendOp();
 
   // init trt engine
-  cudaStream_t stream_;
   std::unique_ptr<TensorRTEngine> engine_;
-  PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
-  engine_.reset(new TensorRTEngine(5, 1 << 15, stream_));
+  engine_.reset(new TensorRTEngine(5, 1 << 15));
   engine_->InitNetwork();
 
   engine_->DeclareInput("conv2d-X", nvinfer1::DataType::kFLOAT,
diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
index 388d83d8345..97affafb4bf 100644
--- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h
+++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
@@ -80,8 +80,7 @@ class TRTConvertValidation {
         if_add_batch_(if_add_batch),
         max_batch_size_(max_batch_size) {
     PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
-    engine_.reset(
-        new TensorRTEngine(max_batch_size, workspace_size, false, nullptr, 0));
+    engine_.reset(new TensorRTEngine(max_batch_size, workspace_size));
     engine_->InitNetwork();
   }
 
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index c5ac6f38410..cc9382419d5 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -51,7 +51,18 @@ void TensorRTEngine::FreezeNetwork() {
   // build engine.
   infer_builder_->setMaxBatchSize(max_batch_);
   infer_builder_->setMaxWorkspaceSize(max_workspace_);
-  if (enable_int8_) {
+  bool enable_fp16 = (precision_ == AnalysisConfig::Precision::kHalf);
+  if (enable_fp16) {
+    bool support_fp16 = infer_builder_->platformHasFastFp16();
+    infer_builder_->setFp16Mode(support_fp16);
+    if (!support_fp16) {
+      LOG(INFO) << "You specified FP16 mode, but the hardware does not "
+                   "support FP16 speed up; using FP32 instead.";
+    }
+  }
+  bool enable_int8 = (precision_ == AnalysisConfig::Precision::kInt8);
+
+  if (enable_int8) {
     infer_builder_->setInt8Mode(true);
     if (calibrator_) {
       infer_builder_->setInt8Calibrator(calibrator_);
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index 80af463d274..b242a5ac364 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -22,6 +22,7 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/inference/api/paddle_analysis_config.h"
 #include "paddle/fluid/inference/engine.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
@@ -61,12 +62,14 @@ class TensorRTEngine {
     nvinfer1::Weights w_;
   };
 
-  TensorRTEngine(int max_batch, int max_workspace, bool enable_int8 = false,
-                 TRTInt8Calibrator* calibrator = nullptr, int device_id = 0,
-                 nvinfer1::ILogger& logger = NaiveLogger::Global())
+  TensorRTEngine(
+      int max_batch, int max_workspace,
+      AnalysisConfig::Precision precision = AnalysisConfig::Precision::kFloat32,
+      TRTInt8Calibrator* calibrator = nullptr, int device_id = 0,
+      nvinfer1::ILogger& logger = NaiveLogger::Global())
       : max_batch_(max_batch),
         max_workspace_(max_workspace),
-        enable_int8_(enable_int8),
+        precision_(precision),
         calibrator_(calibrator),
         device_id_(device_id),
         logger_(logger) {}
@@ -168,7 +171,7 @@ class TensorRTEngine {
   // the max memory size the engine uses
   int max_workspace_;
 
-  bool enable_int8_;
+  AnalysisConfig::Precision precision_;
   TRTInt8Calibrator* calibrator_;
   // batch size of the current data, will be updated each Executation.
   int batch_size_{-1};
@@ -231,12 +234,12 @@ class TRTEngineManager {
     return engines_.at(name).get();
   }
 
-  TensorRTEngine* Create(std::string name, int max_batch, int max_workspace,
-                         bool enable_int8 = false,
-                         TRTInt8Calibrator* calibrator = nullptr,
-                         int device_id = 0,
-                         nvinfer1::ILogger& logger = NaiveLogger::Global()) {
-    auto* p = new TensorRTEngine(max_batch, max_workspace, enable_int8,
+  TensorRTEngine* Create(
+      std::string name, int max_batch, int max_workspace,
+      AnalysisConfig::Precision precision = AnalysisConfig::Precision::kFloat32,
+      TRTInt8Calibrator* calibrator = nullptr, int device_id = 0,
+      nvinfer1::ILogger& logger = NaiveLogger::Global()) {
+    auto* p = new TensorRTEngine(max_batch, max_workspace, precision,
                                  calibrator, device_id, logger);
     engines_[name].reset(p);
     return p;
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
index 79c9f759aae..41492979cd8 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -48,12 +48,14 @@ class TensorRTEngineOp : public framework::OperatorBase {
   int workspace_size_;
   std::unique_ptr<TRTInt8Calibrator> calibrator_;
   bool enable_int8_;
+  bool enable_fp16_;
   bool use_calib_mode_;
   std::string calibration_data_;
   std::string engine_key_;
   bool calibration_mode_;
   int predictor_id_;
   int device_id_;
+  AnalysisConfig::Precision precision_mode_;
 
  public:
   TensorRTEngineOp(const std::string &type,
@@ -66,6 +68,7 @@
     workspace_size_ = Attr<int>("workspace_size");
     device_id_ = Attr<int>("gpu_id");
     enable_int8_ = Attr<bool>("enable_int8");
+    enable_fp16_ = Attr<bool>("enable_fp16");
     use_calib_mode_ = Attr<bool>("use_calib_mode");
     calibration_data_ = Attr<std::string>("calibration_data");
     engine_key_ = Attr<std::string>("engine_key");
@@ -93,6 +96,13 @@ class TensorRTEngineOp : public framework::OperatorBase {
           inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
               .Get(engine_key_ + std::to_string(predictor_id_));
     }
+    precision_mode_ = AnalysisConfig::Precision::kFloat32;
+    if (enable_int8_) {
+      precision_mode_ = AnalysisConfig::Precision::kInt8;
+    }
+    if (enable_fp16_) {
+      precision_mode_ = AnalysisConfig::Precision::kHalf;
+    }
   }
 
  protected:
@@ -141,7 +151,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
           calib_buffers, runtime_batch, engine_key_, dev_place));
       calib_res->thr_.reset(new std::thread([&]() {
         calib_res->engine_.reset(new TensorRTEngine(
-            max_batch_size_, workspace_size_, enable_int8_,
+            max_batch_size_, workspace_size_, precision_mode_,
            calib_res->calib_.get(),
            boost::get<platform::CUDAPlace>(dev_place).device));
         VLOG(3) << "start the calib trt engine thread";
@@ -241,7 +251,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
       trt_engine_ =
           inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
               .Create(engine_key_ + std::to_string(predictor_id_),
-                      max_batch_size_, workspace_size_, enable_int8_,
+                      max_batch_size_, workspace_size_, precision_mode_,
                       calibrator_.get(), device_id_);
       PrepareTRTEngine(scope, trt_engine_);
     }
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
index efc50fc06f4..e813e9ca757 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
@@ -105,6 +105,7 @@ TEST(TensorRTEngineOp, manual) {
   engine_op_desc.SetAttr("predictor_id", 1);
   engine_op_desc.SetAttr("calibration_data", std::string(""));
   engine_op_desc.SetAttr("enable_int8", static_cast<bool>(false));
+  engine_op_desc.SetAttr("enable_fp16", static_cast<bool>(false));
   engine_op_desc.SetAttr("use_calib_mode", static_cast<bool>(false));
   engine_op_desc.SetAttr("output_name_mapping",
                          std::vector<std::string>({"z0"}));
@@ -205,6 +206,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
   engine_op_desc.SetAttr("predictor_id", 1);
   engine_op_desc.SetAttr("calibration_data", std::string(""));
   engine_op_desc.SetAttr("enable_int8", static_cast<bool>(false));
+  engine_op_desc.SetAttr("enable_fp16", static_cast<bool>(false));
   engine_op_desc.SetAttr("use_calib_mode", static_cast<bool>(false));
   engine_op_desc.SetAttr("output_name_mapping",
                          std::vector<std::string>({"z3"}));
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index d8664425b7f..ae7fcad7847 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -199,6 +199,7 @@ void BindAnalysisConfig(py::module *m) {
   py::enum_<AnalysisConfig::Precision>(analysis_config, "Precision")
       .value("Float32", AnalysisConfig::Precision::kFloat32)
       .value("Int8", AnalysisConfig::Precision::kInt8)
+      .value("Half", AnalysisConfig::Precision::kHalf)
       .export_values();
 
   analysis_config.def(py::init())
-- 
GitLab