Unverified commit 61238d31, authored by Zhaolong Xing, committed by GitHub

TRT FP16 support (#18860)

* Fix the Mask R-CNN predictor:
    1. refine the memory-optimization algorithm to support models with the block op
    2. fix the output diff: modify the affine channel fuse
    3. add the condition_block_infer op
add an interface for setting the TRT calibration table directory
test=develop

* add the missing files.
test=develop

* add TRT FP16 support
test=develop
Parent 20859c08
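For orientation, a minimal usage sketch of what this commit enables (not part of the diff; the model path is hypothetical, and the EnableTensorRTEngine parameter order is assumed from the AnalysisConfig API of this period):

    AnalysisConfig config;
    config.SetModel("./model_dir");  // hypothetical model directory
    config.EnableUseGpu(100 /* initial GPU memory in MB */, 0 /* GPU id */);
    // Precision::kHalf is the enum value introduced by this commit; it asks
    // the TensorRT subgraph engine to build in FP16 where supported.
    config.EnableTensorRTEngine(1 << 20 /* workspace_size */,
                                1 /* max_batch_size */,
                                3 /* min_subgraph_size */,
                                AnalysisConfig::Precision::kHalf);
    auto predictor = CreatePaddlePredictor(config);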
@@ -84,13 +84,15 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("program",
                 new framework::ProgramDesc *(&argument->main_program()));
-      bool enable_int8 = argument->tensorrt_precision_mode() ==
-                         AnalysisConfig::Precision::kInt8;
+      auto precision_mode = argument->tensorrt_precision_mode();
+      bool enable_int8 = precision_mode == AnalysisConfig::Precision::kInt8;
       pass->Set("predictor_id", new int(argument->predictor_id()));
       bool use_calib_mode = argument->tensorrt_use_calib_mode();
       pass->Set("enable_int8", new bool(enable_int8));
       pass->Set("use_calib_mode", new bool(use_calib_mode));
+      pass->Set("precision_mode",
+                new AnalysisConfig::Precision(precision_mode));
       bool use_static_engine = argument->tensorrt_use_static_engine();
       bool model_from_memory = argument->model_from_memory();
......
@@ -149,6 +149,9 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
       graph_var_map[node->Name()] = node;
     }
   }
+  auto precision_mode = Get<AnalysisConfig::Precision>("precision_mode");
+  bool enable_fp16 = false;
+  if (precision_mode == AnalysisConfig::Precision::kHalf) enable_fp16 = true;
   auto enable_int8 = Get<bool>("enable_int8");
   auto use_calib_mode = Get<bool>("use_calib_mode");
   auto &subgraph_nodes = *Agent(node).subgraph();
@@ -216,6 +219,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   SetAttr(op_desc->Proto(), "calibration_data", calibration_data);
   SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
+  SetAttr(op_desc->Proto(), "enable_fp16", enable_fp16);
   SetAttr(op_desc->Proto(), "use_calib_mode", use_calib_mode);
   SetAttr(op_desc->Proto(), "engine_key", engine_key);
   SetAttr(op_desc->Proto(), "predictor_id", predictor_id);
@@ -244,7 +248,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
       inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
           .Create(engine_key + std::to_string(predictor_id),
                   Get<int>("max_batch_size"), Get<int>("workspace_size"),
-                  enable_int8, calibrator.get(), Get<int>("gpu_device_id"));
+                  precision_mode, calibrator.get(), Get<int>("gpu_device_id"));
   bool need_serialize = (use_static_engine && !load_from_memory);
   if (need_serialize) {
......
@@ -21,6 +21,7 @@
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h"
+#include "paddle/fluid/inference/api/paddle_analysis_config.h"
 namespace paddle {
 namespace inference {
......
@@ -46,6 +46,7 @@ struct AnalysisConfig {
   enum class Precision {
     kFloat32 = 0,
     kInt8,
+    kHalf,
   };
   /** Set model with a directory.
......
@@ -14,7 +14,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-#include <gtest/gtest.h>
+#include <gtest/gtest.h>  // NOLINT
 #include "paddle/fluid/framework/program_desc.h"
 namespace paddle {
@@ -27,10 +27,8 @@ TEST(OpConverter, ConvertBlock) {
   auto* conv2d_op = block->AppendOp();
   // init trt engine
-  cudaStream_t stream_;
   std::unique_ptr<TensorRTEngine> engine_;
-  PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
-  engine_.reset(new TensorRTEngine(5, 1 << 15, stream_));
+  engine_.reset(new TensorRTEngine(5, 1 << 15));
   engine_->InitNetwork();
   engine_->DeclareInput("conv2d-X", nvinfer1::DataType::kFLOAT,
......
@@ -80,8 +80,7 @@ class TRTConvertValidation {
         if_add_batch_(if_add_batch),
         max_batch_size_(max_batch_size) {
     PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
-    engine_.reset(
-        new TensorRTEngine(max_batch_size, workspace_size, false, nullptr, 0));
+    engine_.reset(new TensorRTEngine(max_batch_size, workspace_size));
     engine_->InitNetwork();
   }
......
@@ -51,7 +51,18 @@ void TensorRTEngine::FreezeNetwork() {
   // build engine.
   infer_builder_->setMaxBatchSize(max_batch_);
   infer_builder_->setMaxWorkspaceSize(max_workspace_);
-  if (enable_int8_) {
+  bool enable_fp16 = (precision_ == AnalysisConfig::Precision::kHalf);
+  if (enable_fp16) {
+    bool support_fp16 = infer_builder_->platformHasFastFp16();
+    infer_builder_->setFp16Mode(support_fp16);
+    if (!support_fp16) {
+      LOG(INFO) << "FP16 mode was requested, but the hardware does not "
+                   "support fast FP16; falling back to FP32.";
+    }
+  }
+  bool enable_int8 = (precision_ == AnalysisConfig::Precision::kInt8);
+  if (enable_int8) {
     infer_builder_->setInt8Mode(true);
     if (calibrator_) {
       infer_builder_->setInt8Calibrator(calibrator_);
......
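The FP16 branch above follows the stock TensorRT 5.x builder pattern: query platformHasFastFp16() first, then call setFp16Mode() with the result. As a standalone sketch against the raw NvInfer API (builder creation and network definition elided; `builder` is assumed to be a valid pointer obtained from createInferBuilder()):

    #include <NvInfer.h>

    // Sketch only: enable FP16 on a TensorRT builder only when the GPU
    // actually has fast FP16 units; passing false leaves the build in FP32.
    void SetBuilderFp16(nvinfer1::IBuilder* builder, bool want_fp16) {
      if (want_fp16) {
        const bool support_fp16 = builder->platformHasFastFp16();
        builder->setFp16Mode(support_fp16);
      }
    }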
@@ -22,6 +22,7 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/inference/api/paddle_analysis_config.h"
 #include "paddle/fluid/inference/engine.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
@@ -61,12 +62,14 @@ class TensorRTEngine {
     nvinfer1::Weights w_;
   };

-  TensorRTEngine(int max_batch, int max_workspace, bool enable_int8 = false,
-                 TRTInt8Calibrator* calibrator = nullptr, int device_id = 0,
-                 nvinfer1::ILogger& logger = NaiveLogger::Global())
+  TensorRTEngine(
+      int max_batch, int max_workspace,
+      AnalysisConfig::Precision precision = AnalysisConfig::Precision::kFloat32,
+      TRTInt8Calibrator* calibrator = nullptr, int device_id = 0,
+      nvinfer1::ILogger& logger = NaiveLogger::Global())
       : max_batch_(max_batch),
         max_workspace_(max_workspace),
-        enable_int8_(enable_int8),
+        precision_(precision),
         calibrator_(calibrator),
         device_id_(device_id),
         logger_(logger) {}
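Because the new precision parameter defaults to kFloat32, existing two-argument call sites (see the simplified test fixtures above) keep building FP32 engines, and FP16 must be requested explicitly. A construction sketch:

    // FP32 by default, matching the updated test call sites:
    TensorRTEngine fp32_engine(/*max_batch=*/1, /*max_workspace=*/1 << 20);
    // FP16 engine: pass the new enum value; the calibrator stays nullptr.
    TensorRTEngine fp16_engine(/*max_batch=*/1, /*max_workspace=*/1 << 20,
                               AnalysisConfig::Precision::kHalf);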
@@ -168,7 +171,7 @@ class TensorRTEngine {
   // the max memory size the engine uses
   int max_workspace_;
-  bool enable_int8_;
+  AnalysisConfig::Precision precision_;
   TRTInt8Calibrator* calibrator_;
   // batch size of the current data, will be updated on each execution.
   int batch_size_{-1};
@@ -231,12 +234,12 @@ class TRTEngineManager {
     return engines_.at(name).get();
   }

-  TensorRTEngine* Create(std::string name, int max_batch, int max_workspace,
-                         bool enable_int8 = false,
-                         TRTInt8Calibrator* calibrator = nullptr,
-                         int device_id = 0,
-                         nvinfer1::ILogger& logger = NaiveLogger::Global()) {
-    auto* p = new TensorRTEngine(max_batch, max_workspace, enable_int8,
+  TensorRTEngine* Create(
+      std::string name, int max_batch, int max_workspace,
+      AnalysisConfig::Precision precision = AnalysisConfig::Precision::kFloat32,
+      TRTInt8Calibrator* calibrator = nullptr, int device_id = 0,
+      nvinfer1::ILogger& logger = NaiveLogger::Global()) {
+    auto* p = new TensorRTEngine(max_batch, max_workspace, precision,
                                  calibrator, device_id, logger);
     engines_[name].reset(p);
     return p;
......
@@ -48,12 +48,14 @@ class TensorRTEngineOp : public framework::OperatorBase {
   int workspace_size_;
   std::unique_ptr<TRTInt8Calibrator> calibrator_;
   bool enable_int8_;
+  bool enable_fp16_;
   bool use_calib_mode_;
   std::string calibration_data_;
   std::string engine_key_;
   bool calibration_mode_;
   int predictor_id_;
   int device_id_;
+  AnalysisConfig::Precision precision_mode_;

  public:
   TensorRTEngineOp(const std::string &type,
@@ -66,6 +68,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
     workspace_size_ = Attr<int>("workspace_size");
     device_id_ = Attr<int>("gpu_id");
     enable_int8_ = Attr<bool>("enable_int8");
+    enable_fp16_ = Attr<bool>("enable_fp16");
     use_calib_mode_ = Attr<bool>("use_calib_mode");
     calibration_data_ = Attr<std::string>("calibration_data");
     engine_key_ = Attr<std::string>("engine_key");
@@ -93,6 +96,13 @@ class TensorRTEngineOp : public framework::OperatorBase {
           inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
               .Get(engine_key_ + std::to_string(predictor_id_));
     }
+    precision_mode_ = AnalysisConfig::Precision::kFloat32;
+    if (enable_int8_) {
+      precision_mode_ = AnalysisConfig::Precision::kInt8;
+    }
+    if (enable_fp16_) {
+      precision_mode_ = AnalysisConfig::Precision::kHalf;
+    }
   }

  protected:
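One implication of the attribute mapping above (illustration only, not part of the commit): the FP16 check runs after the INT8 check, so kHalf takes precedence if both boolean attributes were somehow set.

    // Hypothetical conflicting attributes, to show the precedence:
    bool enable_int8 = true, enable_fp16 = true;
    auto mode = AnalysisConfig::Precision::kFloat32;
    if (enable_int8) mode = AnalysisConfig::Precision::kInt8;
    if (enable_fp16) mode = AnalysisConfig::Precision::kHalf;
    // mode is now kHalf: FP16 wins over INT8.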
@@ -141,7 +151,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
           calib_buffers, runtime_batch, engine_key_, dev_place));
       calib_res->thr_.reset(new std::thread([&]() {
         calib_res->engine_.reset(new TensorRTEngine(
-            max_batch_size_, workspace_size_, enable_int8_,
+            max_batch_size_, workspace_size_, precision_mode_,
             calib_res->calib_.get(),
             boost::get<platform::CUDAPlace>(dev_place).device));
         VLOG(3) << "start the calib trt engine thread";
@@ -241,7 +251,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
       trt_engine_ =
           inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
               .Create(engine_key_ + std::to_string(predictor_id_),
-                      max_batch_size_, workspace_size_, enable_int8_,
+                      max_batch_size_, workspace_size_, precision_mode_,
                       calibrator_.get(), device_id_);
       PrepareTRTEngine(scope, trt_engine_);
     }
......
@@ -105,6 +105,7 @@ TEST(TensorRTEngineOp, manual) {
   engine_op_desc.SetAttr("predictor_id", 1);
   engine_op_desc.SetAttr("calibration_data", std::string(""));
   engine_op_desc.SetAttr("enable_int8", static_cast<bool>(false));
+  engine_op_desc.SetAttr("enable_fp16", static_cast<bool>(false));
   engine_op_desc.SetAttr("use_calib_mode", static_cast<bool>(false));
   engine_op_desc.SetAttr("output_name_mapping",
                          std::vector<std::string>({"z0"}));
@@ -205,6 +206,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
   engine_op_desc.SetAttr("predictor_id", 1);
   engine_op_desc.SetAttr("calibration_data", std::string(""));
   engine_op_desc.SetAttr("enable_int8", static_cast<bool>(false));
+  engine_op_desc.SetAttr("enable_fp16", static_cast<bool>(false));
   engine_op_desc.SetAttr("use_calib_mode", static_cast<bool>(false));
   engine_op_desc.SetAttr("output_name_mapping",
                          std::vector<std::string>({"z3"}));
......
@@ -199,6 +199,7 @@ void BindAnalysisConfig(py::module *m) {
   py::enum_<AnalysisConfig::Precision>(analysis_config, "Precision")
       .value("Float32", AnalysisConfig::Precision::kFloat32)
       .value("Int8", AnalysisConfig::Precision::kInt8)
+      .value("Half", AnalysisConfig::Precision::kHalf)
       .export_values();
   analysis_config.def(py::init<const AnalysisConfig &>())
......