From 312fe0ece16dc316904318b61f3dacaa8777eade Mon Sep 17 00:00:00 2001 From: nhzlx Date: Wed, 16 Jan 2019 08:27:59 +0000 Subject: [PATCH] add trt int8 calibration support fix comments test=develop --- paddle/fluid/inference/analysis/argument.h | 3 +- paddle/fluid/inference/analysis/helper.cc | 8 +++ paddle/fluid/inference/analysis/helper.h | 8 ++- .../inference/analysis/ir_pass_manager.cc | 10 ++-- .../ir_passes/tensorrt_subgraph_pass.cc | 37 +++++++++----- paddle/fluid/inference/api/analysis_config.cc | 12 ++--- .../fluid/inference/api/analysis_predictor.cc | 49 +++++++++++-------- .../fluid/inference/api/analysis_predictor.h | 13 +++++ .../inference/api/paddle_analysis_config.h | 8 ++- paddle/fluid/inference/tensorrt/engine.cc | 2 +- paddle/fluid/inference/tensorrt/engine.h | 6 +-- .../inference/tensorrt/trt_int8_calibrator.cc | 31 ++++++------ .../inference/tensorrt/trt_int8_calibrator.h | 28 +++++------ .../operators/tensorrt/tensorrt_engine_op.cc | 3 +- .../operators/tensorrt/tensorrt_engine_op.h | 38 +++++++------- 15 files changed, 158 insertions(+), 98 deletions(-) diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index b06ff63a741..c317172fa29 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -28,6 +28,7 @@ #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/platform/variant.h" namespace paddle { @@ -128,7 +129,7 @@ struct Argument { DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int); DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int); DECL_ARGUMENT_FIELD(tensorrt_precision_mode, TensorRtPrecisionMode, - std::string); + contrib::AnalysisConfig::Precision); // The program transformed by IR analysis phase. 
DECL_ARGUMENT_UNIQUE_FIELD(ir_analyzed_program, IrAnalyzedProgram, diff --git a/paddle/fluid/inference/analysis/helper.cc b/paddle/fluid/inference/analysis/helper.cc index ca40c01fc57..4f5c50d0d6b 100644 --- a/paddle/fluid/inference/analysis/helper.cc +++ b/paddle/fluid/inference/analysis/helper.cc @@ -36,6 +36,14 @@ void SetAttr(framework::proto::OpDesc *op, const std::string &name, attr->set_i(data); } template <> +void SetAttr(framework::proto::OpDesc *op, const std::string &name, + const bool &data) { + auto *attr = op->add_attrs(); + attr->set_name(name); + attr->set_type(paddle::framework::proto::AttrType::BOOLEAN); + attr->set_b(data); +} +template <> void SetAttr(framework::proto::OpDesc *op, const std::string &name, const int64_t &data) { auto *attr = op->add_attrs(); diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index 5df3aacc3f2..40c94d9904e 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -156,7 +156,7 @@ static bool PathExists(const std::string &path) { return false; } -static std::string SplitPath(const std::string path) { +static std::string GetDirRoot(const std::string path) { char sep = '/'; #ifdef _WIN32 @@ -167,10 +167,14 @@ static std::string SplitPath(const std::string path) { if (i != std::string::npos) { return (path.substr(0, i)); } - return path; } +static std::string GetTrtCalibPath(const std::string &model_root, + const std::string &engine_key) { + return model_root + "/trt_calib_" + engine_key; +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index a9960557743..f9ef0a68e9f 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -71,13 +71,17 @@ void IRPassManager::CreatePasses(Argument *argument, "program", new framework::ProgramDesc *( const_cast<framework::ProgramDesc *>(&argument->main_program()))); - pass->Set("precision_mode", - new std::string(argument->tensorrt_precision_mode())); + + bool enable_int8 = false; + if (argument->tensorrt_precision_mode() == + contrib::AnalysisConfig::Precision::kInt8) + enable_int8 = true; + + pass->Set("enable_int8", new bool(enable_int8)); pass->Set("model_dir", new std::string(argument->model_path())); } // graph_ = pass->Apply(std::move(graph_)); - pre_pass = pass_name; passes_.emplace_back(std::move(pass)); diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 634c5ead0a3..34991b6fbca 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -13,6 +13,7 @@ // limitations under the License.
#include +#include <set> #include #include @@ -93,8 +94,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, } // collect inputs - std::unordered_set<std::string> input_names; - std::unordered_set<std::string> input_names_with_id; + std::set<std::string> input_names; + std::set<std::string> input_names_with_id; for (auto *x : node->inputs) { input_names.insert(x->Name()); input_names_with_id.insert(x->Name() + std::to_string(x->id())); } op_desc->SetInput( "Xs", std::vector<std::string>(input_names.begin(), input_names.end())); - std::unordered_set<std::string> output_names; - std::unordered_set<std::string> output_names_with_id; + std::set<std::string> output_names; + std::set<std::string> output_names_with_id; for (auto *x : node->outputs) { output_names.insert(x->Name()); output_names_with_id.insert(x->Name() + std::to_string(x->id())); } @@ -203,28 +204,40 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, "the block has no var-desc"); PADDLE_ENFORCE(!output_mapping.empty()); op_desc->SetBlockAttr("sub_block", new_block); - // Set attrs SetAttr(op_desc->Proto(), "subgraph", block_desc.Proto()->SerializeAsString()); + // Set attrs SetAttr(op_desc->Proto(), "max_batch_size", Get<int>("max_batch_size")); SetAttr(op_desc->Proto(), "workspace_size", Get<int>("workspace_size")); SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes())); SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping); - std::string engine_key = std::to_string( - std::hash<std::string>()(block_desc.Proto()->SerializeAsString())); - std::string precision_mode = Get<std::string>("precision_mode"); + auto enable_int8 = Get<bool>("enable_int8"); SetAttr(op_desc->Proto(), "calibration_data", std::string("")); - std::string trt_calib_file = - Get<std::string>("model_dir") + "/trt_calib_" + engine_key; - if (precision_mode == "INT8" && FileExists(trt_calib_file)) { + + // We use the subgraph's inputs and outputs to generate the engine key.
+ std::string engine_hash_key = ""; + for (auto name : input_names_with_id) { + engine_hash_key += name; + } + for (auto name : output_names_with_id) { + engine_hash_key += name; + } + + auto engine_key = std::to_string(std::hash<std::string>()(engine_hash_key)); + + auto trt_calib_file = + GetTrtCalibPath(Get<std::string>("model_dir"), engine_key); + VLOG(3) << "engine key: " << engine_key; + if (enable_int8 && FileExists(trt_calib_file)) { + VLOG(3) << "Calibration table file " << trt_calib_file << " is found"; std::ifstream infile(trt_calib_file, std::ios::in); std::stringstream buffer; buffer << infile.rdbuf(); std::string calibration_data(buffer.str()); SetAttr(op_desc->Proto(), "calibration_data", calibration_data); } - SetAttr(op_desc->Proto(), "precision_mode", precision_mode); + SetAttr(op_desc->Proto(), "enable_int8", enable_int8); SetAttr(op_desc->Proto(), "engine_key", engine_key); } diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 399db291fd1..7c7efe7a3d5 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -122,13 +122,13 @@ void contrib::AnalysisConfig::EnableMKLDNN() { #endif } -void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, - int max_batch_size, - int min_subgraph_size, - std::string precision_mode) { +void contrib::AnalysisConfig::EnableTensorRtEngine( + int workspace_size, int max_batch_size, int min_subgraph_size, + contrib::AnalysisConfig::Precision precision_mode) { use_tensorrt_ = true; tensorrt_workspace_size_ = workspace_size; tensorrt_max_batchsize_ = max_batch_size; + tensorrt_min_subgraph_size_ = min_subgraph_size; tensorrt_precision_mode_ = precision_mode; Update(); } @@ -149,7 +149,7 @@ void contrib::AnalysisConfig::Update() { << "TensorRT engine is not available when EnableGpu() is not activated."; } else { // Append after the infer_clean pass.
- pass_builder()->InsertPass(1, "tensorrt_subgraph_pass"); + pass_builder()->InsertPass(3, "tensorrt_subgraph_pass"); } } @@ -180,7 +180,7 @@ std::string contrib::AnalysisConfig::SerializeInfoCache() { ss << use_tensorrt_; ss << tensorrt_workspace_size_; ss << tensorrt_max_batchsize_; - ss << tensorrt_precision_mode_; + ss << tensorrt_min_subgraph_size_; ss << use_mkldnn_; ss << enable_ir_optim_; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 75c62bb98cb..838016bd766 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -30,9 +30,9 @@ #include "paddle/fluid/inference/api/paddle_inference_pass.h" #if PADDLE_WITH_TENSORRT #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" #endif #include "paddle/fluid/inference/analysis/helper.h" -#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" #include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/cpu_helper.h" @@ -46,8 +46,8 @@ namespace paddle { using contrib::AnalysisConfig; using inference::Singleton; using inference::tensorrt::TRTInt8Calibrator; -using inference::tensorrt::TRTCalibratorRes; -using inference::tensorrt::TRTCalibratorResManager; +using inference::tensorrt::TRTCalibratorEngine; +using inference::tensorrt::TRTCalibratorEngineManager; namespace { bool IsPersistable(const framework::VarDesc *var) { @@ -334,7 +334,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { !config_.params_file().empty(), "Either model_dir or (param_file, prog_file) should be set."); PADDLE_ENFORCE(!config_.prog_file().empty()); - std::string dir = inference::analysis::SplitPath(config_.prog_file()); + std::string dir = inference::analysis::GetDirRoot(config_.prog_file()); argument_.SetModelPath(dir); argument_.SetModelProgramPath(config_.prog_file()); @@ -562,6 +562,7 @@ bool AnalysisPredictor::LoadParameters() { return true; } +#if PADDLE_WITH_TENSORRT bool AnalysisPredictor::SaveTrtCalibToDisk() { PADDLE_ENFORCE(config_.tensorrt_engine_enabled(), "This func can be invoked only in trt mode"); @@ -570,44 +571,50 @@ if (op_desc->Type() == "tensorrt_engine") { std::string engine_name = boost::get<std::string>(op_desc->GetAttr("engine_key")); - if (!Singleton<TRTCalibratorResManager>::Global().Has(engine_name)) { + if (!Singleton<TRTCalibratorEngineManager>::Global().Has(engine_name)) { LOG(ERROR) << "You should run the predictor (with TRT) on real data " "to generate calibration info"; return false; } - TRTCalibratorRes *calib_res = - Singleton<TRTCalibratorResManager>::Global().Get(engine_name); + TRTCalibratorEngine *calib_engine = + Singleton<TRTCalibratorEngineManager>::Global().Get(engine_name); LOG(INFO) << "Wait for calib threads done."; - calib_res->calib_->waitAndSetDone(); + calib_engine->calib_->waitAndSetDone(); LOG(INFO) << "Finish wait."; - calib_res->thr_->join(); - std::string calibration_data = - calib_res->calib_->getCalibrationTableAsString(); + calib_engine->thr_->join(); + std::string calibration_table_data = + calib_engine->calib_->getCalibrationTableAsString(); - if (calibration_data.size() == 0) { + if (calibration_table_data.empty()) { LOG(ERROR) << "the calibration table is empty."; return false; } - std::string calibration_data_path = - argument_.model_path() + "/trt_calib_" + engine_name; - std::ofstream ofile(calibration_data_path, std::ios::out); - LOG(INFO) << "Write Paddle-TRT INT8 calibration data to
file " - << calibration_data_path; - ofile << calibration_data; + + std::string calibration_table_data_path = + inference::analysis::GetTrtCalibPath(argument_.model_path(), + engine_name); + + std::ofstream ofile(calibration_table_data_path, std::ios::out); + LOG(INFO) << "Write Paddle-TRT INT8 calibration table data to file " + << calibration_table_data_path; + ofile << calibration_table_data; ofile.close(); } } // Free all calibrator resources. - Singleton::Global().DeleteALL(); + Singleton::Global().DeleteALL(); return true; } +#endif AnalysisPredictor::~AnalysisPredictor() { +#if PADDLE_WITH_TENSORRT if (config_.tensorrt_engine_enabled() && - config_.tensorrt_precision_mode_ == "INT8" && - Singleton::Global().Has()) { + config_.tensorrt_precision_mode_ == AnalysisConfig::Precision::kInt8 && + Singleton::Global().Has()) { SaveTrtCalibToDisk(); } +#endif if (FLAGS_profile) { platform::DisableProfiler(platform::EventSortingKey::kTotal, "./profile.log"); diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index cec36a0d3a9..c87987b1671 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -91,7 +91,20 @@ class AnalysisPredictor : public PaddlePredictor { void GetFetchOne(const framework::LoDTensor &fetchs, PaddleTensor *output_data); +#if PADDLE_WITH_TENSORRT + // When we use Paddle-TRT INT8 engine, we need to generate calibration table + // data first, + // the calibration table contains the range for each op's input and output, + // this whole process can be divided into several steps: + // + // 1. Builds a 32-bit engine, runs it on the calibration set, and records a + // histogram for each + // tensor of the distribution of activation values. + // 2. Builds a calibration table from the histograms. + // + // After step 2, we need to store the calibration table on disk bool SaveTrtCalibToDisk(); +#endif ~AnalysisPredictor(); diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 14b16d08b34..118af6f4013 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -42,6 +42,10 @@ struct AnalysisConfig { explicit AnalysisConfig(const std::string& model_dir); explicit AnalysisConfig(const std::string& prog_file, const std::string& params_file); + enum class Precision { + kFloat32 = 0, + kInt8, + }; /** Set model with a directory. */ @@ -136,7 +140,7 @@ struct AnalysisConfig { */ void EnableTensorRtEngine(int workspace_size = 1 << 20, int max_batch_size = 1, int min_subgraph_size = 3, - std::string precision = "FP32"); + Precision precision = Precision::kFloat32); /** A boolean state telling whether the TensorRT engine is used. */ bool tensorrt_engine_enabled() const { return use_tensorrt_; } @@ -232,7 +236,7 @@ struct AnalysisConfig { // We set this variable to control the minimum number of nodes in the // subgraph, 3 as default value. int tensorrt_min_subgraph_size_{3}; - std::string tensorrt_precision_mode_; + Precision tensorrt_precision_mode_; bool use_mkldnn_{false}; std::unordered_set mkldnn_enabled_op_types_; diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 43f99df4637..808e93d2edc 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -70,7 +70,7 @@ void TensorRTEngine::FreezeNetwork() { // build engine. 
infer_builder_->setMaxBatchSize(max_batch_); infer_builder_->setMaxWorkspaceSize(max_workspace_); - if (precision_mode_ == "INT8") { + if (enable_int8_) { infer_builder_->setInt8Mode(true); PADDLE_ENFORCE( calibrator_ != nullptr, diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 9aed374dce4..788a4493c00 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -58,14 +58,14 @@ class TensorRTEngine : public EngineBase { TensorRTEngine(int max_batch, int max_workspace, cudaStream_t* stream = nullptr, int device = 0, - std::string precision_mode = "FP32", + bool enable_int8 = false, TRTInt8Calibrator* calibrator = nullptr, nvinfer1::ILogger& logger = NaiveLogger::Global()) : max_batch_(max_batch), max_workspace_(max_workspace), stream_(stream ? stream : &default_stream_), device_(device), - precision_mode_(precision_mode), + enable_int8_(enable_int8), calibrator_(calibrator), logger_(logger) { freshDeviceId(); @@ -168,7 +168,7 @@ class TensorRTEngine : public EngineBase { // The specific GPU id that the TensorRTEngine is bound to. int device_; - std::string precision_mode_; + bool enable_int8_; TRTInt8Calibrator* calibrator_; // batch size of the current data, will be updated on each execution. int batch_size_{-1}; diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc index f9356200204..4a85c8b8fe6 100644 --- a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc +++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc @@ -25,11 +25,7 @@ int TRTInt8Calibrator::getBatchSize() const { return batch_size_; } TRTInt8Calibrator::TRTInt8Calibrator( const std::unordered_map<std::string, size_t>& buffers, int batch_size, std::string engine_name, const platform::Place place) - : batch_size_(batch_size), - calib_running_(true), - data_is_set_(false), - done_(false), - engine_name_(engine_name) { + : batch_size_(batch_size), engine_name_(engine_name) { int i = 0; VLOG(4) << "Init a new calibrator: " << engine_name_; for (const auto it : buffers) { @@ -62,28 +58,32 @@ void TRTInt8Calibrator::waitAndSetDone() { } } +// There might be more than one input for the trt subgraph, +// so we use a map to store the input information. bool TRTInt8Calibrator::setBatch( const std::unordered_map<std::string, void*>& data) { VLOG(3) << "set batch: " << engine_name_; std::unique_lock<std::mutex> lk(mut_); + // There is a producer and a consumer. The producer sets the batch data and + // the consumer gets it. The size of the data pool is one, + // so the producer has to wait for the consumer to finish processing + // before it can set new data. while ((calib_running_ || data_is_set_) && (!done_)) cond_.wait(lk); + // done_ is set to true by waitAndSetDone() when all calibration data + // has been processed. if (done_) return false; // Sets the batch.
- for (const auto it : data) { + for (const auto& it : data) { auto dataptr = data_buffers_.find(it.first); if (dataptr == data_buffers_.end()) { LOG(FATAL) << "FATAL " << engine_name_ << " input name '" << it.first << "' does not match with the buffer names"; } const auto& d = dataptr->second; - auto status = - cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice); - if (status != cudaSuccess) { - LOG(FATAL) << "cudaMemcpy " << engine_name_ << " for '" << it.first - << "' failed with " << status; - } + PADDLE_ENFORCE_EQ( + cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice), + cudaSuccess, "Fail to cudaMemcpy %s for %s", engine_name_, it.first); } data_is_set_ = true; @@ -95,9 +95,12 @@ bool TRTInt8Calibrator::getBatch(void** bindings, const char** names, int num_bindings) { VLOG(4) << "get batch: " << engine_name_; std::unique_lock<std::mutex> lk(mut_); + // The consumer has just finished processing a batch of data, + // so the producer can set the next batch. calib_running_ = false; cond_.notify_all(); + // As long as there is data in the pool, the consumer can get it. while (!data_is_set_ && !done_) cond_.wait(lk); if (done_) return false; @@ -123,7 +126,7 @@ void TRTInt8Calibrator::setDone() { cond_.notify_all(); } -const void* TRTInt8Calibrator::readCalibrationCache(std::size_t& length) { +const void* TRTInt8Calibrator::readCalibrationCache(size_t& length) { if (calibration_table_.empty()) return nullptr; length = calibration_table_.size(); return calibration_table_.data(); diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h index 13f6e7ad011..919f5d55f88 100644 --- a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h +++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h @@ -21,8 +21,8 @@ #include #include -#include "NvInfer.h" -#include "cuda_runtime_api.h" +#include <NvInfer.h> +#include <cuda_runtime_api.h> #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/platform/place.h" @@ -60,9 +60,9 @@ struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator { private: const int batch_size_; - bool calib_running_; - bool data_is_set_; - bool done_; + bool calib_running_{true}; + bool data_is_set_{false}; + bool done_{false}; std::mutex mut_; std::condition_variable cond_; @@ -74,9 +74,9 @@ struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator { std::string calibration_table_; }; -class TRTCalibratorRes { +class TRTCalibratorEngine { public: - TRTCalibratorRes() {} + TRTCalibratorEngine() {} std::unique_ptr<TRTInt8Calibrator> calib_; std::unique_ptr<std::thread> thr_; std::unique_ptr<TensorRTEngine> engine_; @@ -84,7 +84,7 @@ /* * Manager to control the TensorRT Int8 calibration creation and deletion. */ -class TRTCalibratorResManager { +class TRTCalibratorEngineManager { public: bool Has() const { return res_.size() > 0; } bool Has(const std::string& name) const { return res_.find(name) != res_.end(); } // Get Int8Calibrator via name - TRTCalibratorRes* Get(const std::string& name) const { + TRTCalibratorEngine* Get(const std::string& name) const { return res_.at(name).get(); } // Look up or create a calibrator.
- TRTCalibratorRes* LookupOrCreate(const std::string& engine_name) { + TRTCalibratorEngine* LookupOrCreate(const std::string& engine_name) { if (res_.count(engine_name) == 0) { - auto* p = new TRTCalibratorRes(); + auto* p = new TRTCalibratorEngine; res_[engine_name].reset(p); } return res_.at(engine_name).get(); } // Create an Int8Calibrator - TRTCalibratorRes* Create(const std::string& engine_name) { - auto* p = new TRTCalibratorRes(); + TRTCalibratorEngine* Create(const std::string& engine_name) { + auto* p = new TRTCalibratorEngine; res_[engine_name].reset(p); return p; } @@ -120,7 +120,7 @@ } private: - std::unordered_map<std::string, std::unique_ptr<TRTCalibratorRes>> res_; + std::unordered_map<std::string, std::unique_ptr<TRTCalibratorEngine>> res_; }; } // namespace tensorrt diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc index ed177eb18f8..031335009b6 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc @@ -36,8 +36,7 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr<int>("max_batch_size", "the maximum batch size."); AddAttr<int>("workspace_size", "the workspace size."); AddAttr<framework::BlockDesc *>("sub_block", "the trt block"); - AddAttr<std::string>("precision_mode", - "the precision mode: 'FP32', 'INT8' "); + AddAttr<bool>("enable_int8", "whether to switch to int8 mode"); AddComment("TensorRT engine operator."); } }; diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 57747faec85..d27e013dc47 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -65,8 +65,8 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) { using inference::Singleton; using inference::tensorrt::TensorRTEngine; using inference::tensorrt::TRTInt8Calibrator; -using inference::tensorrt::TRTCalibratorRes; -using inference::tensorrt::TRTCalibratorResManager; +using inference::tensorrt::TRTCalibratorEngine; +using inference::tensorrt::TRTCalibratorEngineManager; class TensorRTEngineOp : public framework::OperatorBase { private: int max_batch_size_; int workspace_size_; std::unique_ptr<TRTInt8Calibrator> calibrator_; - std::string precision_mode_; + bool enable_int8_; std::string calibration_data_; std::string engine_key_; bool calibration_mode_; @@ -90,7 +90,7 @@ input_names_ = Inputs("Xs"); max_batch_size_ = Attr<int>("max_batch_size"); workspace_size_ = Attr<int>("workspace_size"); - precision_mode_ = Attr<std::string>("precision_mode"); + enable_int8_ = Attr<bool>("enable_int8"); calibration_data_ = Attr<std::string>("calibration_data"); engine_key_ = Attr<std::string>("engine_key"); for (const auto &param : params) { param_names_.insert(param); } + // calibration_mode_ == true means we need to + // generate the calibration table data.
+ calibration_mode_ = (enable_int8_ && calibration_data_.size() == 0); - if (precision_mode_ == "INT8" && calibration_data_.size()) { + VLOG(4) << "calibration_mode: " << calibration_mode_; + if (enable_int8_ && calibration_data_.size()) { calibrator_.reset(new TRTInt8Calibrator(calibration_data_)); } } protected: - void RunNative(const framework::Scope &scope, - const platform::Place &dev_place) const { + void RunNativeImpl(const framework::Scope &scope, + const platform::Place &dev_place) const { framework::Executor executor(dev_place); auto *block = Attr<framework::BlockDesc *>("sub_block"); auto *program = block->Program(); @@ -128,12 +130,14 @@ class TensorRTEngineOp : public framework::OperatorBase { void RunCalibration(const framework::Scope &scope, const platform::Place &dev_place) const { - // Create calibrator here. + // This process builds a 32-bit trt engine, runs it on the calibration + // set, and records a histogram of activation values for each tensor. LOG(INFO) << "Running calibration trt int8 ..."; int runtime_batch = 1; - if (!Singleton<TRTCalibratorResManager>::Global().Has(engine_key_)) { - TRTCalibratorRes *calib_res = - Singleton<TRTCalibratorResManager>::Global().Create(engine_key_); + if (!Singleton<TRTCalibratorEngineManager>::Global().Has(engine_key_)) { + TRTCalibratorEngine *calib_res = + Singleton<TRTCalibratorEngineManager>::Global().Create(engine_key_); std::unordered_map<std::string, size_t> calib_buffers; for (auto &x : input_names_) { if (param_names_.count(x)) continue; @@ -148,7 +152,7 @@ class TensorRTEngineOp : public framework::OperatorBase { calib_res->thr_.reset(new std::thread([&]() { calib_res->engine_.reset(new TensorRTEngine( max_batch_size_, workspace_size_, nullptr, - boost::get<platform::CUDAPlace>(dev_place).device, precision_mode_, + boost::get<platform::CUDAPlace>(dev_place).device, enable_int8_, calib_res->calib_.get())); VLOG(3) << "start the calib trt engine thread"; Prepare(scope, dev_place, calib_res->engine_.get()); })); } TRTInt8Calibrator *temp_calibrator = - Singleton<TRTCalibratorResManager>::Global() + Singleton<TRTCalibratorEngineManager>::Global() .Get(engine_key_) ->calib_.get(); std::unordered_map<std::string, void *> calib_data; @@ -168,7 +172,7 @@ class TensorRTEngineOp : public framework::OperatorBase { calib_data.emplace(x, t.data()); } temp_calibrator->setBatch(calib_data); - RunNative(scope, dev_place); + RunNativeImpl(scope, dev_place); } void RunTrt(const framework::Scope &scope, @@ -178,7 +182,7 @@ class TensorRTEngineOp : public framework::OperatorBase { trt_engine_.reset( new TensorRTEngine(max_batch_size_, workspace_size_, nullptr, boost::get<platform::CUDAPlace>(dev_place).device, - precision_mode_, calibrator_.get())); + enable_int8_, calibrator_.get())); Prepare(scope, dev_place, trt_engine_.get()); } -- GitLab
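Usage note (not part of the patch itself): the workflow above is driven entirely from the AnalysisConfig API. Below is a minimal sketch of how a user would generate the INT8 calibration table with this patch applied. It assumes the inference API of the same era; `model_dir` and `warmup_batches` are hypothetical placeholders, not names from the patch.

#include <memory>
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

void GenerateInt8CalibTable(
    const std::vector<std::vector<paddle::PaddleTensor>> &warmup_batches) {
  // Enable TensorRT with Precision::kInt8. Since no trt_calib_<engine_key>
  // file exists yet, the tensorrt_engine op runs with calibration_mode_ on.
  paddle::contrib::AnalysisConfig config("model_dir");  // hypothetical path
  config.EnableUseGpu(100 /*initial pool, MB*/, 0 /*GPU id*/);
  config.EnableTensorRtEngine(
      1 << 20 /*workspace_size*/, 1 /*max_batch_size*/,
      3 /*min_subgraph_size*/,
      paddle::contrib::AnalysisConfig::Precision::kInt8);

  auto predictor =
      paddle::CreatePaddlePredictor<paddle::contrib::AnalysisConfig>(config);

  // Each Run() hands one batch to TRTInt8Calibrator::setBatch(), while the
  // background thread started in RunCalibration() drives an FP32 engine that
  // collects activation histograms (the producer/consumer handoff above).
  for (const auto &batch : warmup_batches) {
    std::vector<paddle::PaddleTensor> outputs;
    predictor->Run(batch, &outputs);
  }

  // Destroying the predictor triggers SaveTrtCalibToDisk(): it waits for the
  // calibration thread, then writes trt_calib_<engine_key> into the model
  // directory. The next predictor created with kInt8 finds that file via
  // GetTrtCalibPath() and builds a real INT8 engine from it.
  predictor.reset();
}

The calibration set should be small but representative of real inputs; the quality of the recorded activation ranges, and hence of the INT8 engine, depends directly on it.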