diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index 2f31b182af7293488719e41a92b2ea78709bda02..c8c25086db1d58089604555b6c46dac0e6a1251e 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -99,6 +99,10 @@ struct Argument {
  private:                            \
   unique_ptr_t field__##_;
 
+  // Each predictor has a unique id.
+  // For now, this attr helps us get the right trt_engine for each
+  // trt_engine_op for each predictor when using trt.
+  DECL_ARGUMENT_FIELD(predictor_id, PredictorID, int);
   // Model path
   DECL_ARGUMENT_FIELD(model_dir, ModelDir, std::string);
   // Model specified with program and parameters files.
diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h
index 59107f28080dceb0a58e17d42281db5f3773de56..9fa85f3762361e2916af6faa820241f5fe7e772f 100644
--- a/paddle/fluid/inference/analysis/helper.h
+++ b/paddle/fluid/inference/analysis/helper.h
@@ -217,6 +217,35 @@ static std::string GetTrtCalibTableData(const std::string &model_opt_cache_dir,
   return "";
 }
 
+static std::string GetTrtEngineSerializedPath(const std::string &model_root,
+                                              const std::string &engine_key) {
+  return model_root + "/trt_serialized_" + engine_key;
+}
+
+static std::string GetTrtEngineSerializedData(
+    const std::string &model_opt_cache_dir, const std::string &engine_key) {
+  std::string trt_serialized_path =
+      GetTrtEngineSerializedPath(model_opt_cache_dir, engine_key);
+  if (FileExists(trt_serialized_path)) {
+    VLOG(3) << "Trt serialized file: " << trt_serialized_path
+            << " is found here";
+    std::ifstream infile(trt_serialized_path, std::ios::in);
+    std::stringstream buffer;
+    buffer << infile.rdbuf();
+    std::string trt_engine_serialized_data(buffer.str());
+    return trt_engine_serialized_data;
+  }
+  return "";
+}
+
+static void SaveTrtEngineSerializedDataToFile(
+    const std::string &trt_serialized_path,
+    const std::string &engine_serialized_data) {
+  std::ofstream outfile(trt_serialized_path);
+  outfile << engine_serialized_data;
+  outfile.close();
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index 7476c199cfd073ec0962fa9a48f24750a6484bb5..6fe779524feb2870ff398ca17c4f69427e4a61af 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -81,6 +81,7 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set(
           "model_opt_cache_dir",
           new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir)));
+      pass->Set("predictor_id", new int(argument->predictor_id()));
     }
 
     pre_pass = pass_name;
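Taken together, the three helpers above implement a simple file cache for serialized engines, keyed by engine_key under the model_opt_cache_dir. A minimal sketch of the intended round trip, assuming only the declarations above; the directory, key, and payload are hypothetical values for illustration:

    #include <string>
    #include "paddle/fluid/inference/analysis/helper.h"

    void TrtCacheRoundTrip() {
      namespace analysis = paddle::inference::analysis;
      const std::string cache_dir = "/tmp/model_opt_cache";  // hypothetical
      const std::string engine_key = "1234567890";           // hypothetical
      // First run: persist the serialized engine next to the optimized model.
      analysis::SaveTrtEngineSerializedDataToFile(
          analysis::GetTrtEngineSerializedPath(cache_dir, engine_key),
          "serialized-engine-bytes");
      // Later runs: an empty string signals a cache miss.
      std::string cached =
          analysis::GetTrtEngineSerializedData(cache_dir, engine_key);
    }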
#include "paddle/fluid/string/pretty_log.h" @@ -83,7 +85,8 @@ std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( } std::string GenerateEngineKey(const std::set &engine_inputs, - const std::set &engine_outputs) { + const std::set &engine_outputs, + const std::string &predictor_id) { std::string engine_hash_key = ""; for (auto name : engine_inputs) { engine_hash_key += name; @@ -91,6 +94,7 @@ std::string GenerateEngineKey(const std::set &engine_inputs, for (auto name : engine_outputs) { engine_hash_key += name; } + engine_hash_key += predictor_id; auto engine_key = std::to_string(std::hash()(engine_hash_key)); return engine_key; } @@ -205,8 +209,9 @@ void TensorRtSubgraphPass::CreateTensorRTOp( SetAttr(op_desc->Proto(), "parameters", params); auto enable_int8 = Get("enable_int8"); - auto engine_key = - GenerateEngineKey(input_names_with_id, output_names_with_id); + int predictor_id = Get("predictor_id"); + auto engine_key = GenerateEngineKey(input_names_with_id, output_names_with_id, + std::to_string(predictor_id)); // Get "" when there is no cached calibration table data. std::string calibration_data = GetTrtCalibTableData( @@ -215,10 +220,53 @@ void TensorRtSubgraphPass::CreateTensorRTOp( SetAttr(op_desc->Proto(), "enable_int8", enable_int8); SetAttr(op_desc->Proto(), "engine_key", engine_key); + SetAttr(op_desc->Proto(), "engine_serialized_data", std::string("")); + SetAttr(op_desc->Proto(), "engine_serialized_data_path", + GetTrtEngineSerializedPath(Get("model_opt_cache_dir"), + engine_key)); + + std::unique_ptr calibrator; + if (enable_int8 && calibration_data.size() != 0) { + calibrator.reset(new tensorrt::TRTInt8Calibrator(calibration_data)); + } - if (!(enable_int8 && calibration_data.size() == 0)) { + // When in int8 mode and calibration_mode, the program just produce the + // calibration table data. + bool calibration_mode = (enable_int8 && calibration_data.size() == 0); + if (!calibration_mode) { std::copy(params.begin(), params.end(), std::back_inserter(*repetitive_params)); + std::string trt_engine_serialized_data = GetTrtEngineSerializedData( + Get("model_opt_cache_dir"), engine_key); + + tensorrt::TensorRTEngine *trt_engine = + inference::Singleton::Global().Create( + Get("max_batch_size"), Get("workspace_size"), enable_int8, + calibrator.get(), engine_key); + if (trt_engine_serialized_data.size() == 0) { + LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP " + "kernel etc). 
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index da2e9803f0467f2b83d79cdd06d4317d41630b04..7149f16b360465bee03a77b6542d0433953e484a 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -342,6 +342,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
       config_.static_memory_optim_force_update_);
   argument_.SetModelFromMemory(config_.model_from_memory_);
   // Analyze inference_program
+  argument_.SetPredictorID(predictor_id_);
   if (!config_.model_dir().empty()) {
     argument_.SetModelDir(config_.model_dir());
   } else {
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index 014df4ee8b6d86232212736c43a9aff32ffee011..732ea8061b6f0ca39e848d33f91dbbe4c203cc8f 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -21,6 +21,7 @@
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/inference/api/details/reset_tensor_array.h"
+#include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/string/printf.h"
 #ifdef PADDLE_WITH_TESTING
@@ -43,7 +44,9 @@ using framework::NaiveExecutor;
  */
 class AnalysisPredictor : public PaddlePredictor {
  public:
-  explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {}
+  explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {
+    predictor_id_ = inference::GetUniqueId();
+  }
   ~AnalysisPredictor();
 
   bool Init(const std::shared_ptr<framework::Scope> &parent_scope,
@@ -143,6 +146,7 @@ class AnalysisPredictor : public PaddlePredictor {
   const size_t max_shape_collect_count_{1000};
   int need_collect_var_shapes_{-1};  // -1 for default, 0 for false, 1 for true.
   std::vector<std::vector<std::pair<std::string, std::vector<int>>>> batch_var_shapes_;
+  int predictor_id_;
 
  private:
   // Some status here that help to determine the status inside the predictor.
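The id comes from inference::GetUniqueId(), added in api/helper.h below. That counter is a plain static int, which is only safe when predictors are constructed from a single thread; a possible thread-safe variant (an alternative sketch, not what this patch does) would be:

    #include <atomic>

    // Alternative id generator using an atomic counter, in case predictors
    // can be created concurrently. The patch itself uses a plain static int.
    static int GetUniqueId() {
      static std::atomic<int> id{0};
      return id++;
    }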
diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h
index b92781e4f2c612cbb39fcaa7c80b6051a67215fd..ec3bef42fd91cea04a656dd38a4e5c45c1a76476 100644
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -50,6 +50,11 @@ class Timer {
   }
 };
 
+static int GetUniqueId() {
+  static int id = 0;
+  return id++;
+}
+
 static void split(const std::string &str, char sep,
                   std::vector<std::string> *pieces) {
   pieces->clear();
diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
index ab50758c824e26b96b8813c9c4df99f28e0dd25b..8484daaa128189f70a9171f9b8bcb4fdddf54abf 100644
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -143,6 +143,7 @@ class OpConverter {
     }
   }
 
+  // The scope here should be inited with the parameter vars.
   void ConvertBlockToTRTEngine(
       framework::BlockDesc* block_desc, const framework::Scope& scope,
       const std::vector<std::string>& inputs,
@@ -151,18 +152,16 @@ class OpConverter {
     engine->InitNetwork();
     for (auto& input : inputs) {
       if (parameters.count(input)) continue;
-      auto& t =
-          inference::analysis::GetFromScope<framework::LoDTensor>(scope, input);
-      auto t_shape = framework::vectorize(t.dims());
-
       auto* var = block_desc->FindVar(input);
       PADDLE_ENFORCE(var, "no variable called %s", input);
       PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
                         "TensorRT engine only takes LoDTensor as input");
+      auto var_shape = var->GetShape();
+
       engine->DeclareInput(
           input, FluidDataType2TRT(
                      var->Proto()->type().lod_tensor().tensor().data_type()),
-          Vec2TRT_Dims(t_shape));
+          Vec2TRT_Dims(var_shape));
     }
     framework::proto::BlockDesc* block_proto = block_desc->Proto();
     ConvertBlock(*block_proto, parameters, scope, engine);
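The ConvertBlockToTRTEngine change above is what allows building the engine at analysis time: input shapes are read from the compile-time VarDesc in the BlockDesc instead of LoDTensors that would only exist in the scope after data is fed. A small sketch of that lookup, assuming the framework::BlockDesc API; the variable name is hypothetical:

    #include <cstdint>
    #include <vector>
    #include "paddle/fluid/framework/block_desc.h"

    // Read the declared shape of an input var straight from the block proto,
    // with no tensor in any scope required. "x" is a hypothetical name.
    std::vector<int64_t> DeclaredShape(paddle::framework::BlockDesc *block) {
      auto *var = block->FindVar("x");
      return var != nullptr ? var->GetShape() : std::vector<int64_t>{};
    }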
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index e1005e9b0335a51bb6a69db8efff70ab2e60576d..cc378f4abdb970099105f7d132ace8df89b68419 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -104,6 +104,34 @@ class TensorRTEngine {
   nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); }
   nvinfer1::INetworkDefinition* network() { return infer_network_.get(); }
+
+  nvinfer1::IHostMemory* Serialize() {
+    PADDLE_ENFORCE(infer_engine_ != nullptr,
+                   "You should build the engine first and then serialize it.");
+    ihost_memory_.reset(infer_engine_->serialize());
+    return ihost_memory_.get();
+  }
+
+  void Deserialize(const std::string& engine_serialized_data) {
+    infer_ptr<nvinfer1::IRuntime> runtime(createInferRuntime(&logger_));
+    infer_engine_.reset(
+        runtime->deserializeCudaEngine(engine_serialized_data.c_str(),
+                                       engine_serialized_data.size(), nullptr));
+    PADDLE_ENFORCE(infer_engine_ != nullptr,
+                   "building cuda engine failed when deserializing engine info");
+    infer_context_.reset(infer_engine_->createExecutionContext());
+  }
+
+  void Deserialize(const nvinfer1::IHostMemory* engine_serialized_data) {
+    infer_ptr<nvinfer1::IRuntime> runtime(createInferRuntime(&logger_));
+    infer_engine_.reset(runtime->deserializeCudaEngine(
+        engine_serialized_data->data(), engine_serialized_data->size(),
+        nullptr));
+    PADDLE_ENFORCE(infer_engine_ != nullptr,
+                   "building cuda engine failed when deserializing engine info");
+    infer_context_.reset(infer_engine_->createExecutionContext());
+  }
+
   void SetRuntimeBatch(size_t batch_size);
   int GetRuntimeBatch();
   nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs,
@@ -154,11 +182,11 @@ class TensorRTEngine {
   infer_ptr<nvinfer1::INetworkDefinition> infer_network_;
   infer_ptr<nvinfer1::ICudaEngine> infer_engine_;
   infer_ptr<nvinfer1::IExecutionContext> infer_context_;
+  infer_ptr<nvinfer1::IHostMemory> ihost_memory_;
 };  // class TensorRTEngine
 
 // Add a layer__ into engine__ with args ARGS.
 // For example:
-//   TRT_ENGINE_ADD_LAYER(xxx, FullyConnected, input, dim, weights, bias)
 //
 // Reference
 // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#charRNN_define_network
@@ -170,6 +198,43 @@ class TensorRTEngine {
 #define TRT_ENGINE_ADD_LAYER(engine__, layer__, ARGS...) \
   engine__->network()->add##layer__(ARGS);
 
+/*
+ * Helper to control the TensorRT engine's creation and deletion.
+ */
+class TRTEngineManager {
+ public:
+  bool HasEngine(const std::string& name) const {
+    if (engines_.count(name) == 0) return false;
+    return engines_.at(name).get() != nullptr;
+  }
+
+  // Get an engine called `name`.
+  TensorRTEngine* Get(const std::string& name) const {
+    return engines_.at(name).get();
+  }
+
+  // Create an engine called `name`; any existing engine under the same
+  // name is replaced.
+  TensorRTEngine* Create(int max_batch, int max_workspace, bool enable_int8,
+                         TRTInt8Calibrator* calibrator,
+                         const std::string& engine_name) {
+    std::unique_lock<std::mutex> lk(mut_);
+    auto* p =
+        new TensorRTEngine(max_batch, max_workspace, enable_int8, calibrator);
+    engines_[engine_name].reset(p);
+    return p;
+  }
+
+  void DeleteALL() {
+    for (auto& item : engines_) {
+      item.second.reset(nullptr);
+    }
+  }
+
+ private:
+  std::unordered_map<std::string, std::unique_ptr<TensorRTEngine>> engines_;
+  std::mutex mut_;
+};
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc
index 784290fa44f46820736ffef125351437b039f2b4..0975a66ec6fb03be9a4e4be38002fcd760753740 100644
--- a/paddle/fluid/inference/tensorrt/test_engine.cc
+++ b/paddle/fluid/inference/tensorrt/test_engine.cc
@@ -191,9 +191,8 @@ TEST_F(TensorRTEngineTest, test_pool2d) {
   std::vector<void *> buffers(2);  // TRT binded inputs
   nvinfer1::PoolingType pool_t = nvinfer1::PoolingType::kAVERAGE;
 
-  auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling,
-                                          *const_cast<nvinfer1::ITensor *>(x),
-                                          pool_t, nvinfer1::DimsHW{2, 2});
+  auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *x, pool_t,
+                                          nvinfer1::DimsHW{2, 2});
 
   PADDLE_ENFORCE(pool_layer != nullptr);
   pool_layer->setStride(nvinfer1::DimsHW{1, 1});
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
index 031335009b692f9d1f73070c88e8e79d852cbe36..a8c86de9f9a1aea9ecdedd750757ec7d25cdf2f3 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
@@ -30,6 +30,9 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Ys", "A list of outputs").AsDuplicable();
     AddAttr<std::string>("subgraph", "the subgraph.");
     AddAttr<std::string>("calibration_data", "the calibration data for int8");
+    AddAttr<std::string>(
+        "engine_serialized_data",
+        "the serialized data that contains all the info of the ICudaEngine");
     AddAttr<std::string>(
         "engine_key",
         "The engine_key here is used to distinguish different TRT Engines");
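TRTEngineManager ties the analysis pass to the runtime op: the pass Creates an engine under the generated key, and TensorRTEngineOp (next diff) fetches it by the same key instead of rebuilding. A minimal sketch of that handshake, assuming the Singleton utility used elsewhere in this patch; sizes and key are hypothetical:

    #include <string>
    #include "paddle/fluid/inference/tensorrt/engine.h"
    #include "paddle/fluid/inference/utils/singleton.h"

    namespace trt = paddle::inference::tensorrt;
    using paddle::inference::Singleton;

    void EngineHandshake() {
      const std::string key = "engine_key_0";  // hypothetical
      // Analysis side: build (or deserialize into) an engine under `key`.
      trt::TensorRTEngine *engine =
          Singleton<trt::TRTEngineManager>::Global().Create(
              /*max_batch=*/1, /*max_workspace=*/1 << 20,
              /*enable_int8=*/false, /*calibrator=*/nullptr, key);
      // Runtime side: the op looks the prebuilt engine up by the same key.
      if (Singleton<trt::TRTEngineManager>::Global().HasEngine(key)) {
        engine = Singleton<trt::TRTEngineManager>::Global().Get(key);
      }
      (void)engine;
    }

Note that the mutex only guards Create; concurrent Get/HasEngine during creation would still race.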
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
index dcc046648a0c4843ef538c51cb41931f2c78e289..ab6f403ced6332a0ee7c05b760cdc1a0af9db09b 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -41,13 +41,14 @@ class TensorRTEngineOp : public framework::OperatorBase {
  private:
   std::vector<std::string> input_names_;
   std::unordered_set<std::string> param_names_;
-  mutable std::unique_ptr<TensorRTEngine> trt_engine_;
+  mutable TensorRTEngine *trt_engine_;
   int max_batch_size_;
   int workspace_size_;
   std::unique_ptr<TRTInt8Calibrator> calibrator_;
   bool enable_int8_;
   std::string calibration_data_;
   std::string engine_key_;
+  std::string engine_serialized_data_;
   bool calibration_mode_;
 
  public:
@@ -62,6 +63,8 @@ class TensorRTEngineOp : public framework::OperatorBase {
     enable_int8_ = Attr<bool>("enable_int8");
     calibration_data_ = Attr<std::string>("calibration_data");
     engine_key_ = Attr<std::string>("engine_key");
+    engine_serialized_data_ = Attr<std::string>("engine_serialized_data");
+    trt_engine_ = nullptr;
 
     auto params = Attr<std::vector<std::string>>("parameters");
     for (const auto &param : params) {
@@ -78,7 +81,12 @@ class TensorRTEngineOp : public framework::OperatorBase {
     // we will create an engine here.
     if (!calibration_mode_) {
-      // trt_engine_.reset();
+      if (inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
+              .HasEngine(engine_key_)) {
+        trt_engine_ = inference::Singleton<
+                          inference::tensorrt::TRTEngineManager>::Global()
+                          .Get(engine_key_);
+      }
     }
   }
 
@@ -99,7 +107,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
       RunCalibration(scope, dev_place);
       return;
     }
-    auto trt_engine = GetEngine(scope, dev_place);
+    auto *trt_engine = GetEngine(scope, dev_place);
     RunTrt(scope, dev_place, trt_engine);
   }
 
@@ -158,7 +166,6 @@ class TensorRTEngineOp : public framework::OperatorBase {
     auto stream =
         reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx).stream();
-    // auto *engine = trt_engine_.get();
     PADDLE_ENFORCE(!input_names_.empty(), "should pass more than one inputs");
 
     std::vector<std::string> output_maps =
@@ -192,8 +199,9 @@ class TensorRTEngineOp : public framework::OperatorBase {
     int output_index = 0;
     VLOG(4) << "TensorRT Engine Op Outputs:";
     for (const auto &y : Outputs("Ys")) {
-      nvinfer1::ITensor *trt_t = engine->GetITensor(output_maps[output_index]);
-      auto dims = trt_t->getDimensions();
+      const int bind_index =
+          engine->engine()->getBindingIndex(output_maps[output_index].c_str());
+      auto dims = engine->engine()->getBindingDimensions(bind_index);
       // Use the output ITensor's dims to reshape the Fluid Tensor.
       // The ITensor doesn't contain the batch size dim.
       std::vector<int> ddim;
@@ -206,8 +214,6 @@ class TensorRTEngineOp : public framework::OperatorBase {
       auto *fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
       fluid_t->Resize(framework::make_ddim(ddim));
 
-      const int bind_index =
-          engine->engine()->getBindingIndex(output_maps[output_index].c_str());
       PADDLE_ENFORCE(bind_index < num_bindings,
                      "The bind index should be less than num_bindings");
       buffers[bind_index] = static_cast<void *>(fluid_t->mutable_data<float>(
@@ -224,16 +230,14 @@ class TensorRTEngineOp : public framework::OperatorBase {
 
   TensorRTEngine *GetEngine(const framework::Scope &scope,
                             const platform::Place &dev_place) const {
-    if (trt_engine_.get() == nullptr) {
-      trt_engine_.reset(new TensorRTEngine(max_batch_size_, workspace_size_,
-                                           enable_int8_, calibrator_.get()));
-      if (true) {
-        PrepareTRTEngine(scope, trt_engine_.get());
-      } else {
-        // create static engine
-      }
+    if (trt_engine_ == nullptr) {
+      trt_engine_ =
+          inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
+              .Create(max_batch_size_, workspace_size_, enable_int8_,
+                      calibrator_.get(), engine_key_);
+      PrepareTRTEngine(scope, trt_engine_);
     }
-    return trt_engine_.get();
+    return trt_engine_;
   }
 
   void PrepareTRTEngine(const framework::Scope &scope,
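The output handling above switches from querying the network definition (engine->GetITensor) to querying the built ICudaEngine's bindings. This matters for deserialized engines, which carry no INetworkDefinition to inspect. The underlying TensorRT calls in isolation:

    #include <NvInfer.h>

    // Recover an output's dims from a built (possibly deserialized) engine.
    // With implicit-batch engines the returned dims exclude the batch dim.
    nvinfer1::Dims OutputDims(nvinfer1::ICudaEngine *engine, const char *name) {
      const int bind_index = engine->getBindingIndex(name);
      return engine->getBindingDimensions(bind_index);
    }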
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
index 5a3d9d2c1a3e8111acbad2ddcf4f5469a3a99751..e7ad2f4fe0c654d8928f5793c1ad8052ab766fb5 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
@@ -107,6 +107,7 @@ TEST(TensorRTEngineOp, manual) {
   engine_op_desc.SetAttr("output_name_mapping",
                          std::vector<std::string>({"z0"}));
   engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));
+  engine_op_desc.SetAttr("engine_serialized_data", std::string(""));
 
   LOG(INFO) << "create engine op";
   auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);
@@ -202,6 +203,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
   engine_op_desc.SetAttr("output_name_mapping",
                          std::vector<std::string>({"z3"}));
   engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));
+  engine_op_desc.SetAttr("engine_serialized_data", std::string(""));
 
   auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);