From 173b39bb5703c297ae89c6ef442f634c56f2f2bf Mon Sep 17 00:00:00 2001
From: Yuanle Liu
Date: Thu, 22 Sep 2022 14:46:28 +0800
Subject: [PATCH] TensorRT engine context memory sharing (#45842)

---
 paddle/fluid/inference/analysis/argument.h        |   1 +
 .../inference/analysis/ir_pass_manager.cc         |   3 +-
 .../ir_passes/tensorrt_subgraph_pass.cc           |   3 +-
 paddle/fluid/inference/api/analysis_config.cc     |  31 +++-
 .../fluid/inference/api/analysis_predictor.cc     |   8 +
 .../inference/api/paddle_analysis_config.h        |  16 +-
 paddle/fluid/inference/tensorrt/engine.cc         |  99 ++++++++++
 paddle/fluid/inference/tensorrt/engine.h          | 171 ++++++++----------
 .../operators/tensorrt/tensorrt_engine_op.h       |   7 +-
 9 files changed, 227 insertions(+), 112 deletions(-)
 mode change 100755 => 100644 paddle/fluid/inference/api/analysis_predictor.cc

diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index b0ed905bfc6..871718eff14 100755
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -314,6 +314,7 @@ struct Argument {
 
   // Memory optimized related.
   DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
+  DECL_ARGUMENT_FIELD(trt_engine_memory_sharing, TrtEngineMemorySharing, bool);
 
   // Indicate which kind of sort algorithm is used for operators, the memory
   // optimization relays on the sort algorithm.
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index f86a22e3db9..e1fe856e3c0 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -145,7 +145,8 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("use_calib_mode", new bool(use_calib_mode));
       pass->Set("precision_mode",
                 new AnalysisConfig::Precision(precision_mode));
-
+      pass->Set("context_memory_sharing",
+                new bool(argument->trt_engine_memory_sharing()));
       bool use_static_engine = argument->tensorrt_use_static_engine();
       bool model_from_memory = argument->model_from_memory();
       std::string optim_cache_dir = argument->optim_cache_dir();
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index 60e0864a9be..05d66d7fe27 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -164,11 +164,9 @@ void analysis::TensorRtSubgraphPass::ApplyImpl(
   // those parameter already exist in trt, and should not have another copy in
   // fluid.
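// A condensed sketch of how the new flag travels into this pass (the
// template argument below is restored by hand and the surrounding pass code
// is elided, so these are not verbatim patch lines): IRPassManager copies
// the field out of the analysis Argument into the pass attribute map,
//
//   pass->Set("context_memory_sharing",
//             new bool(argument->trt_engine_memory_sharing()));
//
// and CreateTensorRTOp further down in this file reads it back and hands it
// to the engine:
//
//   trt_engine->SetContextMemorySharing(Get<bool>("context_memory_sharing"));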
std::vector repetitive_params; - for (auto *node : graph->Nodes()) { if (node->IsOp() && !framework::ir::Agent(node).subgraph()->empty()) { CreateTensorRTOp(node, graph, graph_param_names, &repetitive_params); - std::unordered_set nodes2remove( framework::ir::Agent(node).subgraph()->begin(), framework::ir::Agent(node).subgraph()->end()); @@ -527,6 +525,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( trt_engine->SetWithErnie( graph->Has(framework::ir::kEmbEltwiseLayernormPass) && graph->Has(framework::ir::kMultiheadMatmulPass)); + trt_engine->SetContextMemorySharing(Get("context_memory_sharing")); if (use_static_engine) { trt_engine_serialized_data = GetTrtEngineSerializedData( diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index f3fbf1c344d..97f6d81e592 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -281,6 +281,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(collect_shape_range_info_); CP_MEMBER(shape_range_info_path_); CP_MEMBER(trt_use_inspector_); + CP_MEMBER(trt_engine_memory_sharing_); // Dlnne related CP_MEMBER(use_dlnne_); CP_MEMBER(dlnne_min_subgraph_size_); @@ -546,6 +547,19 @@ void AnalysisConfig::EnableTensorRtEngine( } use_tensorrt_ = true; +#if PADDLE_WITH_TENSORRT + // https://forums.developer.nvidia.com/t/nvinfer1-createexecutioncontextwithoutdevicememory-returns-nullptr/111878/2 + // when trt version less than 7.2, + // createExecutionContextWithoutDeviceMemory() has bug. + // so, we cannot enable engine context memory sharing. +#if IS_TRT_VERSION_GE(7200) + trt_engine_memory_sharing_ = true; +#else + LOG(WARNING) + << "TensorRT engine context memory sharing needs version 7.2 and after."; + trt_engine_memory_sharing_ = false; +#endif +#endif tensorrt_workspace_size_ = workspace_size; tensorrt_max_batchsize_ = max_batch_size; tensorrt_min_subgraph_size_ = min_subgraph_size; @@ -608,7 +622,7 @@ void AnalysisConfig::EnableVarseqlen() { trt_use_varseqlen_ = true; } // TODO(Superjomn) refactor this, buggy. void AnalysisConfig::Update() { - auto info = SerializeInfoCache(); + auto &&info = SerializeInfoCache(); if (info == serialized_info_cache_) return; // Transfer pass_builder and copy the existing compatible passes. @@ -861,6 +875,7 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << trt_dla_core_; ss << enable_memory_optim_; + ss << trt_engine_memory_sharing_; ss << use_mkldnn_; ss << mkldnn_cache_capacity_; @@ -951,6 +966,10 @@ bool AnalysisConfig::enable_memory_optim() const { return enable_memory_optim_; } +bool AnalysisConfig::trt_engine_memory_sharing() const { + return trt_engine_memory_sharing_; +} + void AnalysisConfig::SetModelBuffer(const char *prog_buffer, size_t prog_buffer_size, const char *param_buffer, @@ -1108,6 +1127,8 @@ std::string AnalysisConfig::Summary() { if (trt_use_dla_) { os.InsertRow({"tensorrt_dla_core", std::to_string(trt_dla_core_)}); } + os.InsertRow({"trt_engine_memory_sharing", + trt_engine_memory_sharing_ ? 
"true" : "false"}); #endif } } @@ -1211,11 +1232,11 @@ void AnalysisConfig::CollectShapeRangeInfo( shape_range_info_path_ = shape_range_info_path; } -const std::string &AnalysisConfig::shape_range_info_path() { +const std::string &AnalysisConfig::shape_range_info_path() const { return shape_range_info_path_; } -bool AnalysisConfig::shape_range_info_collected() { +bool AnalysisConfig::shape_range_info_collected() const { return collect_shape_range_info_; } @@ -1226,11 +1247,11 @@ void AnalysisConfig::EnableTunedTensorRtDynamicShape( trt_tuned_dynamic_shape_ = true; } -bool AnalysisConfig::tuned_tensorrt_dynamic_shape() { +bool AnalysisConfig::tuned_tensorrt_dynamic_shape() const { return trt_tuned_dynamic_shape_; } -bool AnalysisConfig::trt_allow_build_at_runtime() { +bool AnalysisConfig::trt_allow_build_at_runtime() const { return trt_allow_build_at_runtime_; } diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc old mode 100755 new mode 100644 index 33b3da0717b..ae34fd52341 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1095,6 +1095,7 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetTensorRtAllowBuildAtRuntime( config_.trt_allow_build_at_runtime()); argument_.SetTensorRtUseInspector(config_.trt_use_inspector_); + argument_.SetTrtEngineMemorySharing(config_.trt_engine_memory_sharing()); } if (config_.dlnne_enabled()) { @@ -2015,6 +2016,13 @@ AnalysisPredictor::~AnalysisPredictor() { memory::Release(place_); } device_contexts_.clear(); + +#ifdef PADDLE_WITH_TENSORRT + if (config_.trt_engine_memory_sharing()) { + inference::Singleton::Global() + .releaseContextMemory(predictor_id_); + } +#endif } std::unique_ptr AnalysisPredictor::Clone(void *stream) { diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 7bf5dc2cfe3..5f75636d854 100755 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -536,6 +536,13 @@ struct PD_INFER_DECL AnalysisConfig { /// bool tensorrt_engine_enabled() const { return use_tensorrt_; } /// + /// \brief A boolean state telling whether the tensorrt engine memory sharing + /// is activated. + /// + /// \return bool Whether the tensorrt engine memory sharing is activated. + /// + bool trt_engine_memory_sharing() const; + /// /// \brief Get the TensorRT engine precision. /// /// \return Precision Get the TensorRT engine precision. @@ -577,13 +584,13 @@ struct PD_INFER_DECL AnalysisConfig { /// \brief A boolean state telling whether to use tuned tensorrt dynamic /// shape. /// - bool tuned_tensorrt_dynamic_shape(); + bool tuned_tensorrt_dynamic_shape() const; /// /// \brief A boolean state telling whether to allow building trt engine at /// runtime. /// - bool trt_allow_build_at_runtime(); + bool trt_allow_build_at_runtime() const; /// /// \brief Set execution stream. If not set a stream will be created @@ -616,14 +623,14 @@ struct PD_INFER_DECL AnalysisConfig { /// /// \return the shape info path. /// - const std::string& shape_range_info_path(); + const std::string& shape_range_info_path() const; /// /// \brief A boolean state telling whether to collect shape info. /// /// \return bool Whether to collect shape info. 
/// - bool shape_range_info_collected(); + bool shape_range_info_collected() const; /// /// \brief Prevent ops running in Paddle-TRT @@ -1037,6 +1044,7 @@ struct PD_INFER_DECL AnalysisConfig { // memory reuse related. bool enable_memory_optim_{false}; + bool trt_engine_memory_sharing_{false}; bool use_mkldnn_{false}; std::unordered_set mkldnn_enabled_op_types_; diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 0cf1d6352c3..9a0bbd0ba0c 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -81,11 +81,55 @@ void TensorRTEngine::InitNetwork() { optim_profiles_[i] = infer_builder_->createOptimizationProfile(); } +nvinfer1::IExecutionContext *TensorRTEngine::context() { + std::unique_lock lock(mutex_); + if (infer_context_.find(predictor_id_per_thread) == infer_context_.end()) { + PADDLE_ENFORCE_NOT_NULL( + infer_engine_, + platform::errors::InvalidArgument( + "You should build engine first and then set the context.")); + // We may see trt warning: Profile 0 has been chosen by another + // IExecutionContext... + // It's ok. We will set it later. + nvinfer1::IExecutionContext *infer_context{nullptr}; + if (context_memory_sharing_) { + infer_context = + infer_engine_->createExecutionContextWithoutDeviceMemory(); + } else { + infer_context = infer_engine_->createExecutionContext(); + } + PADDLE_ENFORCE_NOT_NULL( + infer_context, + platform::errors::InvalidArgument( + "TensorRT engine can not build execution context.")); + if (with_dynamic_shape_) { + // need new profile if it's not the first + if (cur_profile_num_ > 0) { + infer_context->setOptimizationProfile(cur_profile_num_); + } + profile_index_[predictor_id_per_thread] = cur_profile_num_; + ++cur_profile_num_; + } + infer_context_[predictor_id_per_thread].reset(infer_context); + } + return infer_context_[predictor_id_per_thread].get(); +} + void TensorRTEngine::Execute(int batch_size, std::vector *buffers, cudaStream_t stream) { freshDeviceId(); auto infer_context = context(); + if (context_memory_sharing_) { + void *context_memory{nullptr}; + context_memory = + inference::Singleton::Global() + .getContextMemory( + predictor_id_per_thread, + phi::GPUPlace(device_id_), + phi::Stream(reinterpret_cast(stream))); + infer_context->setDeviceMemory(context_memory); + } if (!with_dynamic_shape()) { infer_context->enqueue(batch_size, buffers->data(), stream, nullptr); } else { @@ -272,6 +316,12 @@ void TensorRTEngine::FreezeNetwork() { infer_context_.clear(); cur_profile_num_ = 0; } + // for engine context memory sharing + if (context_memory_sharing_) { + inference::Singleton::Global() + .updateContextMemorySize(infer_engine_->getDeviceMemorySize(), + predictor_id_per_thread); + } GetEngineInfo(); } @@ -417,6 +467,55 @@ std::unordered_map return &itensor_map_; } +void TensorRTEngine::Deserialize(const std::string &engine_serialized_data) { + freshDeviceId(); + infer_ptr runtime(createInferRuntime(&logger_)); + + if (use_dla_) { + if (precision_ != AnalysisConfig::Precision::kInt8 && + precision_ != AnalysisConfig::Precision::kHalf) { + LOG(WARNING) << "TensorRT DLA must be used with int8 or fp16, but you " + "set float32, so DLA is not used."; + } else if (runtime->getNbDLACores() == 0) { + LOG(WARNING) + << "TensorRT DLA is set by config, but your device does not have " + "DLA, so DLA is not used."; + } else { + if (dla_core_ < 0 || dla_core_ >= runtime->getNbDLACores()) { + dla_core_ = 0; + LOG(WARNING) << "Invalid DLACore, must be 0 < DLACore < 
" + << runtime->getNbDLACores() << ", but got " << dla_core_ + << ", so use use 0 as default."; + } + runtime->setDLACore(dla_core_); + LOG(INFO) << "TensorRT DLA enabled in Deserialize(), DLACore " + << dla_core_; + } + } + + infer_engine_.reset(runtime->deserializeCudaEngine( + engine_serialized_data.c_str(), engine_serialized_data.size())); + + PADDLE_ENFORCE_NOT_NULL( + infer_engine_, + platform::errors::Fatal( + "Building TRT cuda engine failed when deserializing engine info. " + "Please check:\n1. Your TRT serialization is generated and loaded " + "on the same GPU architecture;\n2. The Paddle Inference version of " + "generating serialization file and doing inference are " + "consistent.")); + + binding_num_ = infer_engine_->getNbBindings(); + // for engine context memory sharing + if (context_memory_sharing_) { + inference::Singleton::Global() + .updateContextMemorySize(infer_engine_->getDeviceMemorySize(), + predictor_id_per_thread); + } + + GetEngineInfo(); +} + void TensorRTEngine::SetRuntimeBatch(size_t batch_size) { runtime_batch_ = batch_size; } diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 209f297a066..034f417ff07 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -16,6 +16,7 @@ limitations under the License. */ #include +#include #include #include #include // NOLINT @@ -37,6 +38,8 @@ limitations under the License. */ #include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/stream.h" #include "paddle/utils/any.h" namespace paddle { @@ -171,7 +174,7 @@ class TRTInt8Calibrator; /* * TensorRT Engine. * - * There are two alternative ways to use it, one is to build from a paddle + * There are two alternative ways to use it, one is to build from a paddle * protobuf model, another way is to manually construct the network. */ class TensorRTEngine { @@ -287,51 +290,10 @@ class TensorRTEngine { std::unordered_map* GetITensorMap(); nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); } - nvinfer1::IExecutionContext* context() { -#ifndef PADDLE_WITH_TESTING - PADDLE_ENFORCE_GT( - predictor_id_per_thread, - -1, - platform::errors::InvalidArgument( - "thread local var predictor_id_per_thread must be " - "initialized to >= 0, but now predictor_id_per_thread = %d", - predictor_id_per_thread)); -#endif - std::unique_lock lock(mutex_); - if (infer_context_.find(predictor_id_per_thread) == infer_context_.end()) { - PADDLE_ENFORCE_NOT_NULL( - infer_engine_, - platform::errors::InvalidArgument( - "You should build engine first and then set the context.")); - // We may see trt warning: Profile 0 has been chosen by another - // IExecutionContext... - // It's ok. We will set it later. 
- infer_context_[predictor_id_per_thread].reset( - infer_engine_->createExecutionContext()); - if (with_dynamic_shape_) { - // need new profile if it's not the first - if (cur_profile_num_ > 0) { - infer_context_[predictor_id_per_thread]->setOptimizationProfile( - cur_profile_num_); - } - profile_index_[predictor_id_per_thread] = cur_profile_num_; - ++cur_profile_num_; - } - } - return infer_context_[predictor_id_per_thread].get(); - } + nvinfer1::IExecutionContext* context(); int GetProfileIndex() { if (max_profile_num_ > 1) { -#ifndef PADDLE_WITH_TESTING - PADDLE_ENFORCE_GT( - predictor_id_per_thread, - -1, - platform::errors::InvalidArgument( - "thread local var predictor_id_per_thread must be " - "initialized to >= 0, but now predictor_id_per_thread = %d", - predictor_id_per_thread)); -#endif std::unique_lock lock(mutex_); return profile_index_[predictor_id_per_thread]; } else { @@ -350,15 +312,6 @@ class TensorRTEngine { infer_engine_, platform::errors::InvalidArgument( "You should build engine first and then set the context.")); -#ifndef PADDLE_WITH_TESTING - PADDLE_ENFORCE_GT( - predictor_id_per_thread, - -1, - platform::errors::InvalidArgument( - "thread local var predictor_id_per_thread must be " - "initialized to >= 0, but now predictor_id_per_thread = %d", - predictor_id_per_thread)); -#endif std::unique_lock lock(mutex_); infer_context_[predictor_id_per_thread].reset(nullptr); infer_context_.erase(predictor_id_per_thread); @@ -380,47 +333,7 @@ class TensorRTEngine { return ihost_memory_.get(); } - void Deserialize(const std::string& engine_serialized_data) { - freshDeviceId(); - infer_ptr runtime(createInferRuntime(&logger_)); - - if (use_dla_) { - if (precision_ != AnalysisConfig::Precision::kInt8 && - precision_ != AnalysisConfig::Precision::kHalf) { - LOG(WARNING) << "TensorRT DLA must be used with int8 or fp16, but you " - "set float32, so DLA is not used."; - } else if (runtime->getNbDLACores() == 0) { - LOG(WARNING) - << "TensorRT DLA is set by config, but your device does not have " - "DLA, so DLA is not used."; - } else { - if (dla_core_ < 0 || dla_core_ >= runtime->getNbDLACores()) { - dla_core_ = 0; - LOG(WARNING) << "Invalid DLACore, must be 0 < DLACore < " - << runtime->getNbDLACores() << ", but got " << dla_core_ - << ", so use use 0 as default."; - } - runtime->setDLACore(dla_core_); - LOG(INFO) << "TensorRT DLA enabled in Deserialize(), DLACore " - << dla_core_; - } - } - - infer_engine_.reset(runtime->deserializeCudaEngine( - engine_serialized_data.c_str(), engine_serialized_data.size())); - - PADDLE_ENFORCE_NOT_NULL( - infer_engine_, - platform::errors::Fatal( - "Building TRT cuda engine failed when deserializing engine info. " - "Please check:\n1. Your TRT serialization is generated and loaded " - "on the same GPU architecture;\n2. 
The Paddle Inference version of " - "generating serialization file and doing inference are " - "consistent.")); - - binding_num_ = infer_engine_->getNbBindings(); - GetEngineInfo(); - } + void Deserialize(const std::string& engine_serialized_data); void SetRuntimeBatch(size_t batch_size); int GetRuntimeBatch(); @@ -694,6 +607,10 @@ class TensorRTEngine { void SetUseInspector(bool use_inspector) { use_inspector_ = use_inspector; } void SetScope(const framework::Scope& scope) { scope_ = &scope; } + void SetContextMemorySharing(bool context_memory_sharing) { + context_memory_sharing_ = context_memory_sharing; + } + private: // Each ICudaEngine object is bound to a specific GPU when it is instantiated, // ensure that the thread is associated with the correct device by calling @@ -714,6 +631,9 @@ class TensorRTEngine { // batch size of the current data, will be updated each Executation. int batch_size_{-1}; + // use for engine context memory sharing + bool context_memory_sharing_{false}; + int device_id_; int max_profile_num_{1}; int cur_profile_num_{0}; @@ -791,14 +711,23 @@ class TensorRTEngine { engine__->network()->add##layer__(__VA_ARGS__) class TRTEngineManager { + using PredictorID = int; + using AllocationPtr = phi::Allocator::AllocationPtr; + public: - bool Empty() const { return engines_.size() == 0; } + bool Empty() const { + std::lock_guard lock(mutex_); + return engines_.size() == 0; + } + bool Has(const std::string& name) const { + std::lock_guard lock(mutex_); if (engines_.count(name) == 0) return false; return engines_.at(name).get() != nullptr; } TensorRTEngine* Get(const std::string& name) const { + std::lock_guard lock(mutex_); return engines_.at(name).get(); } @@ -826,17 +755,21 @@ class TRTEngineManager { disable_trt_plugin_fp16, model_precision, logger); + std::lock_guard lock(mutex_); engines_[name].reset(p); return p; } void DeleteAll() { + std::lock_guard lock(mutex_); for (auto& item : engines_) { item.second.reset(nullptr); } + engines_.clear(); } void DeleteKey(const std::string& key) { + std::lock_guard lock(mutex_); auto iter = engines_.find(key); if (iter != engines_.end()) { iter->second.reset(nullptr); @@ -844,7 +777,57 @@ class TRTEngineManager { } } + void updateContextMemorySize(size_t mem_size, PredictorID predictor_id) { + bool size_updated{false}; + + { + std::lock_guard lock(mutex_); + if (max_ctx_mem_size_ < mem_size) { + max_ctx_mem_size_ = mem_size; + size_updated = true; + } + } + + if (size_updated) { + releaseContextMemory(predictor_id); + } + } + + void* getContextMemory(PredictorID predictor_id, + const phi::GPUPlace& place, + const phi::Stream& stream) { + std::lock_guard lock(mutex_); + static auto alignment = getAlignmentSize(place); + if (context_memorys_.count(predictor_id) == 0) { + auto context_memory = + memory::Alloc(place, max_ctx_mem_size_ + alignment, stream); + // context_memory_[predictor_id].reset(context_memory.release()); + context_memorys_[predictor_id] = std::move(context_memory); + } + return getAlignedMemory(context_memorys_[predictor_id]->ptr(), alignment); + } + + void releaseContextMemory(PredictorID predictor_id) { + std::lock_guard lock(mutex_); + if (context_memorys_.count(predictor_id)) { + context_memorys_[predictor_id].reset(nullptr); + context_memorys_.erase(predictor_id); + } + } + private: + size_t getAlignmentSize(const phi::GPUPlace& place) { + const auto& prop = platform::GetDeviceProperties(place.GetDeviceId()); + return prop.textureAlignment; + } + + void* getAlignedMemory(void* addr, size_t alignment) { + 
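    // alignment is the device's textureAlignment, a power of two, so the
    // mask below simply rounds addr down to the previous multiple of
    // alignment, e.g. 0x1234 & ~(0x200 - 1) == 0x1200 for a 512-byte boundary.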
+    return reinterpret_cast<void *>(uintptr_t(addr) & (~(alignment - 1)));
+  }
+
+  mutable std::mutex mutex_;
+  size_t max_ctx_mem_size_{0};
+  std::unordered_map<PredictorID, AllocationPtr> context_memorys_;
   std::unordered_map<std::string, std::unique_ptr<TensorRTEngine>> engines_;
 };

diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
index 0f8a3d12062..a795a1aadb5 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -476,12 +476,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
     std::vector<std::string> output_maps =
         Attr<std::vector<std::string>>("output_name_mapping");

-    int num_inputs = 0;
-
-    num_inputs += runtime_input_names_.size();
-    // const int num_bindings = num_inputs + Outputs("Ys").size();
-    // std::vector<void *> buffers(num_bindings);
-    // This method returns the total over all profiles.
+    // Get the total over all profiles
     const int num_bindings = engine->GetNbBindings();
     std::vector<void *> buffers(num_bindings, nullptr);

-- 
GitLab
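For context, the mechanism this patch builds on consists of three plain
TensorRT calls: ICudaEngine::getDeviceMemorySize() reports how much scratch
(activation) memory an execution context needs, createExecutionContextWithoutDeviceMemory()
creates a context that owns none of it, and IExecutionContext::setDeviceMemory()
attaches a caller-owned block before enqueue. Below is a minimal standalone
sketch of that idea, not Paddle code: the two already-built engines, the
binding arrays, the stream, and the helper name RunWithSharedScratch are all
assumptions, and error handling is omitted.

    #include <NvInfer.h>
    #include <cuda_runtime_api.h>
    #include <algorithm>

    // Run two engines back to back while they share one scratch buffer sized
    // for the more demanding of the two.
    void RunWithSharedScratch(nvinfer1::ICudaEngine* engine_a,
                              nvinfer1::ICudaEngine* engine_b,
                              void** bindings_a,
                              void** bindings_b,
                              cudaStream_t stream) {
      // One scratch block, sized for the larger requirement.
      size_t scratch_bytes = std::max(engine_a->getDeviceMemorySize(),
                                      engine_b->getDeviceMemorySize());
      void* scratch = nullptr;
      cudaMalloc(&scratch, scratch_bytes);

      // Contexts created this way own no device memory of their own ...
      nvinfer1::IExecutionContext* ctx_a =
          engine_a->createExecutionContextWithoutDeviceMemory();
      nvinfer1::IExecutionContext* ctx_b =
          engine_b->createExecutionContextWithoutDeviceMemory();

      // ... so the caller must attach memory before enqueueing, and the two
      // contexts must not run concurrently while they share the same block.
      ctx_a->setDeviceMemory(scratch);
      ctx_a->enqueueV2(bindings_a, stream, nullptr);
      cudaStreamSynchronize(stream);

      ctx_b->setDeviceMemory(scratch);
      ctx_b->enqueueV2(bindings_b, stream, nullptr);
      cudaStreamSynchronize(stream);

      ctx_a->destroy();
      ctx_b->destroy();
      cudaFree(scratch);
    }

In the patch, TRTEngineManager plays the role of the single cudaMalloc above:
updateContextMemorySize() keeps the largest getDeviceMemorySize() seen across
all engines, getContextMemory() hands each predictor one texture-aligned block
of that size, TensorRTEngine::Execute() attaches it with setDeviceMemory()
before every enqueue, and AnalysisPredictor's destructor returns the block via
releaseContextMemory().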