diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index c83211bab1f3607ebe1854a45db4e5d240836ae4..4386d72044108c11c6886672a0bc5ee1b8e6ef88 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -240,6 +240,10 @@ struct Argument {
   DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int);
   DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int64_t);
   DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);
+  DECL_ARGUMENT_FIELD(trt_mark_output, TRTMarkOutput, bool);
+  DECL_ARGUMENT_FIELD(trt_output_tensor_names,
+                      TRTOutputTensorNames,
+                      std::vector<std::string>);
   DECL_ARGUMENT_FIELD(tensorrt_disabled_ops,
                       TensorRtDisabledOPs,
                       std::vector<std::string>);
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index b2343d130314eb69b0675a732aabe1bb7e33c008..fca0e1eeabc01d123963a20a741e5d1cd2843859 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -160,6 +160,10 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size()));
       pass->Set("min_subgraph_size",
                 new int(argument->tensorrt_min_subgraph_size()));
+      pass->Set("mark_output", new bool(argument->trt_mark_output()));
+      pass->Set(
+          "output_tensor_names",
+          new std::vector<std::string>(argument->trt_output_tensor_names()));
       pass->Set("program",
                 new framework::ProgramDesc *(&argument->main_program()));
       pass->Set("predictor_id", new int(argument->predictor_id()));
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index e5224b9fa5b415aaf3c1382de3ceafc8ee457f15..6b65ccc8b7122e8d23d469e197c861a9f9a84a6d 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -371,6 +371,40 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
   // record the origin output data type
   std::vector<int> origin_outputs_dtype;
   std::map<std::string, int> map_origin_outputs_dtype;
+
+  // Whether to mark outputs
+  auto mark_output = Get<bool>("mark_output");
+  auto output_tensor_name =
+      Get<std::vector<std::string>>("output_tensor_names");
+  VLOG(1) << "mark Output: " << mark_output;
+
+  if (mark_output == 1) {
+    VLOG(1) << "begin to mark output ...";
+    for (auto node : subgraph) {
+      if (node->NodeType() == Node::Type::kOperation) {
+        if (node->Op()->Outputs().count("Xshape")) continue;
+        for (auto *x : node->outputs) {
+          if (std::count(parameters.begin(), parameters.end(), x->Name()) > 0)
+            continue;
+          if (!output_tensor_name.empty() &&
+              std::count(output_tensor_name.begin(),
+                         output_tensor_name.end(),
+                         x->Name())) {
+            VLOG(1) << "output " << x->Name() << " has been marked";
+            std::string output_name_withid =
+                x->Name() + std::to_string(x->id());
+            output_names.insert(x->Name());
+            output_names_with_id.insert(output_name_withid);
+            origin_name_output_rank[x->Name()] = x->Var()->GetShape().size();
+            trt_outputs.insert(x);
+            map_origin_outputs_dtype[x->Name()] =
+                static_cast<int>(x->Var()->GetDataType());
+          }
+        }
+      }
+    }
+  }
+
   for (auto *x : node->outputs) {
     output_names.insert(x->Name());
     output_names_with_id.insert(x->Name() + std::to_string(x->id()));
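The pass change above implements a simple selection rule: a tensor produced inside the subgraph is kept as an extra engine output only if it is not a weight/parameter and its name appears in the user-supplied list (Xshape outputs are skipped). The standalone sketch below restates that rule over plain name lists; the function name and signature are illustrative only and are not part of Paddle.

#include <algorithm>
#include <set>
#include <string>
#include <vector>

// Simplified restatement of the marking rule used by the pass above: keep a
// candidate tensor as an engine output only if it is not a parameter (weight)
// and its name was explicitly requested by the user.
std::set<std::string> SelectMarkedOutputs(
    const std::vector<std::string> &candidates,
    const std::vector<std::string> &parameters,
    const std::vector<std::string> &requested) {
  std::set<std::string> marked;
  for (const auto &name : candidates) {
    // Skip weights; they are never engine outputs.
    if (std::count(parameters.begin(), parameters.end(), name) > 0) continue;
    // An empty request list marks nothing, mirroring the condition in the pass.
    if (!requested.empty() &&
        std::count(requested.begin(), requested.end(), name) > 0) {
      marked.insert(name);
    }
  }
  return marked;
}

int main() {
  std::set<std::string> marked = SelectMarkedOutputs(
      {"conv2d_0.tmp_0", "fc_0.tmp_0"},  // candidate outputs (illustrative)
      {"fc_0.w_0"},                      // parameters / weights
      {"fc_0.tmp_0"});                   // names requested by the user
  // marked now contains only "fc_0.tmp_0".
  return 0;
}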
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 75e73d45cb0492feb1f2580d7fadcd06ef74bcaf..c89e52226d57f61be3c5c13cc7a88c4258021c29 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -459,6 +459,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(tensorrt_max_batchsize_);
   CP_MEMBER(tensorrt_min_subgraph_size_);
   CP_MEMBER(tensorrt_precision_mode_);
+  CP_MEMBER(trt_mark_output_);
+  CP_MEMBER(trt_output_tensor_names_);
   CP_MEMBER(trt_disabled_ops_);
   CP_MEMBER(trt_use_dla_);
   CP_MEMBER(trt_dla_core_);
@@ -757,6 +759,12 @@ void AnalysisConfig::EnableTensorRtEngine(int64_t workspace_size,
 #endif
 }
 
+void AnalysisConfig::MarkTrtEngineOutputs(
+    const std::vector<std::string> &output_tensor_names) {
+  trt_mark_output_ = true;
+  trt_output_tensor_names_ = output_tensor_names;
+}
+
 void AnalysisConfig::EnableTensorRTMemoryOptim(bool engine_memory_sharing,
                                                int sharing_identifier) {
   PADDLE_ENFORCE_EQ(
@@ -1050,6 +1058,7 @@ std::string AnalysisConfig::SerializeInfoCache() {
   ss << tensorrt_workspace_size_;
   ss << tensorrt_max_batchsize_;
   ss << tensorrt_min_subgraph_size_;
+  ss << trt_mark_output_;
 
   ss << use_dlnne_;
   ss << dlnne_min_subgraph_size_;
@@ -1331,6 +1340,7 @@ std::string AnalysisConfig::Summary() {
     }
     os.InsertRow({"trt_engine_memory_sharing",
                   trt_engine_memory_sharing_ ? "true" : "false"});
+    os.InsertRow({"trt_mark_output", trt_mark_output_ ? "true" : "false"});
 #endif
   }
 }
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 63c1035ee157736a2f657c0fadd2b141f071d6f7..2b0fe1dacbe45958f2028123f34c34d7746e6092 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -1386,6 +1386,8 @@ void AnalysisPredictor::PrepareArgument() {
     argument_->SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_);
     argument_->SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
     argument_->SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_);
+    argument_->SetTRTMarkOutput(config_.trt_mark_output_);
+    argument_->SetTRTOutputTensorNames(config_.trt_output_tensor_names_);
     argument_->SetTensorRtDisabledOPs(config_.trt_disabled_ops_);
     argument_->SetTensorRtUseDLA(config_.trt_use_dla_);
     argument_->SetTensorRtDLACore(config_.trt_dla_core_);
@@ -2695,8 +2697,7 @@ void AnalysisPredictor::SaveOptimModel(const std::string &dir) {
 }
 
 void AnalysisPredictor::RegisterInputHook(const InputTensorHookFunc &hookfunc) {
-  static std::once_flag register_input_hook_flag;
-  std::call_once(register_input_hook_flag, [this] {
+  std::call_once(register_input_hook_flag_, [this] {
     executor_->RegisterInputHook(
         [this](framework::OperatorBase *op, framework::Scope *scope) {
           for (auto &input : op->Inputs()) {
@@ -2719,8 +2720,7 @@ void AnalysisPredictor::RegisterInputHook(const InputTensorHookFunc &hookfunc) {
 
 void AnalysisPredictor::RegisterOutputHook(
     const OutputTensorHookFunc &hookfunc) {
-  static std::once_flag register_output_hook_flag;
-  std::call_once(register_output_hook_flag, [this] {
+  std::call_once(register_output_hook_flag_, [this] {
     executor_->RegisterOutputHook(
         [this](framework::OperatorBase *op, framework::Scope *scope) {
           for (auto &output : op->Outputs()) {
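The RegisterInputHook/RegisterOutputHook change above replaces function-local static std::once_flag objects with per-predictor members (declared in analysis_predictor.h below), so hook registration happens once per predictor rather than once per process. A minimal sketch of the difference, using a stand-in class instead of Paddle's actual predictor:

#include <iostream>
#include <mutex>

// Minimal stand-in: a per-instance once_flag lets each object register its
// hook exactly once, whereas a function-local static flag would run the
// registration only for the first object created in the whole process.
class PredictorLike {
 public:
  void RegisterOutputHook() {
    std::call_once(register_output_hook_flag_, [this] {
      std::cout << "hook registered for " << this << "\n";
    });
  }

 private:
  std::once_flag register_output_hook_flag_;  // member, not static
};

int main() {
  PredictorLike a, b;
  a.RegisterOutputHook();  // registers for a
  a.RegisterOutputHook();  // no-op: already done for a
  b.RegisterOutputHook();  // still runs for b; a static flag would skip this
  return 0;
}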
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index 4b4c18ee791c3c209a446c750982fbb8ff44a486..beecfc9743b1045a0f3fe6efa84c754d8fed41a7 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -571,6 +571,8 @@ class AnalysisPredictor : public PaddlePredictor {
   std::map<size_t, std::string> idx2feeds_;
   std::vector<framework::OpDesc *> fetches_;
   std::map<size_t, std::string> idx2fetches_;
+  std::once_flag register_input_hook_flag_;
+  std::once_flag register_output_hook_flag_;
 
   phi::DataType model_precision_{phi::DataType::FLOAT32};
 
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index dfb2f53c7bde1727c9fd616ccf97d45b094f60c7..6655de1c5f1c628800c9b3ff285f9a66505473cc 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -690,6 +690,13 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   bool tensorrt_engine_enabled() const { return use_tensorrt_; }
   ///
+  /// \brief Mark intermediate tensors as outputs of the TensorRT engine.
+  ///
+  /// \param output_tensor_names The names of the tensors that need to be marked.
+  ///
+  void MarkTrtEngineOutputs(
+      const std::vector<std::string>& output_tensor_names = {});
+  ///
   /// \brief Turn on the TensorRT memory optimization.
   ///
   /// \param engine_memory_sharing Whether to enable TensorRT memory
@@ -1204,6 +1211,8 @@ struct PD_INFER_DECL AnalysisConfig {
   bool trt_use_cuda_graph_{false};
   bool trt_use_varseqlen_{false};
   bool trt_with_interleaved_{false};
+  bool trt_mark_output_{false};
+  std::vector<std::string> trt_output_tensor_names_{};
   std::string tensorrt_transformer_posid_{""};
   std::string tensorrt_transformer_maskid_{""};
   bool trt_use_dla_{false};
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index 1c49b47faeaaae9878351c43cbb20d75aeeabacb..540b2dfa6be6c7318e394239b4fbab52d3416bef 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -871,6 +871,9 @@ void BindAnalysisConfig(py::module *m) {
            py::arg("disable_trt_plugin_fp16") = false)
       .def("tensorrt_dynamic_shape_enabled",
            &AnalysisConfig::tensorrt_dynamic_shape_enabled)
+      .def("mark_trt_engine_outputs",
+           &AnalysisConfig::MarkTrtEngineOutputs,
+           py::arg("output_tensor_names") = std::vector<std::string>({}))
       .def("enable_tensorrt_varseqlen", &AnalysisConfig::EnableVarseqlen)
       .def("tensorrt_varseqlen_enabled",
            &AnalysisConfig::tensorrt_varseqlen_enabled)
diff --git a/test/cpp/inference/api/CMakeLists.txt b/test/cpp/inference/api/CMakeLists.txt
index b56eed1373a33dbfeabc79b6e08b5e139a9c74cb..98d4c4986b579076019e49758619334f75763440 100644
--- a/test/cpp/inference/api/CMakeLists.txt
+++ b/test/cpp/inference/api/CMakeLists.txt
@@ -978,6 +978,14 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST)
       paddle_inference_shared
       ARGS
       --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
+  inference_analysis_test(
+    trt_mark_trt_engine_outputs_test
+    SRCS
+    trt_mark_trt_engine_outputs_test.cc
+    EXTRA_DEPS
+    paddle_inference_shared
+    ARGS
+    --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
   inference_analysis_test(
     trt_fc_prelu_test
     SRCS
@@ -1370,6 +1378,8 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST)
     set_tests_properties(test_trt_dynamic_shape_ernie_fp16_ser_deser
                          PROPERTIES TIMEOUT 300)
     set_tests_properties(test_trt_dynamic_shape_ernie PROPERTIES TIMEOUT 480)
+    set_tests_properties(trt_mark_trt_engine_outputs_test PROPERTIES TIMEOUT
+                                                                     300)
   endif()
 
   if(WITH_MKLDNN)
diff --git a/test/cpp/inference/api/trt_mark_trt_engine_outputs_test.cc b/test/cpp/inference/api/trt_mark_trt_engine_outputs_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9c6a87a6d161a7d102d2ee46d26ca6bd1f89dd49
--- /dev/null
+++ b/test/cpp/inference/api/trt_mark_trt_engine_outputs_test.cc
@@ -0,0 +1,45 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#include "gflags/gflags.h"
+#include "test/cpp/inference/api/trt_test_helper.h"
+
+namespace paddle {
+namespace inference {
+
+TEST(TensorRT, mark_trt_engine_outputs) {
+  std::string model_dir = FLAGS_infer_model + "/resnet50";
+  AnalysisConfig config;
+  config.SetModel(model_dir);
+  config.EnableUseGpu(100, 0);
+  config.EnableTensorRtEngine(
+      1 << 30, 1, 5, AnalysisConfig::Precision::kFloat32, false, false);
+  // The names of the tensors that need to be marked; the default is an empty
+  // list (mark all)
+  std::vector<std::string> markOutput = {"fc_0.tmp_0", "fc_0.tmp_1"};
+  config.MarkTrtEngineOutputs(markOutput);
+
+  std::vector<std::vector<PaddleTensor>> inputs_all;
+  auto predictor = CreatePaddlePredictor(config);
+  SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
+
+  std::vector<PaddleTensor> outputs;
+  for (auto &input : inputs_all) {
+    ASSERT_TRUE(predictor->Run(input, &outputs));
+    predictor->ClearIntermediateTensor();
+  }
+}
+
+}  // namespace inference
+}  // namespace paddle
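Taken together, the patch exposes the feature through AnalysisConfig (and its paddle_infer::Config alias) much as the new test does. A minimal usage sketch follows; the model directory is a placeholder and the tensor names are taken from the test above:

#include <memory>
#include <string>
#include <vector>

#include "paddle_inference_api.h"  // paddle_infer::Config is an alias of AnalysisConfig

int main() {
  paddle_infer::Config config;
  config.SetModel("./resnet50");  // placeholder model directory
  config.EnableUseGpu(100, 0);
  config.EnableTensorRtEngine(
      1 << 30, 1, 5, paddle_infer::PrecisionType::kFloat32, false, false);
  // Keep these intermediate tensors as outputs of the generated TensorRT
  // engine instead of letting them be fused away inside the engine.
  config.MarkTrtEngineOutputs({"fc_0.tmp_0", "fc_0.tmp_1"});

  auto predictor = paddle_infer::CreatePredictor(config);
  // ... prepare input handles and call predictor->Run() as usual ...
  return 0;
}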