Unverified commit 2abf4326, authored by ming1753, committed by GitHub

Add MarkTrtEngineOutputs API (#56188)

* [paddle-TRT] support mark output

* [fix bug] hook function was only called once across different predictors

* add api test
Parent df445c1c
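For reference, a minimal usage sketch of the new API, adapted from the C++ test added in this PR (the model directory and tensor names are placeholders taken from that test; the include path may vary by install layout):

#include "paddle/include/paddle_inference_api.h"  // include path depends on the install layout

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("./resnet50");  // directory containing __model__ and the params file
  config.EnableUseGpu(100, 0);    // 100 MB initial GPU memory pool, device 0
  config.EnableTensorRtEngine(1 << 30,  // workspace size
                              1,        // max batch size
                              5,        // min subgraph size
                              paddle::AnalysisConfig::Precision::kFloat32,
                              false,    // use_static
                              false);   // use_calib_mode
  // Mark intermediate tensors so they are kept as outputs of the TensorRT engine.
  config.MarkTrtEngineOutputs({"fc_0.tmp_0", "fc_0.tmp_1"});
  auto predictor = paddle::CreatePaddlePredictor(config);
  // ... feed inputs and call predictor->Run(...) as in the test added below ...
  return 0;
}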
......@@ -240,6 +240,10 @@ struct Argument {
DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int);
DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int64_t);
DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);
DECL_ARGUMENT_FIELD(trt_mark_output, TRTMarkOutput, bool);
DECL_ARGUMENT_FIELD(trt_output_tensor_names,
TRTOutputTensorNames,
std::vector<std::string>);
DECL_ARGUMENT_FIELD(tensorrt_disabled_ops,
TensorRtDisabledOPs,
std::vector<std::string>);
......
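A rough, non-verbatim sketch of what DECL_ARGUMENT_FIELD expands to (the real macro in argument.h also tracks field validity); it only shows the getter/setter shape that the later hunks rely on when they call argument->trt_mark_output() and argument_->SetTRTMarkOutput(...):

// Simplified illustration only, not the actual Paddle macro.
#define DECL_ARGUMENT_FIELD(field__, Field, type__)    \
 public:                                               \
  type__& field__() { return field__##_; }             \
  void Set##Field(const type__& x) { field__##_ = x; } \
                                                       \
 private:                                              \
  type__ field__##_;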
......@@ -160,6 +160,10 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size()));
pass->Set("min_subgraph_size",
new int(argument->tensorrt_min_subgraph_size()));
pass->Set("mark_output", new bool(argument->trt_mark_output()));
pass->Set(
"output_tensor_names",
new std::vector<std::string>(argument->trt_output_tensor_names()));
pass->Set("program",
new framework::ProgramDesc *(&argument->main_program()));
pass->Set("predictor_id", new int(argument->predictor_id()));
......
......@@ -371,6 +371,40 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
// record the origin output data type
std::vector<int> origin_outputs_dtype;
std::map<std::string, int> map_origin_outputs_dtype;
// Whether to mark outputs
auto mark_output = Get<bool>("mark_output");
auto output_tensor_name =
Get<std::vector<std::string>>("output_tensor_names");
VLOG(1) << "mark Output: " << mark_output;
if (mark_output) {
VLOG(1) << "begin to mark output ...";
for (auto node : subgraph) {
if (node->NodeType() == Node::Type::kOperation) {
if (node->Op()->Outputs().count("Xshape")) continue;
for (auto *x : node->outputs) {
if (std::count(parameters.begin(), parameters.end(), x->Name()) > 0)
continue;
if (!output_tensor_name.empty() &&
std::count(output_tensor_name.begin(),
output_tensor_name.end(),
x->Name())) {
VLOG(1) << "output " << x->Name() << " has been marked";
std::string output_name_withid =
x->Name() + std::to_string(x->id());
output_names.insert(x->Name());
output_names_with_id.insert(output_name_withid);
origin_name_output_rank[x->Name()] = x->Var()->GetShape().size();
trt_outputs.insert(x);
map_origin_outputs_dtype[x->Name()] =
static_cast<int>(x->Var()->GetDataType());
}
}
}
}
}
for (auto *x : node->outputs) {
output_names.insert(x->Name());
output_names_with_id.insert(x->Name() + std::to_string(x->id()));
......
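The marking criteria above can be read as a predicate over each op output in the candidate subgraph: ops that produce an "Xshape" output are skipped entirely, outputs that are graph parameters are skipped, and a tensor is marked only when the user-supplied list is non-empty and contains its name. The helper below is a hypothetical restatement of that per-name condition (ShouldMarkAsTrtOutput is not a function in this PR), written out for readability:

#include <algorithm>
#include <string>
#include <vector>

// Hypothetical helper mirroring the condition added in the pass above.
bool ShouldMarkAsTrtOutput(const std::string& name,
                           const std::vector<std::string>& parameters,
                           const std::vector<std::string>& output_tensor_names) {
  // Weights/parameters are never treated as engine outputs.
  if (std::count(parameters.begin(), parameters.end(), name) > 0) return false;
  // Only names explicitly passed to MarkTrtEngineOutputs are marked.
  return !output_tensor_names.empty() &&
         std::count(output_tensor_names.begin(), output_tensor_names.end(),
                    name) > 0;
}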
......@@ -459,6 +459,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(tensorrt_max_batchsize_);
CP_MEMBER(tensorrt_min_subgraph_size_);
CP_MEMBER(tensorrt_precision_mode_);
CP_MEMBER(trt_mark_output_);
CP_MEMBER(trt_output_tensor_names_);
CP_MEMBER(trt_disabled_ops_);
CP_MEMBER(trt_use_dla_);
CP_MEMBER(trt_dla_core_);
......@@ -757,6 +759,12 @@ void AnalysisConfig::EnableTensorRtEngine(int64_t workspace_size,
#endif
}
void AnalysisConfig::MarkTrtEngineOutputs(
const std::vector<std::string> &output_tensor_names) {
trt_mark_output_ = true;
trt_output_tensor_names_ = output_tensor_names;
}
void AnalysisConfig::EnableTensorRTMemoryOptim(bool engine_memory_sharing,
int sharing_identifier) {
PADDLE_ENFORCE_EQ(
......@@ -1050,6 +1058,7 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << tensorrt_workspace_size_;
ss << tensorrt_max_batchsize_;
ss << tensorrt_min_subgraph_size_;
ss << trt_mark_output_;
ss << use_dlnne_;
ss << dlnne_min_subgraph_size_;
......@@ -1331,6 +1340,7 @@ std::string AnalysisConfig::Summary() {
}
os.InsertRow({"trt_engine_memory_sharing",
trt_engine_memory_sharing_ ? "true" : "false"});
os.InsertRow({"trt_mark_output", trt_mark_output_ ? "true" : "false"});
#endif
}
}
......
......@@ -1386,6 +1386,8 @@ void AnalysisPredictor::PrepareArgument() {
argument_->SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_);
argument_->SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
argument_->SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_);
argument_->SetTRTMarkOutput(config_.trt_mark_output_);
argument_->SetTRTOutputTensorNames(config_.trt_output_tensor_names_);
argument_->SetTensorRtDisabledOPs(config_.trt_disabled_ops_);
argument_->SetTensorRtUseDLA(config_.trt_use_dla_);
argument_->SetTensorRtDLACore(config_.trt_dla_core_);
......@@ -2695,8 +2697,7 @@ void AnalysisPredictor::SaveOptimModel(const std::string &dir) {
}
void AnalysisPredictor::RegisterInputHook(const InputTensorHookFunc &hookfunc) {
static std::once_flag register_input_hook_flag;
std::call_once(register_input_hook_flag, [this] {
std::call_once(register_input_hook_flag_, [this] {
executor_->RegisterInputHook(
[this](framework::OperatorBase *op, framework::Scope *scope) {
for (auto &input : op->Inputs()) {
......@@ -2719,8 +2720,7 @@ void AnalysisPredictor::RegisterInputHook(const InputTensorHookFunc &hookfunc) {
void AnalysisPredictor::RegisterOutputHook(
const OutputTensorHookFunc &hookfunc) {
static std::once_flag register_output_hook_flag;
std::call_once(register_output_hook_flag, [this] {
std::call_once(register_output_hook_flag_, [this] {
executor_->RegisterOutputHook(
[this](framework::OperatorBase *op, framework::Scope *scope) {
for (auto &output : op->Outputs()) {
......
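The hook-registration change above replaces function-local static std::once_flag objects, which are shared by every AnalysisPredictor in the process (so only the first predictor ever registered its hooks), with the per-instance member flags declared in the next hunk. A standalone sketch of the difference, using hypothetical class names rather than Paddle code:

#include <iostream>
#include <mutex>

// Hypothetical types illustrating the fixed bug; not Paddle code.
struct SharedFlagPredictor {
  void RegisterHook() {
    static std::once_flag flag;  // one flag for the whole process
    std::call_once(flag, [] { std::cout << "hook registered (shared)\n"; });
  }
};

struct MemberFlagPredictor {
  std::once_flag flag_;  // one flag per predictor instance
  void RegisterHook() {
    std::call_once(flag_, [] { std::cout << "hook registered (per instance)\n"; });
  }
};

int main() {
  SharedFlagPredictor a, b;
  a.RegisterHook();  // runs the callback
  b.RegisterHook();  // does nothing: the static flag was already consumed by `a`
  MemberFlagPredictor c, d;
  c.RegisterHook();  // runs the callback
  d.RegisterHook();  // also runs: each instance owns its own flag
  return 0;
}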
......@@ -571,6 +571,8 @@ class AnalysisPredictor : public PaddlePredictor {
std::map<size_t, std::string> idx2feeds_;
std::vector<framework::OpDesc *> fetches_;
std::map<size_t, std::string> idx2fetches_;
std::once_flag register_input_hook_flag_;
std::once_flag register_output_hook_flag_;
phi::DataType model_precision_{phi::DataType::FLOAT32};
......
......@@ -690,6 +690,13 @@ struct PD_INFER_DECL AnalysisConfig {
///
bool tensorrt_engine_enabled() const { return use_tensorrt_; }
///
/// \brief Mark tensors as outputs of the TensorRT engine so that their
/// intermediate results can be obtained after inference.
///
/// \param output_tensor_names The names of the tensors to be marked as
/// TensorRT engine outputs.
///
void MarkTrtEngineOutputs(
const std::vector<std::string>& output_tensor_names = {});
///
/// \brief Turn on the TensorRT memory optimization.
///
/// \param engine_memory_sharing Whether to enable TensorRT memory
......@@ -1204,6 +1211,8 @@ struct PD_INFER_DECL AnalysisConfig {
bool trt_use_cuda_graph_{false};
bool trt_use_varseqlen_{false};
bool trt_with_interleaved_{false};
bool trt_mark_output_{false};
std::vector<std::string> trt_output_tensor_names_{};
std::string tensorrt_transformer_posid_{""};
std::string tensorrt_transformer_maskid_{""};
bool trt_use_dla_{false};
......
......@@ -871,6 +871,9 @@ void BindAnalysisConfig(py::module *m) {
py::arg("disable_trt_plugin_fp16") = false)
.def("tensorrt_dynamic_shape_enabled",
&AnalysisConfig::tensorrt_dynamic_shape_enabled)
.def("mark_trt_engine_outputs",
&AnalysisConfig::MarkTrtEngineOutputs,
py::arg("output_tensor_names") = std::vector<std::string>({}))
.def("enable_tensorrt_varseqlen", &AnalysisConfig::EnableVarseqlen)
.def("tensorrt_varseqlen_enabled",
&AnalysisConfig::tensorrt_varseqlen_enabled)
......
......@@ -978,6 +978,14 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST)
paddle_inference_shared
ARGS
--infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
inference_analysis_test(
trt_mark_trt_engine_outputs_test
SRCS
trt_mark_trt_engine_outputs_test.cc
EXTRA_DEPS
paddle_inference_shared
ARGS
--infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
inference_analysis_test(
trt_fc_prelu_test
SRCS
......@@ -1370,6 +1378,8 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST)
set_tests_properties(test_trt_dynamic_shape_ernie_fp16_ser_deser
PROPERTIES TIMEOUT 300)
set_tests_properties(test_trt_dynamic_shape_ernie PROPERTIES TIMEOUT 480)
set_tests_properties(trt_mark_trt_engine_outputs_test PROPERTIES TIMEOUT
300)
endif()
if(WITH_MKLDNN)
......
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "gflags/gflags.h"
#include "test/cpp/inference/api/trt_test_helper.h"
namespace paddle {
namespace inference {
TEST(TensorRT, mark_trt_engine_outputs) {
std::string model_dir = FLAGS_infer_model + "/resnet50";
AnalysisConfig config;
config.SetModel(model_dir);
config.EnableUseGpu(100, 0);
config.EnableTensorRtEngine(
1 << 30, 1, 5, AnalysisConfig::Precision::kFloat32, false, false);
// The names of the tensors that need to be marked; the default is an empty
// list (mark all).
std::vector<std::string> markOutput = {"fc_0.tmp_0", "fc_0.tmp_1"};
config.MarkTrtEngineOutputs(markOutput);
std::vector<std::vector<PaddleTensor>> inputs_all;
auto predictor = CreatePaddlePredictor(config);
SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
std::vector<PaddleTensor> outputs;
for (auto &input : inputs_all) {
ASSERT_TRUE(predictor->Run(input, &outputs));
predictor->ClearIntermediateTensor();
}
}
} // namespace inference
} // namespace paddle