From db3239273ccba4b974c2e83b28a3fd40c0fa99e6 Mon Sep 17 00:00:00 2001
From: Yuanle Liu
Date: Tue, 1 Nov 2022 20:03:44 +0800
Subject: [PATCH] [Paddle Inference] add RegisterOutputHook interface (#47050)

---
 paddle/fluid/framework/naive_executor.cc       | 13 ++---
 paddle/fluid/framework/naive_executor.h        | 19 ++++----
 .../fluid/inference/api/analysis_predictor.cc  | 47 +++++++++++++++----
 .../fluid/inference/api/analysis_predictor.h   | 12 +++++
 .../api/analysis_predictor_tester.cc           | 47 +++++++++++++++++++
 paddle/fluid/inference/api/paddle_api.h        | 11 +++++
 .../inference/api/paddle_inference_api.h       | 10 ++++
 paddle/fluid/inference/api/paddle_tensor.h     |  7 +++
 paddle/fluid/pybind/inference_api.cc           |  5 +-
 9 files changed, 145 insertions(+), 26 deletions(-)

diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index eb4ad8d0daf..52ed842d74e 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -65,6 +65,9 @@ void NaiveExecutor::Run() {
 #ifdef PADDLE_WITH_INFERENCE_NVTX
     platform::CudaNvtxRangePop();
 #endif
+    if (hookfunc_) {
+      hookfunc_(op.get());
+    }
   }
 #ifdef PADDLE_WITH_INFERENCE_NVTX
   platform::CudaNvtxRangePop();
 #endif
@@ -142,14 +145,8 @@ phi::DenseTensor *NaiveExecutor::FindTensor(const std::string &name) {
   return tensor;
 }
 
-void NaiveExecutor::CleanFeedFetchOps() {
-  std::vector<std::unique_ptr<OperatorBase>> ops;
-  for (auto &op : ops_) {
-    if (op->Type() != "feed" && op->Type() != "fetch") {
-      ops.emplace_back(std::move(op));
-    }
-  }
-  ops_.swap(ops);
+void NaiveExecutor::RegisterOutputHook(const HookFunc &hookfunc) {
+  hookfunc_ = hookfunc;
 }
 
 NaiveExecutor::~NaiveExecutor() {
diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h
index 8ca3f5997af..882f50b451a 100644
--- a/paddle/fluid/framework/naive_executor.h
+++ b/paddle/fluid/framework/naive_executor.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <functional>
 #include <map>
 #include <memory>
 #include <string>
@@ -24,10 +25,6 @@
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/place.h"
 
-namespace phi {
-class DenseTensor;
-}  // namespace phi
-
 namespace paddle {
 namespace framework {
 
@@ -40,6 +37,8 @@ class Scope;
 
 class NaiveExecutor {
  public:
+  using HookFunc = std::function<void(OperatorBase*)>;
+
   explicit NaiveExecutor(const platform::Place& place) : place_(place) {}
 
   ~NaiveExecutor();
@@ -66,13 +65,13 @@ class NaiveExecutor {
   // Get an tensor to operating directly, without the need for feed_ops.
   phi::DenseTensor* FindTensor(const std::string& name);
 
-  Scope* scope() { return scope_; }
-
-  void CleanFeedFetchOps();
+  Scope* GetScope() { return scope_; }
 
   void ResetTrtOps(int num);
 
- protected:
+  void RegisterOutputHook(const HookFunc& hookfunc);
+
+ private:
   void CreateOps(const ProgramDesc& desc,
                  int block_id,
                  bool with_feed_fetch_ops);
@@ -81,7 +80,9 @@ class NaiveExecutor {
   const platform::Place place_;
   // Catch the required resource to avoid recreate.
   std::vector<std::unique_ptr<OperatorBase>> ops_;
-  Scope* scope_;
+  Scope* scope_{nullptr};
+
+  HookFunc hookfunc_{nullptr};
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 9197efc2a5e..280427cb4c8 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -32,6 +32,7 @@
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
+#include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/transfer_scope_cache.h"
 #include "paddle/fluid/framework/var_type_traits.h"
@@ -1557,10 +1558,10 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
   if (config_.dist_config().use_dist_model()) {
     scope = scope_.get();
   } else {
-    scope = executor_->scope();
+    scope = executor_->GetScope();
   }
 #else
-  scope = executor_->scope();
+  scope = executor_->GetScope();
 #endif
   PADDLE_ENFORCE_NOT_NULL(
       scope->FindVar(name),
@@ -1612,10 +1613,10 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
   if (config_.dist_config().use_dist_model()) {
     scope = scope_.get();
   } else {
-    scope = executor_->scope();
+    scope = executor_->GetScope();
   }
 #else
-  scope = executor_->scope();
+  scope = executor_->GetScope();
 #endif
   PADDLE_ENFORCE_NOT_NULL(
       scope->FindVar(name),
@@ -1997,7 +1998,7 @@ void AnalysisPredictor::ClearIntermediateTensor() {
   for (auto *var : global_block->AllVars()) {
     if (!IsPersistable(var)) {
       const std::string name = var->Name();
-      auto *variable = executor_->scope()->FindVar(name);
+      auto *variable = executor_->GetScope()->FindVar(name);
       if (variable != nullptr && variable->IsType<phi::DenseTensor>() &&
           name != "feed" && name != "fetch") {
         VLOG(3) << "Clear Intermediate Tensor: " << name;
@@ -2178,6 +2179,33 @@ void AnalysisPredictor::SaveOptimModel(const std::string &dir) {
   exe.Run(save_program, scope(), 0, true, true);
 }
 
+void AnalysisPredictor::RegisterOutputHook(const Exp_OutputHookFunc &hookfunc) {
+  if (config_.enable_memory_optim()) {
+    LOG(WARNING) << "If you want to run output hook function, you should "
+                    "use config.EnableMemoryOptim(false) to turn off memory "
+                    "reuse!";
+    return;
+  }
+  static std::once_flag register_hook_flag;
+  std::call_once(register_hook_flag, [this] {
+    executor_->RegisterOutputHook([this](framework::OperatorBase *op) {
+      for (auto &output : op->Outputs()) {
+        for (auto &var_name : output.second) {
+          auto *var = this->sub_scope_->FindVar(var_name);
+          if (!var || !var->IsType<phi::DenseTensor>()) continue;
+          auto dense_tensor = var->Get<phi::DenseTensor>();
+          if (!dense_tensor.initialized()) continue;
+          auto tensor = this->GetOutputTensor(var_name);
+          for (auto &hookfunc : this->hookfuncs_) {
+            hookfunc(op->Type(), var_name, *tensor);
+          }
+        }
+      }
+    });
+  });
+  hookfuncs_.push_back(hookfunc);
+}
+
 template <>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<AnalysisConfig>(
     const AnalysisConfig &config) {
@@ -2371,6 +2399,10 @@ void Predictor::ClearIntermediateTensor() {
 
 uint64_t Predictor::TryShrinkMemory() { return predictor_->TryShrinkMemory(); }
 
+void Predictor::RegisterOutputHook(const Exp_OutputHookFunc &hookfunc) {
+  predictor_->RegisterOutputHook(hookfunc);
+}
+
 void *Predictor::GetExecStream() const { return predictor_->GetExecStream(); }
 
 int GetNumBytesOfDataType(DataType dtype) {
@@ -2452,10 +2484,9 @@ PredictorPool::PredictorPool(const Config &config, size_t size) {
   for (size_t i = 0; i < size - 1; i++) {
     if (config.tensorrt_engine_enabled()) {
       Config config_tmp(copy_config);
-      preds_.push_back(
-          std::move(std::unique_ptr<Predictor>(new Predictor(config_tmp))));
+      preds_.emplace_back(new Predictor(config_tmp));
     } else {
-      preds_.push_back(std::move(main_pred_->Clone()));
+      preds_.emplace_back(main_pred_->Clone());
     }
   }
 }
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index d1dd921db14..37d1511fa27 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -272,6 +272,16 @@ class AnalysisPredictor : public PaddlePredictor {
   ///
   std::string GetSerializedProgram() const override;
 
+  ///
+  /// \brief Register an output hook function to operate on the intermediate
+  /// tensors of op outputs. When using this function, memory reuse should be
+  /// turned off. The hook function signature is void(const std::string&,
+  /// const std::string&, const Tensor&). Here, the first parameter is the
+  /// op's type, the second parameter is the output variable name of the op,
+  /// and the third parameter is the output tensor with that variable name.
+  ///
+  void RegisterOutputHook(const Exp_OutputHookFunc &hookfunc) override;
+
   ///
   /// \brief Initialize mkldnn quantizer and execute mkldnn quantization pass
   ///
@@ -510,6 +520,8 @@ class AnalysisPredictor : public PaddlePredictor {
   int predictor_id_;
 
  private:
+  std::vector<Exp_OutputHookFunc> hookfuncs_;
+
   // Some status here that help to determine the status inside the predictor.
   bool status_is_cloned_{false};
 
diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc
index 8856ceb61a7..5cba8f06ab9 100644
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -611,4 +611,51 @@ TEST(Predictor, Streams) {
 }
 #endif
 
+TEST(AnalysisPredictor, OutputHookFunc) {
+  auto hookfunc = [](const std::string& type,
+                     const std::string& var_name,
+                     const Tensor& tensor) { LOG(INFO) << "in hook function"; };
+
+  {
+    Config config;
+    config.SetModel(FLAGS_dirname);
+    config.EnableUseGpu(100, 0);
+
+    auto predictor = CreatePredictor(config);
+
+    predictor->RegisterOutputHook(hookfunc);
+    auto w0 = predictor->GetInputHandle("firstw");
+    auto w1 = predictor->GetInputHandle("secondw");
+    auto w2 = predictor->GetInputHandle("thirdw");
+    auto w3 = predictor->GetInputHandle("forthw");
+    w0->Reshape({4, 1});
+    w1->Reshape({4, 1});
+    w2->Reshape({4, 1});
+    w3->Reshape({4, 1});
+    auto* w0_data = w0->mutable_data<int64_t>(PlaceType::kCPU);
+    auto* w1_data = w1->mutable_data<int64_t>(PlaceType::kCPU);
+    auto* w2_data = w2->mutable_data<int64_t>(PlaceType::kCPU);
+    auto* w3_data = w3->mutable_data<int64_t>(PlaceType::kCPU);
+    for (int i = 0; i < 4; i++) {
+      w0_data[i] = i;
+      w1_data[i] = i;
+      w2_data[i] = i;
+      w3_data[i] = i;
+    }
+    predictor->Run();
+    predictor->TryShrinkMemory();
+  }
+
+  {
+    Config config;
+    config.SetModel(FLAGS_dirname);
+    config.EnableMemoryOptim();
+    config.EnableUseGpu(100, 0);
+
+    auto predictor = CreatePredictor(config);
+
+    predictor->RegisterOutputHook(hookfunc);
+  }
+}
+
 }  // namespace paddle_infer
diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h
index ffb634ce829..ff1ec1eba30 100644
--- a/paddle/fluid/inference/api/paddle_api.h
+++ b/paddle/fluid/inference/api/paddle_api.h
@@ -38,6 +38,7 @@ namespace paddle {
 using PaddleDType = paddle_infer::DataType;
 using PaddlePlace = paddle_infer::PlaceType;
 using PaddleDataLayout = paddle_infer::DataLayout;
+using paddle_infer::Exp_OutputHookFunc;
 
 /// \brief Memory manager for PaddleTensor.
 ///
@@ -289,6 +290,16 @@ class PD_INFER_DECL PaddlePredictor {
   ///
   virtual uint64_t TryShrinkMemory() { return 0; }
 
+  ///
+  /// \brief Register an output hook function to operate on the intermediate
+  /// tensors of op outputs. When using this function, memory reuse should be
+  /// turned off. The hook function signature is void(const std::string&,
+  /// const std::string&, const Tensor&). Here, the first parameter is the
+  /// op's type, the second parameter is the output variable name of the op,
+  /// and the third parameter is the output tensor with that variable name.
+  ///
+  virtual void RegisterOutputHook(const Exp_OutputHookFunc& hookfunc) {}
+
   /// \brief Clone an existing predictor
   /// When using clone, the same network will be created,
   /// and the parameters between them are shared.
diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h
index 055cf3a13fb..1a52c011b2a 100644
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -157,6 +157,16 @@ class PD_INFER_DECL Predictor {
   ///
   uint64_t TryShrinkMemory();
 
+  ///
+  /// \brief Register an output hook function to operate on the intermediate
+  /// tensors of op outputs. When using this function, memory reuse should be
+  /// turned off. The hook function signature is void(const std::string&,
+  /// const std::string&, const Tensor&). Here, the first parameter is the
+  /// op's type, the second parameter is the output variable name of the op,
+  /// and the third parameter is the output tensor with that variable name.
+  ///
+  void RegisterOutputHook(const Exp_OutputHookFunc& hookfunc);
+
   ///
   /// \brief Get the execution stream on devices with a concept of stream,
   /// otherwise returns nullptr.
diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h
index b10f051d6e4..9bc95f251eb 100644
--- a/paddle/fluid/inference/api/paddle_tensor.h
+++ b/paddle/fluid/inference/api/paddle_tensor.h
@@ -14,7 +14,10 @@
 
 #pragma once
 
+#include <functional>
+#include <memory>
 #include <string>
+#include <vector>
 
 #include "paddle_infer_declare.h"  // NOLINT
 
@@ -29,6 +32,10 @@ namespace paddle_infer {
 /// Strings for text data.
 using Strings = std::vector<std::string>;
 
+class Tensor;
+using Exp_OutputHookFunc =
+    std::function<void(const std::string&, const std::string&, const Tensor&)>;
+
 typedef void (*CallbackFunc)(void*);
 
 #if defined(PADDLE_WITH_TESTING) && defined(PADDLE_WITH_INFERENCE_API_TEST)
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index 5d2a5799078..9b99cad8693 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/pybind/inference_api.h"
 
+#include <pybind11/functional.h>
 #include <pybind11/numpy.h>
 #include <pybind11/stl.h>
 
@@ -946,7 +947,9 @@ void BindPaddleInferPredictor(py::module *m) {
 #endif
       .def("try_shrink_memory", &paddle_infer::Predictor::TryShrinkMemory)
      .def("clear_intermediate_tensor",
-           &paddle_infer::Predictor::ClearIntermediateTensor);
+           &paddle_infer::Predictor::ClearIntermediateTensor)
+      .def("register_output_hook",
+           &paddle_infer::Predictor::RegisterOutputHook);
 }
 
 void BindZeroCopyTensor(py::module *m) {
-- 
GitLab
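
For context beyond the diff itself, the following is a minimal C++ usage sketch of the interface this patch adds (illustrative only, not part of the patch). It assumes a Paddle Inference build that already contains this change; the model directory "./sample_model" is a hypothetical placeholder.

// Minimal usage sketch of RegisterOutputHook (illustrative, not part of the patch).
#include <iostream>
#include <string>

#include "paddle_inference_api.h"

int main() {
  paddle_infer::Config config;
  config.SetModel("./sample_model");  // hypothetical model directory
  // Memory reuse must stay off: per this patch, RegisterOutputHook warns and
  // ignores the hook when EnableMemoryOptim() has been called.

  auto predictor = paddle_infer::CreatePredictor(config);

  // The hook receives the op type, the output variable name, and the output
  // tensor, and is invoked after each operator finishes running.
  predictor->RegisterOutputHook([](const std::string& op_type,
                                   const std::string& var_name,
                                   const paddle_infer::Tensor& tensor) {
    std::cout << op_type << " -> " << var_name << ", shape: [";
    for (int d : tensor.shape()) std::cout << d << " ";
    std::cout << "]" << std::endl;
  });

  // Feed inputs via predictor->GetInputHandle(...) as usual, then:
  // predictor->Run();
  return 0;
}

From Python, the same callback can be attached through the register_output_hook binding added to the Predictor class in inference_api.cc above.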