From aa892113395a73f6789508b9ebb2e91213a5450a Mon Sep 17 00:00:00 2001
From: Wilber
Date: Thu, 1 Dec 2022 19:52:53 +0800
Subject: [PATCH] [Inference] Optimize memory_optimize pass. (#48476)

* update memory_optimize pass
---
 paddle/fluid/framework/naive_executor.cc       | 67 ++++++++++++++++++-
 paddle/fluid/framework/naive_executor.h        | 11 ++-
 .../fluid/inference/analysis/CMakeLists.txt    |  2 +-
 paddle/fluid/inference/analysis/argument.h     |  1 +
 .../inference/analysis/ir_pass_manager.cc      |  2 +
 .../inference/analysis/pass_result_info.cc     | 15 +++++
 .../inference/analysis/pass_result_info.h      | 66 ++++++++++++++++++
 .../analysis/passes/memory_optimize_pass.cc    |  9 ++-
 .../fluid/inference/api/analysis_predictor.cc  | 22 ++++--
 .../fluid/inference/api/analysis_predictor.h   |  2 +-
 10 files changed, 183 insertions(+), 14 deletions(-)
 create mode 100644 paddle/fluid/inference/analysis/pass_result_info.cc
 create mode 100644 paddle/fluid/inference/analysis/pass_result_info.h

diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index 57e9a175b16..6c0daef26ff 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -15,8 +15,11 @@
 #include "paddle/fluid/framework/naive_executor.h"
 
 #include <string>
+#include <unordered_map>
+#include <unordered_set>
 
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/platform/denormal.h"
 #ifdef PADDLE_WITH_MKLDNN
@@ -61,12 +64,31 @@ void NaiveExecutor::Run() {
 #ifdef PADDLE_WITH_INFERENCE_NVTX
     platform::CudaNvtxRangePush(op->Type(), platform::NvtxRangeColor::Green);
 #endif
+
+    // According to the reuse table, share the output tensor's holder.
+    if (reuse_cache_.count(op.get())) {
+      for (auto &it : reuse_cache_[op.get()]) {
+        it.first->ShareBufferWith(*cluster_buffer_[it.second]);
+      }
+    }
+
     op->Run(*scope_, place_);
+
+    // Update the shared holder so that it only records the max-size tensor.
+    if (reuse_cache_.count(op.get())) {
+      for (auto &it : reuse_cache_[op.get()]) {
+        if (it.first->memory_size() >
+            cluster_buffer_[it.second]->memory_size()) {
+          cluster_buffer_[it.second] = it.first;
+        }
+      }
+    }
+
 #ifdef PADDLE_WITH_INFERENCE_NVTX
     platform::CudaNvtxRangePop();
 #endif
-    if (hookfunc_) {
-      hookfunc_(op.get());
+    for (auto &func : hookfunc_) {
+      func(op.get());
     }
   }
 #ifdef PADDLE_WITH_INFERENCE_NVTX
@@ -146,7 +168,46 @@ phi::DenseTensor *NaiveExecutor::FindTensor(const std::string &name) {
 }
 
 void NaiveExecutor::RegisterOutputHook(const HookFunc &hookfunc) {
-  hookfunc_ = hookfunc;
+  hookfunc_.push_back(hookfunc);
+}
+
+void NaiveExecutor::MakeReusePlan(
+    const std::unordered_map<std::string, std::string> &reuse_table) {
+  std::unordered_map<std::string, std::unordered_set<std::string>> clusters;
+  for (auto &it : reuse_table) {
+    clusters[it.second].insert(it.first);
+  }
+
+  std::vector<std::string> cluster_names;
+  for (auto &it : clusters) {
+    cluster_names.push_back(it.first);
+  }
+  cluster_buffer_.resize(cluster_names.size());
+
+  for (auto &op : ops_) {
+    for (auto &name : op->OutputVars(true)) {
+      if (reuse_table.count(name)) {
+        const auto &reuse_name = reuse_table.at(name);
+        auto it =
+            std::find(cluster_names.begin(), cluster_names.end(), reuse_name);
+        int idx = it - cluster_names.begin();
+        auto *var = scope_->FindVar(name);
+        auto *reuse_var = scope_->FindVar(reuse_name);
+        if (var && reuse_var && var->IsType<phi::DenseTensor>() &&
+            reuse_var->IsType<phi::DenseTensor>()) {
+          auto *tensor = var->GetMutable<phi::DenseTensor>();
+          auto *reuse_tensor = reuse_var->GetMutable<phi::DenseTensor>();
+          cluster_buffer_[idx] = reuse_tensor;
+          if (reuse_cache_.count(op.get())) {
+            reuse_cache_[op.get()].emplace(tensor, idx);
+          } else {
+            reuse_cache_[op.get()] =
+                std::unordered_map<phi::DenseTensor *, int>{{tensor, idx}};
+          }
+        }
+      }
+    }
+  }
 }
 
 NaiveExecutor::~NaiveExecutor() {
diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h
index 882f50b451a..f1a4a036cde 100644
--- a/paddle/fluid/framework/naive_executor.h
+++ b/paddle/fluid/framework/naive_executor.h
@@ -17,6 +17,7 @@
 #include <map>
 #include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 
 #include "paddle/fluid/framework/operator.h"
@@ -67,6 +68,9 @@ class NaiveExecutor {
 
   Scope* GetScope() { return scope_; }
 
+  void MakeReusePlan(
+      const std::unordered_map<std::string, std::string>& reuse_table);
+
   void ResetTrtOps(int num);
 
   void RegisterOutputHook(const HookFunc& hookfunc);
@@ -82,7 +86,12 @@ class NaiveExecutor {
   std::vector<std::unique_ptr<OperatorBase>> ops_;
   Scope* scope_{nullptr};
 
-  HookFunc hookfunc_{nullptr};
+  std::vector<HookFunc> hookfunc_;
+
+  // Record information that tensor_a should ShareBufferWith tensor_b.
+  std::unordered_map<framework::OperatorBase*,
+                     std::unordered_map<phi::DenseTensor*, int>>
+      reuse_cache_;
+  std::vector<phi::DenseTensor*> cluster_buffer_;
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index 10d67c69f13..06c4a55c5c9 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -20,7 +20,7 @@ cc_library(
 
 cc_library(
   ir_pass_manager
-  SRCS ir_pass_manager.cc
+  SRCS ir_pass_manager.cc pass_result_info.cc
   DEPS graph pass ${INFER_IR_PASSES} analysis_helper)
 
 cc_library(
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index fd5ba90eefb..a8d1067c554 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -139,6 +139,7 @@ struct Argument {
     unique_ptr_t field__##_;
 
   DECL_ARGUMENT_FIELD(predictor_id, PredictorID, int);
+  DECL_ARGUMENT_FIELD(root_predictor_id, RootPredictorID, int);
   // Model path
   DECL_ARGUMENT_FIELD(model_dir, ModelDir, std::string);
   // Model specified with program and parameters files.
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index 862a019da6d..b31f28a6a60 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -229,6 +229,8 @@ void IRPassManager::CreatePasses(Argument *argument,
                                argument->dlnne_input_shape_dict()));
       pass->Set("program",
                 new framework::ProgramDesc *(&argument->main_program()));
+    } else if (pass_name == "memory_optimize_pass") {
+      pass->Set("root_predictor_id", new int(argument->root_predictor_id()));
     }
     if (pass_name == "lite_subgraph_pass") {
       bool lite_enable_int8 =
diff --git a/paddle/fluid/inference/analysis/pass_result_info.cc b/paddle/fluid/inference/analysis/pass_result_info.cc
new file mode 100644
index 00000000000..d22d208588f
--- /dev/null
+++ b/paddle/fluid/inference/analysis/pass_result_info.cc
@@ -0,0 +1,15 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/pass_result_info.h"
diff --git a/paddle/fluid/inference/analysis/pass_result_info.h b/paddle/fluid/inference/analysis/pass_result_info.h
new file mode 100644
index 00000000000..7e42573e959
--- /dev/null
+++ b/paddle/fluid/inference/analysis/pass_result_info.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "paddle/phi/core/enforce.h"
+#include "paddle/utils/variant.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+class PassResultInfoForRuntime {
+ public:
+  using PassInfo =
+      paddle::variant<bool, int, std::string, std::vector<std::string>,
+                      std::unordered_map<std::string, std::string>>;
+
+  static PassResultInfoForRuntime* Instance() {
+    static PassResultInfoForRuntime info;
+    return &info;
+  }
+
+  template <typename T>
+  void Set(int predictor_id, const std::string& pass_name, T infos) {
+    map[predictor_id].emplace(pass_name, infos);
+  }
+
+  template <typename T>
+  T Get(int predictor_id, const std::string& pass_name) {
+    PADDLE_ENFORCE_EQ(
+        map.count(predictor_id) && map[predictor_id].count(pass_name),
+        true,
+        phi::errors::InvalidArgument(
+            "Cannot find predictor_id %d and pass_name %s",
+            predictor_id,
+            pass_name));
+    return PADDLE_GET_CONST(T, map[predictor_id][pass_name]);
+  }
+
+ private:
+  using PassResultInfoMap =
+      std::unordered_map<int, std::unordered_map<std::string, PassInfo>>;
+  PassResultInfoMap map;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
index 63aaa7d9796..2ff82986e94 100644
--- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
@@ -19,6 +19,7 @@
 
 #include "glog/logging.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/inference/analysis/pass_result_info.h"
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
@@ -310,7 +311,7 @@ void MemoryOptimizePass::RunImpl(Argument* argument) {
   // mapping table.
   if (!argument->enable_memory_optim()) return;
   // Because of pass is a singleton, graph can not be member
-  // variables,otherwise,errors will be caused under multithreading
+  // variables, otherwise, errors will be caused under multithreading
   // conditions.
   auto graph = argument->main_graph_ptr();
 
@@ -323,7 +324,11 @@
   CollectLifeCycle(graph, &lifecycles, sort_kind);
   CollectVarMemorySize(graph, &space_table);
   MakeSimpleReusePlan(lifecycles, space_table, &node2cluster, &cluster_size);
-  UpdateOpDescsByReuse(graph, node2cluster, sort_kind);
+
+  auto* pass_res_info = PassResultInfoForRuntime::Instance();
+  pass_res_info->Set(
+      argument->root_predictor_id(), "memory_optimize_pass", node2cluster);
+
   return;
 }
 
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 6a23f11e452..1c27c008d8c 100755
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -38,6 +38,7 @@
 #include "paddle/fluid/framework/var_type_traits.h"
 #include "paddle/fluid/framework/version.h"
 #include "paddle/fluid/inference/analysis/helper.h"
+#include "paddle/fluid/inference/analysis/pass_result_info.h"
 #include "paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h"
 #include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
 #include "paddle/fluid/inference/api/helper.h"
@@ -262,6 +263,10 @@ bool AnalysisPredictor::Init(
                   "generated.";
   }
 
+  if (!status_is_cloned_) {
+    root_predictor_id_ = predictor_id_;
+  }
+
   // no matter with or without MKLDNN
   paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
 
@@ -615,6 +620,15 @@ bool AnalysisPredictor::PrepareExecutor() {
   executor_->Prepare(
       sub_scope_, *inference_program_, 0, config_.use_feed_fetch_ops_);
 
+  if (config_.enable_memory_optim_) {
+    auto *pass_res_info =
+        inference::analysis::PassResultInfoForRuntime::Instance();
+    auto reuse_table =
+        pass_res_info->Get<std::unordered_map<std::string, std::string>>(
+            root_predictor_id_, "memory_optimize_pass");
+    executor_->MakeReusePlan(reuse_table);
+  }
+
   PADDLE_ENFORCE_NOT_NULL(sub_scope_,
                           platform::errors::PreconditionNotMet(
                               "The sub_scope should not be nullptr."));
@@ -1079,6 +1093,7 @@ void AnalysisPredictor::PrepareArgument() {
   argument_.SetModelFromMemory(config_.model_from_memory_);
   // Analyze inference_program
   argument_.SetPredictorID(predictor_id_);
+  argument_.SetRootPredictorID(root_predictor_id_);
   argument_.SetOptimCacheDir(config_.opt_cache_dir_);
   if (!config_.model_dir().empty()) {
     argument_.SetModelDir(config_.model_dir());
@@ -2114,6 +2129,7 @@ std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone(void *stream) {
   std::lock_guard<std::mutex> lk(clone_mutex_);
   auto *x = new AnalysisPredictor(config_);
   x->status_is_cloned_ = true;
+  x->root_predictor_id_ = this->root_predictor_id_;
   if (config_.use_external_stream_ && stream == nullptr) {
     PADDLE_THROW(platform::errors::InvalidArgument(
         "config has been configured to use external stream, but the Clone "
@@ -2175,12 +2191,6 @@ void AnalysisPredictor::SaveOptimModel(const std::string &dir) {
 }
 
 void AnalysisPredictor::RegisterOutputHook(const Exp_OutputHookFunc &hookfunc) {
-  if (config_.enable_memory_optim()) {
-    LOG(WARNING) << "If you want to run output hook function, you should "
-                    "use config.EnableMemoryOptim(false) to turn off memory "
-                    "reuse!";
-    return;
-  }
   static std::once_flag register_hook_flag;
   std::call_once(register_hook_flag, [this] {
     executor_->RegisterOutputHook([this](framework::OperatorBase *op) {
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index 37d1511fa27..25595d12cb4 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -102,7 +102,6 @@ class AnalysisPredictor : public PaddlePredictor {
   explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {
     if (config_.shape_range_info_collected()) {
       config_.SwitchIrOptim(false);
-      config_.EnableMemoryOptim(false);
     }
     predictor_id_ = inference::GetUniqueId();
   }
@@ -518,6 +517,7 @@ class AnalysisPredictor : public PaddlePredictor {
   int need_collect_var_shapes_{-1};  // -1 for default, 0 for false, 1 for true.
   std::vector<std::map<std::string, std::vector<int32_t>>> batch_var_shapes_;
   int predictor_id_;
+  int root_predictor_id_{-1};
 
  private:
   std::vector<Exp_OutputHookFunc> hookfuncs_;
-- 
GitLab
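
Note on the runtime half of this patch: around each op's Run(), reused outputs first borrow the holder of their cluster's current representative tensor, and after the op runs the representative is updated to the largest tensor seen so far, so later ops in the cluster always share a buffer that is big enough. The analysis pass now only records the variable-to-cluster mapping through PassResultInfoForRuntime and no longer rewrites op descs, deferring the actual sharing to NaiveExecutor. The standalone sketch below models that bookkeeping outside Paddle; Tensor, Op, and the holder pointer are simplified, hypothetical stand-ins for phi::DenseTensor, framework::OperatorBase, and the real shared allocation, not actual Paddle APIs.

    #include <cstddef>
    #include <unordered_map>
    #include <vector>

    // Simplified stand-in for phi::DenseTensor: only storage sharing and size.
    struct Tensor {
      std::vector<char> *holder = nullptr;  // storage shared between reused tensors
      std::size_t bytes = 0;
      std::size_t memory_size() const { return bytes; }
      void ShareBufferWith(const Tensor &other) { holder = other.holder; }
    };

    // Simplified stand-in for framework::OperatorBase.
    struct Op {
      void Run() { /* a real op would write its outputs here */ }
    };

    int main() {
      // One representative tensor per reuse cluster, as in cluster_buffer_.
      std::vector<Tensor *> cluster_buffer;
      // Per-op map from output tensor to cluster index, as in reuse_cache_.
      std::unordered_map<Op *, std::unordered_map<Tensor *, int>> reuse_cache;
      std::vector<Op *> ops;  // assume populated, like NaiveExecutor::ops_

      for (Op *op : ops) {
        // Before Run: reused outputs borrow the cluster's current holder.
        if (reuse_cache.count(op)) {
          for (auto &it : reuse_cache[op]) {
            it.first->ShareBufferWith(*cluster_buffer[it.second]);
          }
        }
        op->Run();
        // After Run: keep the largest tensor as the cluster's representative.
        if (reuse_cache.count(op)) {
          for (auto &it : reuse_cache[op]) {
            if (it.first->memory_size() >
                cluster_buffer[it.second]->memory_size()) {
              cluster_buffer[it.second] = it.first;
            }
          }
        }
      }
      return 0;
    }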