BaiXuePrincess / Paddle
Forked from PaddlePaddle / Paddle (in sync with the upstream project)
Commit aa892113 (unverified)
Authored by Wilber on Dec 01, 2022; committed via GitHub on Dec 01, 2022
[Inference] Optimize memory_optimize pass. (#48476)
* update memory_optimize pass
Parent: 93099bb8
Showing 10 changed files with 183 additions and 14 deletions (+183, -14):
paddle/fluid/framework/naive_executor.cc                         +64  -3
paddle/fluid/framework/naive_executor.h                          +10  -1
paddle/fluid/inference/analysis/CMakeLists.txt                    +1  -1
paddle/fluid/inference/analysis/argument.h                        +1  -0
paddle/fluid/inference/analysis/ir_pass_manager.cc                +2  -0
paddle/fluid/inference/analysis/pass_result_info.cc (new)        +15  -0
paddle/fluid/inference/analysis/pass_result_info.h (new)         +66  -0
paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc    +7  -2
paddle/fluid/inference/api/analysis_predictor.cc                 +16  -6
paddle/fluid/inference/api/analysis_predictor.h                   +1  -1
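In outline (as reconstructed from the diff below), the patch replaces compile-time variable renaming with runtime buffer sharing. memory_optimize_pass now only computes the reuse mapping (variable name to the representative variable whose buffer it reuses) and publishes it through a new PassResultInfoForRuntime singleton, keyed by the predictor's root_predictor_id. When the predictor is prepared, AnalysisPredictor::PrepareExecutor() fetches that table and hands it to NaiveExecutor::MakeReusePlan(), which groups variables into clusters with one shared buffer slot per cluster. During NaiveExecutor::Run(), each op's clustered output tensors share the cluster's current buffer before the op executes, and afterwards the slot is updated to point at the largest tensor seen so far. Because variable names in the program are left untouched, output hooks no longer conflict with memory reuse, which is why the EnableMemoryOptim warning in RegisterOutputHook is dropped below.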
paddle/fluid/framework/naive_executor.cc
```diff
@@ -15,8 +15,11 @@
 #include "paddle/fluid/framework/naive_executor.h"

 #include <string>
+#include <unordered_map>
+#include <unordered_set>

 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/platform/denormal.h"
 #ifdef PADDLE_WITH_MKLDNN
```
```diff
@@ -61,12 +64,31 @@ void NaiveExecutor::Run() {
 #ifdef PADDLE_WITH_INFERENCE_NVTX
     platform::CudaNvtxRangePush(op->Type(), platform::NvtxRangeColor::Green);
 #endif
+    // According to reuse table, we share the out tensor's holder.
+    if (reuse_cache_.count(op.get())) {
+      for (auto &it : reuse_cache_[op.get()]) {
+        it.first->ShareBufferWith(*cluster_buffer_[it.second]);
+      }
+    }
     op->Run(*scope_, place_);
+    // Update the shared_holder so that only records the max one.
+    if (reuse_cache_.count(op.get())) {
+      for (auto &it : reuse_cache_[op.get()]) {
+        if (it.first->memory_size() >
+            cluster_buffer_[it.second]->memory_size()) {
+          cluster_buffer_[it.second] = it.first;
+        }
+      }
+    }
 #ifdef PADDLE_WITH_INFERENCE_NVTX
     platform::CudaNvtxRangePop();
 #endif
-    if (hookfunc_) {
-      hookfunc_(op.get());
+    for (auto &func : hookfunc_) {
+      func(op.get());
     }
 #ifdef PADDLE_WITH_INFERENCE_NVTX
```
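To make the hunk above concrete, here is a minimal standalone sketch of the grow-only cluster buffer, using a hypothetical Tensor type in place of phi::DenseTensor (whose real ShareBufferWith and memory_size are assumed to behave analogously):

```cpp
#include <cstddef>
#include <iostream>
#include <memory>
#include <vector>

// Hypothetical stand-in for phi::DenseTensor: a shared allocation plus size.
struct Tensor {
  std::shared_ptr<std::vector<char>> holder;
  size_t memory_size() const { return holder ? holder->size() : 0; }
  void ShareBufferWith(const Tensor &other) { holder = other.holder; }
};

int main() {
  // Two outputs placed in the same reuse cluster (non-overlapping lifetimes).
  Tensor a, b;
  a.holder = std::make_shared<std::vector<char>>(256);  // first op's output
  b.holder = std::make_shared<std::vector<char>>(64);   // second op's output

  Tensor *cluster_buffer = &a;  // largest tensor recorded so far

  // Before the second op runs: its output shares the cluster's holder,
  // so b now reuses a's 256-byte allocation instead of its own 64 bytes.
  b.ShareBufferWith(*cluster_buffer);

  // After the op runs: keep whichever tensor is larger as the cluster's
  // buffer, so the shared allocation only ever grows.
  if (b.memory_size() > cluster_buffer->memory_size()) cluster_buffer = &b;

  std::cout << cluster_buffer->memory_size() << "\n";  // prints 256
}
```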
```diff
@@ -146,7 +168,46 @@ phi::DenseTensor *NaiveExecutor::FindTensor(const std::string &name) {
 }

 void NaiveExecutor::RegisterOutputHook(const HookFunc &hookfunc) {
-  hookfunc_ = hookfunc;
+  hookfunc_.push_back(hookfunc);
+}
+
+void NaiveExecutor::MakeReusePlan(
+    const std::unordered_map<std::string, std::string> &reuse_table) {
+  std::unordered_map<std::string, std::unordered_set<std::string>> clusters;
+  for (auto &it : reuse_table) {
+    clusters[it.second].insert(it.first);
+  }
+
+  std::vector<std::string> cluster_names;
+  for (auto &it : clusters) {
+    cluster_names.push_back(it.first);
+  }
+  cluster_buffer_.resize(cluster_names.size());
+
+  for (auto &op : ops_) {
+    for (auto &name : op->OutputVars(true)) {
+      if (reuse_table.count(name)) {
+        const auto &reuse_name = reuse_table.at(name);
+        auto it =
+            std::find(cluster_names.begin(), cluster_names.end(), reuse_name);
+        int idx = it - cluster_names.begin();
+        auto *var = scope_->FindVar(name);
+        auto *reuse_var = scope_->FindVar(reuse_name);
+        if (var && reuse_var && var->IsType<phi::DenseTensor>() &&
+            reuse_var->IsType<phi::DenseTensor>()) {
+          auto *tensor = var->GetMutable<phi::DenseTensor>();
+          auto *reuse_tensor = reuse_var->GetMutable<phi::DenseTensor>();
+          cluster_buffer_[idx] = reuse_tensor;
+          if (reuse_cache_.count(op.get())) {
+            reuse_cache_[op.get()].emplace(tensor, idx);
+          } else {
+            reuse_cache_[op.get()] =
+                std::unordered_map<phi::DenseTensor *, int>{{tensor, idx}};
+          }
+        }
+      }
+    }
+  }
 }

 NaiveExecutor::~NaiveExecutor() {
```
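The clustering step above can be exercised in isolation. A small sketch with pure STL containers and made-up variable names (conv_out, relu_out, and friends are illustrative, not from the patch):

```cpp
#include <algorithm>
#include <iostream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

int main() {
  // reuse_table maps each variable to the representative variable whose
  // buffer it reuses, exactly the shape MakeReusePlan receives.
  std::unordered_map<std::string, std::string> reuse_table = {
      {"relu_out", "conv_out"},   // relu_out reuses conv_out's buffer
      {"scale_out", "conv_out"},  // so does scale_out
      {"fc_out", "matmul_out"}};  // fc_out reuses matmul_out's buffer

  // Invert it: representative -> set of variables in its cluster.
  std::unordered_map<std::string, std::unordered_set<std::string>> clusters;
  for (auto &it : reuse_table) clusters[it.second].insert(it.first);

  // One buffer slot per cluster; a variable's slot index is the position of
  // its representative in cluster_names, found with std::find as in the patch.
  std::vector<std::string> cluster_names;
  for (auto &it : clusters) cluster_names.push_back(it.first);

  auto pos = std::find(cluster_names.begin(), cluster_names.end(),
                       reuse_table.at("relu_out"));
  std::cout << "relu_out -> slot " << (pos - cluster_names.begin()) << "\n";
}
```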
paddle/fluid/framework/naive_executor.h
```diff
@@ -17,6 +17,7 @@
 #include <functional>
 #include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>

 #include "paddle/fluid/framework/operator.h"
```
```diff
@@ -67,6 +68,9 @@ class NaiveExecutor {
   Scope *GetScope() { return scope_; }

+  void MakeReusePlan(
+      const std::unordered_map<std::string, std::string> &reuse_table);
+
   void ResetTrtOps(int num);

   void RegisterOutputHook(const HookFunc &hookfunc);
```
```diff
@@ -82,7 +86,12 @@ class NaiveExecutor {
   std::vector<std::unique_ptr<OperatorBase>> ops_;

   Scope *scope_{nullptr};

-  HookFunc hookfunc_{nullptr};
+  std::vector<HookFunc> hookfunc_;
+
+  // Record information that tensor_a should ShareBufferWith tensor_b.
+  std::unordered_map<OperatorBase *,
+                     std::unordered_map<phi::DenseTensor *, int>>
+      reuse_cache_;
+  std::vector<phi::DenseTensor *> cluster_buffer_;
 };

 }  // namespace framework
```
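The header change turns the single output hook into a list, so every registered callback now fires after each op instead of only the last one registered. A minimal sketch of that behavior, with a hypothetical Op type standing in for OperatorBase:

```cpp
#include <functional>
#include <iostream>
#include <string>
#include <vector>

struct Op { std::string type; };        // stand-in for OperatorBase
using HookFunc = std::function<void(Op *)>;

int main() {
  std::vector<HookFunc> hooks;  // was: a single HookFunc hookfunc_{nullptr}
  hooks.push_back([](Op *op) { std::cout << "profile " << op->type << "\n"; });
  hooks.push_back([](Op *op) { std::cout << "dump    " << op->type << "\n"; });

  Op conv{"conv2d"};
  for (auto &func : hooks) func(&conv);  // mirrors the loop in Run()
}
```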
paddle/fluid/inference/analysis/CMakeLists.txt
```diff
@@ -20,7 +20,7 @@ cc_library(
 cc_library(
   ir_pass_manager
-  SRCS ir_pass_manager.cc
+  SRCS ir_pass_manager.cc pass_result_info.cc
   DEPS graph pass ${INFER_IR_PASSES} analysis_helper)

 cc_library(
```
paddle/fluid/inference/analysis/argument.h
```diff
@@ -139,6 +139,7 @@ struct Argument {
   unique_ptr_t field__##_;

   DECL_ARGUMENT_FIELD(predictor_id, PredictorID, int);
+  DECL_ARGUMENT_FIELD(root_predictor_id, RootPredictorID, int);
   // Model path
   DECL_ARGUMENT_FIELD(model_dir, ModelDir, std::string);
   // Model specified with program and parameters files.
```
paddle/fluid/inference/analysis/ir_pass_manager.cc
```diff
@@ -229,6 +229,8 @@ void IRPassManager::CreatePasses(Argument *argument,
                         argument->dlnne_input_shape_dict()));
       pass->Set("program",
                 new framework::ProgramDesc *(&argument->main_program()));
+    } else if (pass_name == "memory_optimize_pass") {
+      pass->Set("root_predictor_id", new int(argument->root_predictor_id()));
     }

     if (pass_name == "lite_subgraph_pass") {
       bool lite_enable_int8 =
```
paddle/fluid/inference/analysis/pass_result_info.cc (new file, mode 100644)

```cpp
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/inference/analysis/pass_result_info.h"
```
paddle/fluid/inference/analysis/pass_result_info.h (new file, mode 100644)

```cpp
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "paddle/phi/core/enforce.h"
#include "paddle/utils/variant.h"

namespace paddle {
namespace inference {
namespace analysis {

class PassResultInfoForRuntime {
 public:
  using PassInfo =
      paddle::variant<std::string,
                      std::vector<std::string>,
                      std::unordered_map<std::string, std::string>>;

  static PassResultInfoForRuntime *Instance() {
    static PassResultInfoForRuntime info;
    return &info;
  }

  template <typename T>
  void Set(int predictor_id, const std::string &pass_name, T infos) {
    map[predictor_id].emplace(pass_name, infos);
  }

  template <typename T>
  T Get(int predictor_id, const std::string &pass_name) {
    PADDLE_ENFORCE_EQ(
        map.count(predictor_id) && map[predictor_id].count(pass_name),
        true,
        phi::errors::InvalidArgument(
            "Not find predictor_id %d and pass_name %s", predictor_id,
            pass_name));
    return PADDLE_GET_CONST(T, map[predictor_id][pass_name]);
  }

 private:
  using PassResultInfoMap =
      std::unordered_map<int, std::unordered_map<std::string, PassInfo>>;
  PassResultInfoMap map;
};

}  // namespace analysis
}  // namespace inference
}  // namespace paddle
```
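PassResultInfoForRuntime is a process-wide registry that lets an analysis pass leave a result for the runtime to pick up later. A simplified standalone analogue of its Set/Get pattern, with std::variant in place of paddle::variant, assert in place of PADDLE_ENFORCE_EQ, and std::get in place of PADDLE_GET_CONST (the Paddle macros are assumed to check and extract equivalently):

```cpp
#include <cassert>
#include <string>
#include <unordered_map>
#include <utility>
#include <variant>
#include <vector>

// Simplified standalone analogue of PassResultInfoForRuntime.
class PassResultInfo {
 public:
  using PassInfo = std::variant<std::string,
                                std::vector<std::string>,
                                std::unordered_map<std::string, std::string>>;

  static PassResultInfo *Instance() {
    static PassResultInfo info;  // Meyers singleton, shared process-wide.
    return &info;
  }

  template <typename T>
  void Set(int predictor_id, const std::string &pass_name, T infos) {
    map_[predictor_id].emplace(pass_name, std::move(infos));
  }

  template <typename T>
  T Get(int predictor_id, const std::string &pass_name) {
    assert(map_.count(predictor_id) && map_[predictor_id].count(pass_name));
    return std::get<T>(map_[predictor_id][pass_name]);
  }

 private:
  std::unordered_map<int, std::unordered_map<std::string, PassInfo>> map_;
};

int main() {
  using ReuseTable = std::unordered_map<std::string, std::string>;
  // A pass records its result under (predictor_id, pass_name)...
  PassResultInfo::Instance()->Set(
      0, "memory_optimize_pass", ReuseTable{{"relu_out", "conv_out"}});
  // ...and the executor side fetches it later with the same key.
  auto table =
      PassResultInfo::Instance()->Get<ReuseTable>(0, "memory_optimize_pass");
  assert(table.at("relu_out") == "conv_out");
}
```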
paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
```diff
@@ -19,6 +19,7 @@
 #include "glog/logging.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/inference/analysis/pass_result_info.h"
 #include "paddle/fluid/platform/enforce.h"

 namespace paddle {
```
```diff
@@ -310,7 +311,7 @@ void MemoryOptimizePass::RunImpl(Argument* argument) {
   // mapping table.
   if (!argument->enable_memory_optim()) return;
   // Because of pass is a singleton, graph can not be member
   // variables, otherwise, errors will be caused under multithreading
   // conditions.
   auto graph = argument->main_graph_ptr();
```
```diff
@@ -323,7 +324,11 @@ void MemoryOptimizePass::RunImpl(Argument* argument) {
   CollectLifeCycle(graph, &lifecycles, sort_kind);
   CollectVarMemorySize(graph, &space_table);
   MakeSimpleReusePlan(lifecycles, space_table, &node2cluster, &cluster_size);
-  UpdateOpDescsByReuse(graph, node2cluster, sort_kind);
+
+  auto* pass_res_info = PassResultInfoForRuntime::Instance();
+  pass_res_info->Set(
+      argument->root_predictor_id(), "memory_optimize_pass", node2cluster);
   return;
 }
```
paddle/fluid/inference/api/analysis_predictor.cc
```diff
@@ -38,6 +38,7 @@
 #include "paddle/fluid/framework/var_type_traits.h"
 #include "paddle/fluid/framework/version.h"
 #include "paddle/fluid/inference/analysis/helper.h"
+#include "paddle/fluid/inference/analysis/pass_result_info.h"
 #include "paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h"
 #include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
 #include "paddle/fluid/inference/api/helper.h"
```
```diff
@@ -262,6 +263,10 @@ bool AnalysisPredictor::Init(
                   "generated.";
   }

+  if (!status_is_cloned_) {
+    root_predictor_id_ = predictor_id_;
+  }
+
   // no matter with or without MKLDNN
   paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
```
```diff
@@ -615,6 +620,15 @@ bool AnalysisPredictor::PrepareExecutor() {
   executor_->Prepare(
       sub_scope_, *inference_program_, 0, config_.use_feed_fetch_ops_);

+  if (config_.enable_memory_optim_) {
+    auto *pass_res_info =
+        inference::analysis::PassResultInfoForRuntime::Instance();
+    auto reuse_table =
+        pass_res_info->Get<std::unordered_map<std::string, std::string>>(
+            root_predictor_id_, "memory_optimize_pass");
+    executor_->MakeReusePlan(reuse_table);
+  }
+
   PADDLE_ENFORCE_NOT_NULL(sub_scope_,
                           platform::errors::PreconditionNotMet(
                               "The sub_scope should not be nullptr."));
```
```diff
@@ -1079,6 +1093,7 @@ void AnalysisPredictor::PrepareArgument() {
   argument_.SetModelFromMemory(config_.model_from_memory_);
   // Analyze inference_program
   argument_.SetPredictorID(predictor_id_);
+  argument_.SetRootPredictorID(root_predictor_id_);
   argument_.SetOptimCacheDir(config_.opt_cache_dir_);
   if (!config_.model_dir().empty()) {
     argument_.SetModelDir(config_.model_dir());
```
```diff
@@ -2114,6 +2129,7 @@ std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone(void *stream) {
   std::lock_guard<std::mutex> lk(clone_mutex_);
   auto *x = new AnalysisPredictor(config_);
   x->status_is_cloned_ = true;
+  x->root_predictor_id_ = this->root_predictor_id_;
   if (config_.use_external_stream_ && stream == nullptr) {
     PADDLE_THROW(platform::errors::InvalidArgument(
         "config has been configured to use external stream, but the Clone "
```
```diff
@@ -2175,12 +2191,6 @@ void AnalysisPredictor::SaveOptimModel(const std::string &dir) {
 }

 void AnalysisPredictor::RegisterOutputHook(const Exp_OutputHookFunc &hookfunc) {
-  if (config_.enable_memory_optim()) {
-    LOG(WARNING) << "If you want to run output hook function, you should "
-                    "use config.EnableMemoryOptim(false) to turn off memory "
-                    "reuse!";
-    return;
-  }
   static std::once_flag register_hook_flag;
   std::call_once(register_hook_flag, [this] {
     executor_->RegisterOutputHook([this](framework::OperatorBase *op) {
```
paddle/fluid/inference/api/analysis_predictor.h
```diff
@@ -102,7 +102,6 @@ class AnalysisPredictor : public PaddlePredictor {
   explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {
     if (config_.shape_range_info_collected()) {
       config_.SwitchIrOptim(false);
-      config_.EnableMemoryOptim(false);
     }
     predictor_id_ = inference::GetUniqueId();
   }
```
```diff
@@ -518,6 +517,7 @@ class AnalysisPredictor : public PaddlePredictor {
   int need_collect_var_shapes_{-1};  // -1 for default, 0 for false, 1 for true.
   std::vector<std::map<std::string, std::vector<int>>> batch_var_shapes_;
   int predictor_id_;
+  int root_predictor_id_{-1};

  private:
   std::vector<Exp_OutputHookFunc> hookfuncs_;
```
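Taken together: root_predictor_id_ defaults to -1, is set to the predictor's own id in Init() when the predictor is not a clone, and is copied in Clone(), so every cloned predictor looks up the reuse table that memory_optimize_pass stored under its root predictor's id.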