From 572bad8a66ea9650d753911af8383785ed671af9 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Tue, 31 Aug 2021 19:06:28 +0800 Subject: [PATCH] Support CostInfo and MemProfiler in InterpreterCore (#34981) * polish code * fix unittest on windows * refine pybind interface * support statistic MemSize of AllocatorPool * Replace mutex into atomic --- .../framework/new_executor/interpretercore.cc | 52 +++++++- .../framework/new_executor/interpretercore.h | 9 +- .../fluid/framework/new_executor/profiler.h | 121 ++++++++++++++++++ .../new_executor/standalone_executor.cc | 9 ++ .../new_executor/standalone_executor.h | 3 + paddle/fluid/platform/gpu_info.cc | 20 +-- paddle/fluid/pybind/pybind.cc | 32 ++++- .../interpreter/test_standalone_executor.py | 18 +++ 8 files changed, 247 insertions(+), 17 deletions(-) create mode 100644 paddle/fluid/framework/new_executor/profiler.h diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index c86a16cef0..d31627efed 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -349,6 +349,9 @@ void InterpreterCore::BuildInstructionCtx(Instruction* instr_node, } void InterpreterCore::RunInstruction(const Instruction& instr_node) { + VLOG(3) << "RunInstruction: " + << instr_node.kernel_func_.operator_base_->Type(); + static_cast( instr_node.kernel_func_.operator_base_) ->InferShape(instr_node.infershape_ctx_.get()); @@ -364,7 +367,7 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { void InterpreterCore::ExecuteInstructionList( const std::vector& vec_instr, const VariableScope& var_scope, - const platform::Place& place) { + const platform::Place& place, bool is_dry_run) { std::queue working_queue; auto working_dependecy_count = dependecy_count_; for (size_t i = 0; i < dependecy_count_.size(); ++i) { @@ -385,6 +388,11 @@ void InterpreterCore::ExecuteInstructionList( // step2: run instruction RunInstruction(instr_node); ++run_op_number; + + if (is_dry_run) { + profiler_.ParseMemoryInfo(var_scope.var_list); + } + // step3: insert event for out_vars if needed RecordEventInstruction(instr_node, vec_func_list_[instr_id]); @@ -827,6 +835,48 @@ void InterpreterCore::BuildOpFuncList(const platform::Place& place, VLOG(3) << "run " << op_base->Type() << " done."; } } +void InterpreterCore::Prepare( + const std::vector& feed_tensors) { + auto FeedInput = [&] { + for (size_t i = 0; i < feed_names_.size(); ++i) { + auto it = global_scope_->name2id.find(feed_names_[i]); + assert(it != global_scope_->name2id.end()); + + auto feed_tensor = global_scope_->var_list[it->second] + ->GetMutable(); + feed_tensor->ShareDataWith(feed_tensors[i]); + } + }; + + if (is_build_ == false) { + BuildVariableScope(main_program_, global_scope_); + FeedInput(); + BuildOpFuncList(place_, main_program_, &op_list_, &vec_func_list_, + global_scope_); + is_build_ = true; + // convert vec func_list to graph + Convert(); + } + // NOTE: Because feed_tensor will be GC after BuildOpFuncList, so we should + // call + // FeedInput again. + FeedInput(); +} + +const CostInfo& InterpreterCore::DryRun( + const std::vector& feed_tensors) { + Prepare(feed_tensors); + // DryRun may be called many times. 
+ profiler_.Reset(); + profiler_.Start(); + ExecuteInstructionList(vec_instruction_, *global_scope_, place_, + /*is_dry_run=*/true); + platform::DeviceContextPool::Instance().Get(place_)->Wait(); + + profiler_.Pause(); + profiler_.TotalCUDAAllocatedMemorySize(place_); + return profiler_.GetCostInfo(); +} platform::DeviceContext* InterpreterCore::ParseDeviceContextForInstruction( const OpFuncNode& op_func_node, const OperatorBase& op_base) { diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index 652e4fff7c..ebb81dc9a0 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -21,6 +21,7 @@ #include "paddle/fluid/framework/new_executor/interpretercore_util.h" #include "paddle/fluid/framework/new_executor/new_executor_defs.h" +#include "paddle/fluid/framework/new_executor/profiler.h" #include "paddle/fluid/framework/new_executor/workqueue.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/tensor.h" @@ -41,6 +42,8 @@ class InterpreterCore { paddle::framework::FetchList Run( const std::vector& feed_tensors); + const CostInfo& DryRun(const std::vector& feed_tensors); + static void BuildOpFuncList(const platform::Place& place, const framework::ProgramDesc& pdesc, std::vector* op_list, @@ -58,7 +61,8 @@ class InterpreterCore { void ExecuteInstructionList(const std::vector& vec_instr, const VariableScope& var_scope, - const platform::Place& place); + const platform::Place& place, + bool is_dry_run = false); std::vector MergeVector(const std::vector& first, const std::vector& second); @@ -66,6 +70,8 @@ class InterpreterCore { void BuildVariableScope(const framework::ProgramDesc& pdesc, VariableScope* var_scope); + void Prepare(const std::vector& feed_tensors); + void CheckGC(size_t instr_id, const std::vector& gc_check_list, const VariableScope& var_scope, const platform::Place& place, std::vector& working_var_ref); // NOLINT @@ -100,6 +106,7 @@ class InterpreterCore { bool is_build_; std::vector feed_names_; + InterpreterProfiler profiler_; std::map> var_id2event_; std::vector gc_event_; diff --git a/paddle/fluid/framework/new_executor/profiler.h b/paddle/fluid/framework/new_executor/profiler.h new file mode 100644 index 0000000000..5d8ec05b7f --- /dev/null +++ b/paddle/fluid/framework/new_executor/profiler.h @@ -0,0 +1,121 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/timer.h" + +namespace paddle { +namespace framework { + +static void GetTensors(Variable* var, std::unordered_set* tensor_set) { + if (var->IsType() && var->Get().IsInitialized()) { + tensor_set->insert(var->GetMutable()); + } else if (var->IsType() && + var->Get().value().IsInitialized()) { + tensor_set->insert(var->GetMutable()->mutable_value()); + } else if (var->IsType()) { + auto* tensor_arr = var->GetMutable(); + for (auto& t : *tensor_arr) { + if (t.IsInitialized()) { + tensor_set->insert(&t); + } + } + } +} + +static std::pair GetTensorMemorySize( + const std::vector& var_list) { + std::unordered_set tensor_set; + for (auto* var : var_list) { + GetTensors(var, &tensor_set); + } + size_t host_memory_bytes = 0; + size_t device_memory_bytes = 0; + std::unordered_set allocation_set; + for (auto* tensor : tensor_set) { + auto allocation = tensor->Holder().get(); + if (!allocation_set.count(allocation)) { + allocation_set.insert(allocation); + if (platform::is_cuda_pinned_place(tensor->place()) || + platform::is_cpu_place(tensor->place())) { + VLOG(3) << "found host memory : " << allocation->size(); + host_memory_bytes += allocation->size(); + } else { + VLOG(3) << "found device memory : " << allocation->size(); + device_memory_bytes += allocation->size(); + } + } + } + return {host_memory_bytes, device_memory_bytes}; +} + +struct CostInfo { + double total_time{0.}; // ms + size_t host_memory_bytes{0}; // bytes + size_t device_memory_bytes{0}; // bytes + size_t device_total_memory_bytes{0}; // total allocated memory size +}; + +class InterpreterProfiler { + public: + void Start() { timer_.Start(); } + + void Pause() { + timer_.Pause(); + cost_info_.total_time += timer_.ElapsedMS(); + } + + void Reset() { + timer_.Reset(); + cost_info_.total_time = 0.; + cost_info_.host_memory_bytes = 0; + cost_info_.device_memory_bytes = 0; + cost_info_.device_total_memory_bytes = 0; + } + + void ParseMemoryInfo(const std::vector& vars) { + timer_.Start(); + auto memory_info = GetTensorMemorySize(vars); + VLOG(3) << "host memory size: " << memory_info.first; + cost_info_.host_memory_bytes = + std::max(cost_info_.host_memory_bytes, memory_info.first); + + VLOG(3) << "device memory size: " << memory_info.second; + cost_info_.device_memory_bytes = + std::max(cost_info_.device_memory_bytes, memory_info.second); + timer_.Pause(); + cost_info_.total_time -= timer_.ElapsedMS(); + } + + void TotalCUDAAllocatedMemorySize(const platform::Place& place) { + if (platform::is_gpu_place(place)) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, place); + cost_info_.device_total_memory_bytes = + platform::RecordedCudaMallocSize(cuda_place.device); +#endif + } + } + + const CostInfo& GetCostInfo() const { return cost_info_; } + + private: + platform::Timer timer_; + CostInfo cost_info_; +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 665a99d7d7..2e76a6720a 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -61,6 +61,15 @@ paddle::framework::FetchList StandaloneExecutor::Run( return core->Run(feed_tensors); } +const CostInfo& 
StandaloneExecutor::DryRun( + const std::vector& feed_names, + const std::vector& feed_tensors) { + auto core = GetInterpreterCore(feed_names, {}); + + auto& cost_info = core->DryRun(feed_tensors); + return cost_info; +} + void StandaloneExecutor::BuildVariableOuterScope( const framework::ProgramDesc& pdesc, VariableScope* var_scope, Scope* outer_scope) { diff --git a/paddle/fluid/framework/new_executor/standalone_executor.h b/paddle/fluid/framework/new_executor/standalone_executor.h index c8943e2937..600c90e3a1 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.h +++ b/paddle/fluid/framework/new_executor/standalone_executor.h @@ -45,6 +45,9 @@ class StandaloneExecutor : public ExecutorBase { const std::vector& feed_tensors, const std::vector& fetch_names); + const CostInfo& DryRun(const std::vector& feed_names, + const std::vector& feed_tensors); + private: void BuildVariableOuterScope(const framework::ProgramDesc& pdesc, VariableScope* var_scope, Scope* outer_scope); diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index 2e66e3e36d..fda233b3a0 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -499,7 +499,7 @@ class RecordedCudaMallocHelper { */ gpuError_t Malloc(void **ptr, size_t size) { LockGuardPtr lock(mtx_); - if (UNLIKELY(NeedRecord() && cur_size_ + size > limit_size_)) { + if (UNLIKELY(NeedRecord() && cur_size_.load() + size > limit_size_)) { #ifdef PADDLE_WITH_HIP return hipErrorOutOfMemory; #else @@ -514,9 +514,7 @@ class RecordedCudaMallocHelper { auto result = cudaMalloc(ptr, size); #endif if (result == gpuSuccess) { - if (NeedRecord()) { - cur_size_ += size; - } + cur_size_.fetch_add(size); STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); return gpuSuccess; } else { @@ -551,10 +549,7 @@ class RecordedCudaMallocHelper { if (err != cudaErrorCudartUnloading) { #endif PADDLE_ENFORCE_CUDA_SUCCESS(err); - if (NeedRecord()) { - std::lock_guard guard(*mtx_); - cur_size_ -= size; - } + cur_size_.fetch_sub(size); STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); } else { #ifdef PADDLE_WITH_HIP @@ -582,7 +577,7 @@ class RecordedCudaMallocHelper { if (NeedRecord()) { std::lock_guard guard(*mtx_); - *avail = std::min(*actual_avail, limit_size_ - cur_size_); + *avail = std::min(*actual_avail, limit_size_ - cur_size_.load()); *total = std::min(*actual_total, limit_size_); return *total < *actual_total; } else { @@ -594,17 +589,14 @@ class RecordedCudaMallocHelper { inline bool NeedRecord() const { return limit_size_ != 0; } - uint64_t RecordedSize() const { - LockGuardPtr lock(mtx_); - return NeedRecord() ? cur_size_ : 0; - } + uint64_t RecordedSize() const { return cur_size_.load(); } uint64_t LimitSize() const { return limit_size_; } private: const int dev_id_; const uint64_t limit_size_; - uint64_t cur_size_{0}; + std::atomic cur_size_{0}; mutable std::unique_ptr mtx_; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 677da35b41..ee4edbbaa0 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1951,6 +1951,16 @@ All parameter, weight, gradient are variables in Paddle. 
fetch_vars); }); + py::class_(m, "CostInfo") + .def(py::init<>()) + .def("total_time", [](CostInfo &self) { return self.total_time; }) + .def("host_memory_bytes", + [](CostInfo &self) { return self.host_memory_bytes; }) + .def("device_memory_bytes", + [](CostInfo &self) { return self.device_memory_bytes; }) + .def("device_total_memory_bytes", + [](CostInfo &self) { return self.device_total_memory_bytes; }); + py::class_(m, "StandaloneExecutor") .def(py::init()) @@ -1974,8 +1984,28 @@ All parameter, weight, gradient are variables in Paddle. pybind11::gil_scoped_release release; ret = self.Run(feed_names, feed_tensors, fetch_names); } - return py::cast(std::move(ret)); + }) + .def("dry_run", + [](StandaloneExecutor &self, + const std::unordered_map &input_dict) { + std::vector feed_tensors; + std::vector feed_names; + + for (auto &item : input_dict) { + framework::LoDTensor t; + SetTensorFromPyArray( + &t, item.second, platform::CPUPlace(), false); + feed_names.push_back(item.first); + feed_tensors.push_back(t); + } + + CostInfo cost_info; + { + pybind11::gil_scoped_release release; + cost_info = self.DryRun(feed_names, feed_tensors); + } + return cost_info; }); m.def("init_gflags", framework::InitGflags); diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py index 2286765680..b60c3f77e0 100644 --- a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py +++ b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import sys import unittest import paddle from paddle.fluid import core @@ -56,6 +57,23 @@ class LinearTestCase(unittest.TestCase): [2, 2], dtype="float32") * i }, [a.name, c.name]) + # test for cost_info + cost_info = standaloneexecutor.dry_run({ + "a": np.ones( + [2, 2], dtype="float32") * i + }) + self.check_cost_info(cost_info) + + def check_cost_info(self, cost_info): + if core.is_compiled_with_cuda(): + self.assertEqual(cost_info.host_memory_bytes(), 16) + self.assertGreater(cost_info.device_memory_bytes(), 0) + self.assertGreaterEqual(cost_info.device_total_memory_bytes(), + cost_info.device_memory_bytes()) + else: + self.assertGreater(cost_info.host_memory_bytes(), 0) + self.assertEqual(cost_info.device_memory_bytes(), 0) + class MultiStreamModelTestCase(unittest.TestCase): def setUp(self): -- GitLab
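
Usage note (not part of the diff above): a minimal Python sketch of how the new dry_run / CostInfo interface added by this patch is meant to be called, modeled on the unit test in test_standalone_executor.py. The static-graph setup and the StandaloneExecutor constructor arguments (place, startup/main program descs, scope) are not shown in this diff and are assumptions that may differ from the actual test fixture.

# Illustrative sketch only; mirrors the dry_run call added in the unit test above.
import numpy as np
import paddle
from paddle.fluid import core
from paddle.fluid.core import StandaloneExecutor

paddle.enable_static()

# Build a small static program: one fc layer over a [2, 2] input.
a = paddle.static.data(name="a", shape=[2, 2], dtype="float32")
c = paddle.static.nn.fc(a, 2)

main_program = paddle.static.default_main_program()
startup_program = paddle.static.default_startup_program()

place = core.Place()
place.set_place(paddle.CPUPlace())

# Assumed constructor signature (place, startup desc, main desc, scope);
# it is not visible in this diff and may differ.
executor = StandaloneExecutor(place, startup_program.desc, main_program.desc,
                              core.Scope())

# dry_run executes the program once and returns CostInfo instead of fetch results.
cost_info = executor.dry_run({"a": np.ones([2, 2], dtype="float32")})

# CostInfo accessors bound in pybind.cc: time in ms, memory sizes in bytes.
print(cost_info.total_time())
print(cost_info.host_memory_bytes())
print(cost_info.device_memory_bytes())
print(cost_info.device_total_memory_bytes())

Per the InterpreterCore::DryRun change above, the device context is waited on before the timer is paused, so total_time covers device-side execution, and ParseMemoryInfo subtracts its own elapsed time from the measurement so memory accounting does not inflate the reported cost.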