From 2fff5a58ba136cd7a16ff76413168ff1ace17c50 Mon Sep 17 00:00:00 2001
From: Aurelius84
Date: Sat, 18 Sep 2021 10:43:04 +0800
Subject: [PATCH] Clean ParseMemInfo and Fix unittest failed under
 multi-thread (#35840)

* Clean ParaseMemInfo and fix unittest with multi-thread

* fix declare
---
 .../framework/new_executor/interpretercore.cc | 19 +++++----------
 .../framework/new_executor/interpretercore.h  |  5 ++--
 .../fluid/framework/new_executor/profiler.h   | 24 +++----------------
 paddle/fluid/pybind/pybind.cc                 |  6 +----
 .../interpreter/test_standalone_executor.py   |  7 ------
 5 files changed, 12 insertions(+), 49 deletions(-)

diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index b8bb6d21ebc..2da9c275c3d 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -324,16 +324,15 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
 }
 
 void InterpreterCore::ExecuteInstructionList(
-    const std::vector<Instruction>& vec_instr, bool is_dry_run) {
+    const std::vector<Instruction>& vec_instr) {
   auto atomic_deps = async_work_queue_.PrepareAtomicDeps(dependecy_count_);
   auto atomic_var_ref = async_work_queue_.PrepareAtomicVarRef(vec_meta_info_);
   std::atomic<size_t> op_run_number{0};
 
   for (size_t i = 0; i < dependecy_count_.size(); ++i) {
     if (dependecy_count_[i] == 0) {
-      async_work_queue_.AddTask(vec_instr[i].type_, [&, i, is_dry_run]() {
-        RunInstructionAsync(i, &atomic_deps, &atomic_var_ref, &op_run_number,
-                            is_dry_run);
+      async_work_queue_.AddTask(vec_instr[i].type_, [&, i]() {
+        RunInstructionAsync(i, &atomic_deps, &atomic_var_ref, &op_run_number);
       });
     }
   }
@@ -350,8 +349,7 @@ void InterpreterCore::ExecuteInstructionList(
 void InterpreterCore::RunInstructionAsync(size_t instr_id,
                                           AtomicVectorSizeT* atomic_deps,
                                           AtomicVectorSizeT* atomic_var_ref,
-                                          std::atomic<size_t>* op_run_number,
-                                          bool is_dry_run) {
+                                          std::atomic<size_t>* op_run_number) {
   auto& instr_node = vec_instruction_[instr_id];
   event_manager_.WaitEvent(instr_node, place_);
 
@@ -360,10 +358,6 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id,
   event_manager_.RecordEvent(instr_node, place_);
   op_run_number->fetch_add(1, std::memory_order_relaxed);
 
-  if (is_dry_run) {
-    dry_run_profiler_.ParseMemoryInfo(global_scope_->var_list);
-  }
-
   auto& next_instr = instr_node.next_instruction_.all_next_ops_;
 
   for (auto next_i : next_instr) {
@@ -372,8 +366,7 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id,
         atomic_deps->at(next_i)->fetch_sub(1, std::memory_order_relaxed) == 1;
     if (is_ready) {
       async_work_queue_.AddTask(vec_instruction_[next_i].type_, [=]() {
-        RunInstructionAsync(next_i, atomic_deps, atomic_var_ref, op_run_number,
-                            is_dry_run);
+        RunInstructionAsync(next_i, atomic_deps, atomic_var_ref, op_run_number);
       });
     }
   }
@@ -433,7 +426,7 @@ const CostInfo& InterpreterCore::DryRun(
   // DryRun may be called many times.
   dry_run_profiler_.Reset();
   dry_run_profiler_.Start();
-  ExecuteInstructionList(vec_instruction_, /*is_dry_run=*/true);
+  ExecuteInstructionList(vec_instruction_);
   platform::DeviceContextPool::Instance().Get(place_)->Wait();
 
   dry_run_profiler_.Pause();
diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h
index 371f5bba2af..d7fb6b7fd91 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.h
+++ b/paddle/fluid/framework/new_executor/interpretercore.h
@@ -59,8 +59,7 @@ class InterpreterCore {
 
   void RunInstruction(const Instruction& instr_node);
 
-  void ExecuteInstructionList(const std::vector<Instruction>& vec_instr,
-                              bool is_dry_run = false);
+  void ExecuteInstructionList(const std::vector<Instruction>& vec_instr);
 
   void DryRunPrepare(const std::vector<framework::Tensor>& feed_tensors);
 
@@ -70,7 +69,7 @@ class InterpreterCore {
   void RunInstructionAsync(size_t instr_id,
                            AtomicVectorSizeT* working_dependecy_count,
                            AtomicVectorSizeT* working_var_ref,
-                           std::atomic<size_t>* op_run_number, bool is_dry_run);
+                           std::atomic<size_t>* op_run_number);
   void AddFetch(const std::vector<std::string>& fetch_names);
   void BuildSkipShareLoDInfo();
 
diff --git a/paddle/fluid/framework/new_executor/profiler.h b/paddle/fluid/framework/new_executor/profiler.h
index 5d8ec05b7f2..77783535b64 100644
--- a/paddle/fluid/framework/new_executor/profiler.h
+++ b/paddle/fluid/framework/new_executor/profiler.h
@@ -64,10 +64,8 @@ static std::pair<size_t, size_t> GetTensorMemorySize(
 }
 
 struct CostInfo {
-  double total_time{0.};                // ms
-  size_t host_memory_bytes{0};          // bytes
-  size_t device_memory_bytes{0};        // bytes
-  size_t device_total_memory_bytes{0};  // total allocated memory size
+  double total_time{0.};          // ms
+  size_t device_memory_bytes{0};  // total allocated memory size
 };
 
 class InterpreterProfiler {
@@ -82,30 +80,14 @@ class InterpreterProfiler {
   void Reset() {
     timer_.Reset();
     cost_info_.total_time = 0.;
-    cost_info_.host_memory_bytes = 0;
     cost_info_.device_memory_bytes = 0;
-    cost_info_.device_total_memory_bytes = 0;
-  }
-
-  void ParseMemoryInfo(const std::vector<Variable*>& vars) {
-    timer_.Start();
-    auto memory_info = GetTensorMemorySize(vars);
-    VLOG(3) << "host memory size: " << memory_info.first;
-    cost_info_.host_memory_bytes =
-        std::max(cost_info_.host_memory_bytes, memory_info.first);
-
-    VLOG(3) << "device memory size: " << memory_info.second;
-    cost_info_.device_memory_bytes =
-        std::max(cost_info_.device_memory_bytes, memory_info.second);
-    timer_.Pause();
-    cost_info_.total_time -= timer_.ElapsedMS();
   }
 
   void TotalCUDAAllocatedMemorySize(const platform::Place& place) {
     if (platform::is_gpu_place(place)) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, place);
-      cost_info_.device_total_memory_bytes =
+      cost_info_.device_memory_bytes =
           platform::RecordedCudaMallocSize(cuda_place.device);
 #endif
     }
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 5174306d722..e404f27a10d 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1979,12 +1979,8 @@ All parameter, weight, gradient are variables in Paddle.
   py::class_<CostInfo>(m, "CostInfo")
       .def(py::init<>())
       .def("total_time", [](CostInfo &self) { return self.total_time; })
-      .def("host_memory_bytes",
-           [](CostInfo &self) { return self.host_memory_bytes; })
       .def("device_memory_bytes",
-           [](CostInfo &self) { return self.device_memory_bytes; })
-      .def("device_total_memory_bytes",
-           [](CostInfo &self) { return self.device_total_memory_bytes; });
+           [](CostInfo &self) { return self.device_memory_bytes; });
 
   py::class_<StandaloneExecutor>(m, "StandaloneExecutor")
       .def(py::init
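
---
Editor note (not part of the patch): after this change the Python-visible
CostInfo keeps only two accessors, total_time() and device_memory_bytes();
host_memory_bytes() and device_total_memory_bytes() are removed, and
device_memory_bytes() now reports the total recorded CUDA allocation size
gathered by InterpreterProfiler::TotalCUDAAllocatedMemorySize instead of a
per-op maximum sampled during the dry run. Below is a minimal consumer
sketch of the surviving surface; the `exe.dry_run(feed)` entry point is an
assumption and is not confirmed by this patch (the test_standalone_executor.py
hunk is truncated above and not reproduced here):

    # Hypothetical usage of the trimmed CostInfo bindings (Python side).
    def report_cost(cost_info):
        # The two bindings kept by this patch:
        print("total time (ms):", cost_info.total_time())
        print("device memory (bytes):", cost_info.device_memory_bytes())
        # Removed by this patch; calling them now raises AttributeError:
        # cost_info.host_memory_bytes()
        # cost_info.device_total_memory_bytes()

    # cost_info = exe.dry_run(feed)  # assumed dry-run entry point
    # report_cost(cost_info)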