Unverified commit 2fff5a58, authored by Aurelius84, committed by GitHub

Clean ParseMemInfo and fix unittest failure under multi-threading (#35840)

* Clean ParseMemInfo and fix the unittest under multi-threading

* Fix declaration
Parent e4c2a854
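Background for the diff below: the removed `ParseMemoryInfo` was invoked from inside `RunInstructionAsync`, i.e. from tasks running on `async_work_queue_`, so several worker threads could update the shared profiler state (`cost_info_`, `timer_`) concurrently. The stand-alone sketch below is only a hypothetical illustration of that kind of unsynchronized read-modify-write; `UnsafeProfiler` and its members are invented names, not Paddle APIs.

```cpp
// Minimal sketch (not Paddle code): several workers race on a shared
// "max bytes" counter the way concurrent ParseMemoryInfo calls raced on
// the profiler's CostInfo.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <thread>
#include <vector>

struct UnsafeProfiler {          // hypothetical stand-in for the old profiler
  size_t host_memory_bytes{0};   // shared, unsynchronized state
  void ParseMemoryInfo(size_t measured) {
    // Read-modify-write without a lock or atomic: updates can be lost and
    // ThreadSanitizer reports a data race.
    host_memory_bytes = std::max(host_memory_bytes, measured);
  }
};

int main() {
  UnsafeProfiler profiler;
  std::vector<std::thread> workers;
  for (int t = 0; t < 4; ++t) {
    workers.emplace_back([&profiler, t] {
      for (size_t i = 0; i < 100000; ++i) {
        profiler.ParseMemoryInfo(i + t);  // concurrent unsynchronized updates
      }
    });
  }
  for (auto& w : workers) w.join();
  std::cout << profiler.host_memory_bytes << std::endl;
  return 0;
}
```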
@@ -324,16 +324,15 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
 }
 void InterpreterCore::ExecuteInstructionList(
-    const std::vector<Instruction>& vec_instr, bool is_dry_run) {
+    const std::vector<Instruction>& vec_instr) {
   auto atomic_deps = async_work_queue_.PrepareAtomicDeps(dependecy_count_);
   auto atomic_var_ref = async_work_queue_.PrepareAtomicVarRef(vec_meta_info_);
   std::atomic<size_t> op_run_number{0};
   for (size_t i = 0; i < dependecy_count_.size(); ++i) {
     if (dependecy_count_[i] == 0) {
-      async_work_queue_.AddTask(vec_instr[i].type_, [&, i, is_dry_run]() {
-        RunInstructionAsync(i, &atomic_deps, &atomic_var_ref, &op_run_number,
-                            is_dry_run);
+      async_work_queue_.AddTask(vec_instr[i].type_, [&, i]() {
+        RunInstructionAsync(i, &atomic_deps, &atomic_var_ref, &op_run_number);
       });
     }
   }
@@ -350,8 +349,7 @@ void InterpreterCore::ExecuteInstructionList(
 void InterpreterCore::RunInstructionAsync(size_t instr_id,
                                           AtomicVectorSizeT* atomic_deps,
                                           AtomicVectorSizeT* atomic_var_ref,
-                                          std::atomic<size_t>* op_run_number,
-                                          bool is_dry_run) {
+                                          std::atomic<size_t>* op_run_number) {
   auto& instr_node = vec_instruction_[instr_id];
   event_manager_.WaitEvent(instr_node, place_);
@@ -360,10 +358,6 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id,
   event_manager_.RecordEvent(instr_node, place_);
   op_run_number->fetch_add(1, std::memory_order_relaxed);
-  if (is_dry_run) {
-    dry_run_profiler_.ParseMemoryInfo(global_scope_->var_list);
-  }
   auto& next_instr = instr_node.next_instruction_.all_next_ops_;
   for (auto next_i : next_instr) {
@@ -372,8 +366,7 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id,
         atomic_deps->at(next_i)->fetch_sub(1, std::memory_order_relaxed) == 1;
     if (is_ready) {
       async_work_queue_.AddTask(vec_instruction_[next_i].type_, [=]() {
-        RunInstructionAsync(next_i, atomic_deps, atomic_var_ref, op_run_number,
-                            is_dry_run);
+        RunInstructionAsync(next_i, atomic_deps, atomic_var_ref, op_run_number);
       });
     }
   }
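For readers unfamiliar with the dependency-driven scheduling that the hunks above modify: each op keeps an atomic count of unfinished producers, a finished op decrements the counters of its consumers, and a consumer whose counter reaches zero is enqueued. The sketch below reproduces only that counting pattern with invented names (`run_async`, `next_ops`) and runs everything on one thread instead of Paddle's `async_work_queue_`; it is not the Paddle implementation.

```cpp
// Minimal sketch (assumed names, not Paddle's API): dependency-count driven
// scheduling. An op becomes ready when its atomic counter drops to zero.
#include <atomic>
#include <cstddef>
#include <functional>
#include <iostream>
#include <vector>

int main() {
  // Op 0 and op 1 have no dependencies; op 2 depends on both of them.
  std::vector<std::vector<size_t>> next_ops = {{2}, {2}, {}};
  std::vector<std::atomic<size_t>> deps(3);
  deps[0] = 0;
  deps[1] = 0;
  deps[2] = 2;

  std::function<void(size_t)> run_async = [&](size_t id) {
    std::cout << "run op " << id << std::endl;  // stand-in for RunInstruction
    for (size_t next : next_ops[id]) {
      // fetch_sub returns the previous value, so "== 1" means this op released
      // the last outstanding dependency and may schedule the next op.
      if (deps[next].fetch_sub(1, std::memory_order_relaxed) == 1) {
        run_async(next);  // a real executor would AddTask to a work queue
      }
    }
  };

  for (size_t i = 0; i < deps.size(); ++i) {
    if (deps[i] == 0) run_async(i);  // kick off the source ops
  }
  return 0;
}
```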
@@ -433,7 +426,7 @@ const CostInfo& InterpreterCore::DryRun(
   // DryRun may be called many times.
   dry_run_profiler_.Reset();
   dry_run_profiler_.Start();
-  ExecuteInstructionList(vec_instruction_, /*is_dry_run=*/true);
+  ExecuteInstructionList(vec_instruction_);
   platform::DeviceContextPool::Instance().Get(place_)->Wait();
   dry_run_profiler_.Pause();
@@ -59,8 +59,7 @@ class InterpreterCore {
   void RunInstruction(const Instruction& instr_node);
-  void ExecuteInstructionList(const std::vector<Instruction>& vec_instr,
-                              bool is_dry_run = false);
+  void ExecuteInstructionList(const std::vector<Instruction>& vec_instr);
   void DryRunPrepare(const std::vector<framework::Tensor>& feed_tensors);
@@ -70,7 +69,7 @@ class InterpreterCore {
   void RunInstructionAsync(size_t instr_id,
                            AtomicVectorSizeT* working_dependecy_count,
                            AtomicVectorSizeT* working_var_ref,
-                           std::atomic<size_t>* op_run_number, bool is_dry_run);
+                           std::atomic<size_t>* op_run_number);
   void AddFetch(const std::vector<std::string>& fetch_names);
   void BuildSkipShareLoDInfo();
@@ -64,10 +64,8 @@ static std::pair<size_t, size_t> GetTensorMemorySize(
 }
 struct CostInfo {
-  double total_time{0.};                // ms
-  size_t host_memory_bytes{0};          // bytes
-  size_t device_memory_bytes{0};        // bytes
-  size_t device_total_memory_bytes{0};  // total allocated memory size
+  double total_time{0.};          // ms
+  size_t device_memory_bytes{0};  // total allocated memory size
 };
 class InterpreterProfiler {
@@ -82,30 +80,14 @@ class InterpreterProfiler {
   void Reset() {
     timer_.Reset();
     cost_info_.total_time = 0.;
-    cost_info_.host_memory_bytes = 0;
     cost_info_.device_memory_bytes = 0;
-    cost_info_.device_total_memory_bytes = 0;
   }
-  void ParseMemoryInfo(const std::vector<Variable*>& vars) {
-    timer_.Start();
-    auto memory_info = GetTensorMemorySize(vars);
-    VLOG(3) << "host memory size: " << memory_info.first;
-    cost_info_.host_memory_bytes =
-        std::max(cost_info_.host_memory_bytes, memory_info.first);
-    VLOG(3) << "device memory size: " << memory_info.second;
-    cost_info_.device_memory_bytes =
-        std::max(cost_info_.device_memory_bytes, memory_info.second);
-    timer_.Pause();
-    cost_info_.total_time -= timer_.ElapsedMS();
-  }
   void TotalCUDAAllocatedMemorySize(const platform::Place& place) {
     if (platform::is_gpu_place(place)) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, place);
-      cost_info_.device_total_memory_bytes =
+      cost_info_.device_memory_bytes =
           platform::RecordedCudaMallocSize(cuda_place.device);
 #endif
     }
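With `ParseMemoryInfo` gone, the profiler's remaining job is timing on the calling thread plus a single read of the allocator's recorded CUDA allocation size after the run (`TotalCUDAAllocatedMemorySize`). The sketch below shows that general shape with invented names (`RecordedAllocBytes`, `Profile`); it is a simplified stand-in under those assumptions, not the Paddle code.

```cpp
// Minimal sketch (assumed names): profile by timing on the caller thread and
// reading a single allocator-side counter after the run, so worker threads
// never touch the profiler's state.
#include <atomic>
#include <chrono>
#include <cstddef>
#include <iostream>

// Hypothetical allocator-side counter, analogous to RecordedCudaMallocSize.
std::atomic<size_t> g_recorded_alloc_bytes{0};
size_t RecordedAllocBytes() { return g_recorded_alloc_bytes.load(); }

struct CostInfo {
  double total_time{0.};          // ms
  size_t device_memory_bytes{0};  // total allocated memory size
};

template <typename RunFn>
CostInfo Profile(RunFn&& run) {
  CostInfo info;
  auto start = std::chrono::steady_clock::now();
  run();  // workers may allocate concurrently; they never write into `info`
  auto end = std::chrono::steady_clock::now();
  info.total_time =
      std::chrono::duration<double, std::milli>(end - start).count();
  info.device_memory_bytes = RecordedAllocBytes();  // queried once, at the end
  return info;
}

int main() {
  CostInfo info = Profile([] { g_recorded_alloc_bytes += 1024; });
  std::cout << info.total_time << " ms, " << info.device_memory_bytes
            << " bytes" << std::endl;
  return 0;
}
```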
@@ -1979,12 +1979,8 @@ All parameter, weight, gradient are variables in Paddle.
   py::class_<framework::CostInfo>(m, "CostInfo")
       .def(py::init<>())
       .def("total_time", [](CostInfo &self) { return self.total_time; })
-      .def("host_memory_bytes",
-           [](CostInfo &self) { return self.host_memory_bytes; })
       .def("device_memory_bytes",
-           [](CostInfo &self) { return self.device_memory_bytes; })
-      .def("device_total_memory_bytes",
-           [](CostInfo &self) { return self.device_total_memory_bytes; });
+           [](CostInfo &self) { return self.device_memory_bytes; });
   py::class_<framework::StandaloneExecutor>(m, "StandaloneExecutor")
       .def(py::init<const platform::Place &, const ProgramDesc &,
@@ -79,18 +79,11 @@ class LinearTestCase(unittest.TestCase):
         IS_WINDOWS = sys.platform.startswith('win')
         if core.is_compiled_with_cuda():
-            # input `a` is on CPU, 16 bytes
-            self.assertEqual(cost_info.host_memory_bytes(), 16)
             # # w,bias,b, out, memory block is at least 256 bytes on Linux
             gt = 16 * 4 if IS_WINDOWS else 256 * 4
             self.assertGreater(cost_info.device_memory_bytes(), gt)
-            self.assertGreaterEqual(cost_info.device_total_memory_bytes(),
-                                    cost_info.device_memory_bytes())
         else:
-            # x(16 bytes), w(16 bytes), bias(8 bytes), b(16 bytes), out(16 bytes)
-            self.assertGreaterEqual(cost_info.host_memory_bytes(), 72)
             self.assertEqual(cost_info.device_memory_bytes(), 0)
-            self.assertGreaterEqual(cost_info.device_total_memory_bytes(), 0)
 def build_program():