Unverified commit 8b2c906a, authored by Aurelius84, committed by GitHub

Simplify constructor of InterpreterCore (#37072)

* Simplify constructor of InterpreterCore

* fix bool

* clean code
Parent: 76d2fd1d
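In short, the constructor loses the feed_names parameter and takes the block by const reference; feed names now travel with each Run()/DryRun() call instead of being constructor state. A rough before/after sketch of a call site, mirroring the StandaloneExecutor hunk below (the include path and surrounding setup are assumptions, not part of this diff):

```cpp
#include <string>
#include <vector>

// Assumed include path for the new executor; adjust to the local tree.
#include "paddle/fluid/framework/new_executor/interpretercore.h"

namespace pf = paddle::framework;

pf::FetchList BuildAndRun(const paddle::platform::Place& place,
                          const pf::BlockDesc& block,
                          pf::VariableScope* global_scope,
                          const std::vector<std::string>& feed_names,
                          const std::vector<pf::LoDTensor>& feed_tensors) {
  // Before #37072 the signature was
  //   InterpreterCore(place, block_ptr, global_scope, feed_names);
  // After #37072 the block is passed by const reference and the feed names go
  // to Run() together with the feed tensors.
  pf::InterpreterCore core(place, block, global_scope);
  return core.Run(feed_names, feed_tensors);
}
```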
@@ -16,9 +16,8 @@
namespace paddle {
namespace framework {
void EventManager::WaitEvent(const Instruction& instruction,
const platform::Place& place) {
namespace interpreter {
void WaitEvent(const Instruction& instruction, const platform::Place& place) {
// If InterpreterCore in on CPUPlace, do nothing.
if (platform::is_cpu_place(place)) return;
@@ -32,8 +31,7 @@ void EventManager::WaitEvent(const Instruction& instruction,
}
}
void EventManager::RecordEvent(const Instruction& instruction,
const platform::Place& place) {
void RecordEvent(const Instruction& instruction, const platform::Place& place) {
// If InterpreterCore in on CPUPlace, do nothing.
if (platform::is_cpu_place(place)) return;
@@ -43,5 +41,6 @@ void EventManager::RecordEvent(const Instruction& instruction,
}
}
} // namespace interpreter
} // namespace framework
} // namespace paddle
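With the EventManager class dissolved, WaitEvent and RecordEvent become free functions in framework::interpreter, so InterpreterCore no longer needs an event_manager_ member; the call-site change appears in the RunInstructionAsync hunk further down. A compressed sketch of that pattern (the surrounding function body is illustrative, not taken from this diff):

```cpp
// Assumed include path; the Instruction and Place types come from this diff.
#include "paddle/fluid/framework/new_executor/event_manager.h"

namespace paddle {
namespace framework {

void RunGuardedByEvents(const Instruction& instr_node,
                        const platform::Place& place) {
  // Old form: event_manager_.WaitEvent(instr_node, place_);
  interpreter::WaitEvent(instr_node, place);    // wait for the op's input events

  // ... run the instruction's kernel here ...

  // Old form: event_manager_.RecordEvent(instr_node, place_);
  interpreter::RecordEvent(instr_node, place);  // record events for downstream ops
}

}  // namespace framework
}  // namespace paddle
```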
@@ -17,14 +17,11 @@
namespace paddle {
namespace framework {
namespace interpreter {
void RecordEvent(const Instruction& instruction, const platform::Place& place);
class EventManager {
public:
void RecordEvent(const Instruction& instruction,
const platform::Place& place);
void WaitEvent(const Instruction& instruction, const platform::Place& place);
};
void WaitEvent(const Instruction& instruction, const platform::Place& place);
} // namespace interpreter
} // namespace framework
} // namespace paddle
@@ -33,9 +33,9 @@ namespace framework {
// NOTE(Aurelius84): Need a better strategy to determine it.
static constexpr size_t kHostNumThreads = 4;
InterpreterCore::InterpreterCore(const platform::Place& place, BlockDesc* block,
VariableScope* global_scope,
const std::vector<std::string>& feed_names)
InterpreterCore::InterpreterCore(const platform::Place& place,
const BlockDesc& block,
VariableScope* global_scope)
: place_(place),
block_(block),
global_scope_(global_scope),
@@ -45,8 +45,6 @@ InterpreterCore::InterpreterCore(const platform::Place& place, BlockDesc* block,
new interpreter::AsyncWorkQueue(kHostNumThreads, &main_thread_blocker_));
gc_.reset(new InterpreterCoreGarbageCollector());
feed_names_ = feed_names;
exception_notifier_ = main_thread_blocker_.RegisterEvent(
kExceptionCaught, [this]() { return exception_holder_.IsCaught(); });
@@ -65,27 +63,12 @@ InterpreterCore::~InterpreterCore() {
}
paddle::framework::FetchList InterpreterCore::Run(
const std::vector<std::string>& feed_names,
const std::vector<framework::LoDTensor>& feed_tensors) {
auto FeedInput = [&] {
for (size_t i = 0; i < feed_names_.size(); ++i) {
auto* feed_var = global_scope_->Var(feed_names_[i]);
auto feed_tensor = feed_var->GetMutable<framework::LoDTensor>();
feed_tensor->ShareDataWith(feed_tensors[i]);
feed_tensor->set_lod(feed_tensors[i].lod());
}
};
bool is_build = is_build_;
Prepare(feed_names, feed_tensors, is_build);
if (is_build_ == false) {
paddle::framework::interpreter::build_variable_scope(*block_,
global_scope_);
FeedInput();
paddle::framework::interpreter::build_op_func_list(
place_, *block_, &vec_func_list_, global_scope_);
is_build_ = true;
// convert vec func_list to graph
Convert();
} else {
FeedInput();
if (is_build) {
ExecuteInstructionList(vec_instruction_);
}
@@ -95,9 +78,9 @@ paddle::framework::FetchList InterpreterCore::Run(
}
void InterpreterCore::Convert() {
auto& vec_meta_info = global_scope_->MutableVecMetaInfo();
auto var_nums = global_scope_->VarSize();
input_var2op_info_.resize(var_nums);
vec_meta_info_.resize(var_nums);
auto op_nums = vec_func_list_.size();
vec_instruction_.reserve(op_nums);
@@ -136,7 +119,7 @@ void InterpreterCore::Convert() {
gc_check_input_list.erase(last, gc_check_input_list.end());
for (auto var_id : gc_check_input_list) {
vec_meta_info_[var_id].var_ref_count_++;
vec_meta_info[var_id].var_ref_count_++;
instr.AddGCCheckVar(var_id);
}
}
@@ -148,7 +131,7 @@ void InterpreterCore::Convert() {
if (input_var2op_info_.at(id).size() == 0) {
// output var not be used by any kernel
vec_instruction_[i].AddGCCheckVar(id);
vec_meta_info_[id].var_ref_count_++;
vec_meta_info[id].var_ref_count_++;
}
}
}
@@ -180,7 +163,7 @@ void InterpreterCore::Convert() {
}
for (size_t i = 0; i < vec_instruction_.size(); ++i) {
BuildAndCacheInstructionCtx(&vec_instruction_[i], *global_scope_, place_);
BuildAndCacheInstructionCtx(&vec_instruction_[i]);
}
BuildSkipShareLoDInfo();
@@ -248,16 +231,14 @@ void InterpreterCore::BuildInplace() {
}
}
void InterpreterCore::BuildAndCacheInstructionCtx(
Instruction* instr_node, const VariableScope& var_scope,
const platform::Place& place) {
void InterpreterCore::BuildAndCacheInstructionCtx(Instruction* instr_node) {
VariableValueMap ins_map;
for (auto& var_name_item : instr_node->Inputs()) {
std::vector<Variable*> input_vars;
input_vars.reserve(var_name_item.second.size());
for (auto& id : var_name_item.second) {
input_vars.emplace_back(var_scope.Var(id));
input_vars.emplace_back(global_scope_->Var(id));
}
ins_map.emplace(var_name_item.first, std::move(input_vars));
}
@@ -268,7 +249,7 @@ void InterpreterCore::BuildAndCacheInstructionCtx(
out_vars.reserve(var_name_item.second.size());
for (auto& id : var_name_item.second) {
out_vars.emplace_back(var_scope.Var(id));
out_vars.emplace_back(global_scope_->Var(id));
}
outs_map.emplace(var_name_item.first, std::move(out_vars));
}
@@ -359,7 +340,7 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
void InterpreterCore::ExecuteInstructionList(
const std::vector<Instruction>& vec_instr) {
async_work_queue_->PrepareAtomicDeps(dependecy_count_);
async_work_queue_->PrepareAtomicVarRef(vec_meta_info_);
async_work_queue_->PrepareAtomicVarRef(global_scope_->VecMetaInfo());
op_run_number_ = 0;
exception_holder_.Clear();
@@ -452,7 +433,7 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id) {
auto& instr_node = vec_instruction_.at(instr_id);
auto* op = instr_node.OpBase();
platform::RecordEvent instruction_event(op->Type());
event_manager_.WaitEvent(instr_node, place_);
interpreter::WaitEvent(instr_node, place_);
try {
RunInstruction(instr_node);
@@ -479,7 +460,7 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id) {
return;
}
event_manager_.RecordEvent(instr_node, place_);
interpreter::RecordEvent(instr_node, place_);
op_run_number_.fetch_add(1, std::memory_order_relaxed);
// GC infomation
@@ -508,11 +489,18 @@ void InterpreterCore::CheckGC(const Instruction& instr) {
}
}
void InterpreterCore::DryRunPrepare(
const std::vector<framework::LoDTensor>& feed_tensors) {
void InterpreterCore::Prepare(
const std::vector<std::string>& feed_names,
const std::vector<framework::LoDTensor>& feed_tensors, bool prepare_feed) {
PADDLE_ENFORCE_EQ(feed_names.size(), feed_tensors.size(),
platform::errors::PreconditionNotMet(
"Required feed_names.size() == feed_tensors.size(), "
"but received %d != %d",
feed_names.size(), feed_tensors.size()));
auto FeedInput = [&] {
for (size_t i = 0; i < feed_names_.size(); ++i) {
auto* feed_var = global_scope_->FindVar(feed_names_[i]);
for (size_t i = 0; i < feed_names.size(); ++i) {
auto* feed_var = global_scope_->FindVar(feed_names[i]);
PADDLE_ENFORCE_NOT_NULL(feed_var, platform::errors::NotFound(
"feed_var shall not be nullptr."));
@@ -522,35 +510,33 @@ void InterpreterCore::DryRunPrepare(
}
};
if (is_build_ == false) {
paddle::framework::interpreter::build_variable_scope(*block_,
global_scope_);
if (!is_build_) {
paddle::framework::interpreter::build_variable_scope(block_, global_scope_);
FeedInput();
paddle::framework::interpreter::build_op_func_list(
place_, *block_, &vec_func_list_, global_scope_);
place_, block_, &vec_func_list_, global_scope_);
is_build_ = true;
// convert vec func_list to graph
Convert();
}
// NOTE: Because feed_tensor will be GC after
// paddle::framework::build_op_func_list, so we should
// call
// FeedInput again.
FeedInput();
// call FeedInput again.
if (prepare_feed) FeedInput();
}
const CostInfo& InterpreterCore::DryRun(
interpreter::CostInfo InterpreterCore::DryRun(
const std::vector<std::string>& feed_names,
const std::vector<framework::LoDTensor>& feed_tensors) {
DryRunPrepare(feed_tensors);
// DryRun may be called many times.
dry_run_profiler_.Reset();
dry_run_profiler_.Start();
ExecuteInstructionList(vec_instruction_);
platform::DeviceContextPool::Instance().Get(place_)->Wait();
dry_run_profiler_.Pause();
dry_run_profiler_.TotalCUDAAllocatedMemorySize(place_);
return dry_run_profiler_.GetCostInfo();
Prepare(feed_names, feed_tensors, true);
interpreter::CostInfo cost_info;
{
interpreter::ProfilerGuard(place_, &cost_info);
ExecuteInstructionList(vec_instruction_);
platform::DeviceContextPool::Instance().Get(place_)->Wait();
}
return cost_info;
}
} // namespace framework
@@ -40,23 +40,23 @@ using AtomicVectorSizeT = std::vector<std::unique_ptr<std::atomic<size_t>>>;
class InterpreterCore {
public:
InterpreterCore(const platform::Place& place, BlockDesc* block,
VariableScope* global_scope,
const std::vector<std::string>& feed_names);
InterpreterCore(const platform::Place& place, const BlockDesc& block,
VariableScope* global_scope);
~InterpreterCore();
paddle::framework::FetchList Run(
const std::vector<std::string>& feed_names,
const std::vector<framework::LoDTensor>& feed_tensors);
const CostInfo& DryRun(const std::vector<framework::LoDTensor>& feed_tensors);
interpreter::CostInfo DryRun(
const std::vector<std::string>& feed_names,
const std::vector<framework::LoDTensor>& feed_tensors);
private:
void Convert();
void BuildAndCacheInstructionCtx(Instruction* instr_node,
const VariableScope& var_scope,
const platform::Place& place);
void BuildAndCacheInstructionCtx(Instruction* instr_node);
void BuildInplace();
@@ -66,7 +66,9 @@ class InterpreterCore {
void ExecuteInstructionList(const std::vector<Instruction>& vec_instr);
void DryRunPrepare(const std::vector<framework::LoDTensor>& feed_tensors);
void Prepare(const std::vector<std::string>& feed_names,
const std::vector<framework::LoDTensor>& feed_tensors,
bool prepare_feed);
void CheckGC(const Instruction& instr);
@@ -79,22 +81,17 @@
bool is_build_;
const platform::Place& place_;
BlockDesc* block_; // not owned
const BlockDesc& block_; // not owned
VariableScope* global_scope_; // not owned
std::vector<paddle::framework::OpFuncNode> vec_func_list_;
std::vector<Instruction> vec_instruction_; // deconstruct before OpFuncNode
InstructionInfo instruction_info_;
std::vector<size_t> dependecy_count_;
std::atomic<size_t> op_run_number_{0};
std::vector<std::vector<size_t>> input_var2op_info_;
std::vector<VariableMetaInfo> vec_meta_info_;
std::vector<std::string> feed_names_;
InterpreterProfiler dry_run_profiler_;
StreamAnalyzer stream_analyzer_;
EventManager event_manager_;
EventsWaiter main_thread_blocker_;
std::unique_ptr<interpreter::AsyncWorkQueue> async_work_queue_;
details::ExceptionHolder exception_holder_;
@@ -102,7 +99,6 @@
std::unique_ptr<InterpreterCoreGarbageCollector> gc_;
std::vector<paddle::platform::DeviceEvent> gc_event_;
std::atomic<size_t> op_run_number_{0};
};
} // namespace framework
} // namespace paddle
@@ -607,6 +607,12 @@ class VariableScope : public ScopeBase {
platform::errors::NotFound("%s not in VariableScope.", name));
}
std::vector<VariableMetaInfo>& MutableVecMetaInfo() { return vec_meta_info_; }
const std::vector<VariableMetaInfo>& VecMetaInfo() const {
return vec_meta_info_;
}
private:
std::vector<Variable*> var_list_;
std::map<std::string, int> name2id_;
@@ -20,84 +20,41 @@
namespace paddle {
namespace framework {
static void GetTensors(Variable* var, std::unordered_set<Tensor*>* tensor_set) {
if (var->IsType<LoDTensor>() && var->Get<LoDTensor>().IsInitialized()) {
tensor_set->insert(var->GetMutable<LoDTensor>());
} else if (var->IsType<SelectedRows>() &&
var->Get<SelectedRows>().value().IsInitialized()) {
tensor_set->insert(var->GetMutable<SelectedRows>()->mutable_value());
} else if (var->IsType<LoDTensorArray>()) {
auto* tensor_arr = var->GetMutable<LoDTensorArray>();
for (auto& t : *tensor_arr) {
if (t.IsInitialized()) {
tensor_set->insert(&t);
}
}
}
}
static std::pair<size_t, size_t> GetTensorMemorySize(
const std::vector<Variable*>& var_list) {
std::unordered_set<Tensor*> tensor_set;
for (auto* var : var_list) {
GetTensors(var, &tensor_set);
}
size_t host_memory_bytes = 0;
size_t device_memory_bytes = 0;
std::unordered_set<memory::Allocation*> allocation_set;
for (auto* tensor : tensor_set) {
auto allocation = tensor->Holder().get();
if (!allocation_set.count(allocation)) {
allocation_set.insert(allocation);
if (platform::is_cuda_pinned_place(tensor->place()) ||
platform::is_cpu_place(tensor->place())) {
VLOG(3) << "found host memory : " << allocation->size();
host_memory_bytes += allocation->size();
} else {
VLOG(3) << "found device memory : " << allocation->size();
device_memory_bytes += allocation->size();
}
}
}
return {host_memory_bytes, device_memory_bytes};
}
namespace interpreter {
struct CostInfo {
double total_time{0.}; // ms
size_t device_memory_bytes{0}; // total allocated memory size
};
class InterpreterProfiler {
class ProfilerGuard {
public:
void Start() { timer_.Start(); }
void Pause() {
timer_.Pause();
cost_info_.total_time += timer_.ElapsedMS();
ProfilerGuard(const platform::Place& place, CostInfo* cost_info)
: place_(place), cost_info_(cost_info) {
timer_.Start();
}
void Reset() {
timer_.Reset();
cost_info_.total_time = 0.;
cost_info_.device_memory_bytes = 0;
~ProfilerGuard() {
timer_.Pause();
cost_info_->total_time += timer_.ElapsedMS();
TotalCUDAAllocatedMemorySize(place_);
}
private:
void TotalCUDAAllocatedMemorySize(const platform::Place& place) {
if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, place);
cost_info_.device_memory_bytes =
cost_info_->device_memory_bytes =
platform::RecordedCudaMallocSize(cuda_place.device);
#endif
}
}
const CostInfo& GetCostInfo() const { return cost_info_; }
private:
const platform::Place& place_;
CostInfo* cost_info_;
platform::Timer timer_;
CostInfo cost_info_;
};
} // namespace interpreter
} // namespace framework
} // namespace paddle
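DryRun now gathers its statistics through the RAII ProfilerGuard above instead of a long-lived InterpreterProfiler member. A minimal usage sketch follows, with the guard named so its destructor fires when the timed scope ends (the include path and the work callback are assumptions):

```cpp
#include <functional>

// Assumed include path for the CostInfo / ProfilerGuard definitions above.
#include "paddle/fluid/framework/new_executor/profiler.h"

paddle::framework::interpreter::CostInfo TimeOnce(
    const paddle::platform::Place& place, const std::function<void()>& work) {
  paddle::framework::interpreter::CostInfo cost_info;
  {
    // The constructor starts the timer; the destructor pauses it and fills in
    // total_time and, on GPU places, device_memory_bytes.
    paddle::framework::interpreter::ProfilerGuard guard(place, &cost_info);
    work();
  }
  return cost_info;
}
```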
@@ -51,16 +51,15 @@ paddle::framework::FetchList StandaloneExecutor::Run(
const std::vector<std::string>& fetch_names) {
auto core = GetInterpreterCore(feed_names, fetch_names);
return core->Run(feed_tensors);
return core->Run(feed_names, feed_tensors);
}
const CostInfo& StandaloneExecutor::DryRun(
framework::interpreter::CostInfo StandaloneExecutor::DryRun(
const std::vector<std::string>& feed_names,
const std::vector<framework::LoDTensor>& feed_tensors) {
auto core = GetInterpreterCore(feed_names, {});
auto& cost_info = core->DryRun(feed_tensors);
return cost_info;
return core->DryRun(feed_names, feed_tensors);
}
void StandaloneExecutor::BuildVariableOuterScope(
@@ -102,8 +101,8 @@ std::shared_ptr<InterpreterCore> StandaloneExecutor::GetInterpreterCore(
auto* block = new_prog->MutableBlock(0);
interpreter::add_fetch(fetch_names, block);
auto core = std::make_shared<InterpreterCore>(place_, block, &global_scope_,
feed_names);
auto core =
std::make_shared<InterpreterCore>(place_, *block, &global_scope_);
programs_.emplace(oss.str(), new_prog);
interpretercores_.emplace(oss.str(), core);
return core;
@@ -45,8 +45,9 @@ class StandaloneExecutor : public ExecutorBase {
const std::vector<framework::LoDTensor>& feed_tensors,
const std::vector<std::string>& fetch_names);
const CostInfo& DryRun(const std::vector<std::string>& feed_names,
const std::vector<framework::LoDTensor>& feed_tensors);
framework::interpreter::CostInfo DryRun(
const std::vector<std::string>& feed_names,
const std::vector<framework::LoDTensor>& feed_tensors);
private:
void BuildVariableOuterScope(const framework::ProgramDesc& pdesc,
@@ -2069,11 +2069,13 @@ All parameter, weight, gradient are variables in Paddle.
fetch_vars);
});
py::class_<framework::CostInfo>(m, "CostInfo")
py::class_<framework::interpreter::CostInfo>(m, "CostInfo")
.def(py::init<>())
.def("total_time", [](CostInfo &self) { return self.total_time; })
.def("device_memory_bytes",
[](CostInfo &self) { return self.device_memory_bytes; });
.def("total_time",
[](interpreter::CostInfo &self) { return self.total_time; })
.def("device_memory_bytes", [](interpreter::CostInfo &self) {
return self.device_memory_bytes;
});
py::class_<framework::StandaloneExecutor>(m, "StandaloneExecutor")
.def(py::init<const platform::Place &, const ProgramDesc &,
@@ -2134,7 +2136,7 @@ All parameter, weight, gradient are variables in Paddle.
feed_tensors.push_back(t);
}
CostInfo cost_info;
framework::interpreter::CostInfo cost_info;
{
pybind11::gil_scoped_release release;
cost_info = self.DryRun(feed_names, feed_tensors);