Unverified commit 238f82e6 authored by Leo Chen, committed by GitHub

[new-exec] lazy creating work queue (#43551)

* lazy creating work queue

* fix dry_run
Parent 39d2c89c
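Before this change, the AsyncWorkQueue (and its worker threads) was constructed eagerly in the InterpreterCore constructor, and its atomic bookkeeping was prepared in Convert(). The commit defers both until the run that actually executes the instruction list, so a program that is built and run only once never pays for the thread pool. Below is a minimal sketch of the pattern; WorkQueue, the Prepare* bodies, and the thread counts are illustrative stand-ins, not the exact Paddle API beyond what the diff shows.

#include <cstddef>
#include <memory>

// Illustrative stand-in for interpreter::AsyncWorkQueue; the real class
// also takes an event blocker and spawns host/device worker threads.
class WorkQueue {
 public:
  WorkQueue(size_t host_threads, size_t device_threads)
      : host_threads_(host_threads), device_threads_(device_threads) {}
  void PrepareAtomicDeps() { /* snapshot per-op dependency counters */ }
  void PrepareAtomicVarRef() { /* snapshot per-variable ref counters */ }

 private:
  size_t host_threads_;
  size_t device_threads_;
};

class Core {
 public:
  Core() = default;  // no thread pool is created here any more

  void Run() {
    if (first_run_) {
      // The first run builds and executes ops directly and does not
      // need the asynchronous queue.
      first_run_ = false;
      return;
    }
    // Lazily create the queue only when the instruction list is
    // executed asynchronously, i.e. from the second run onwards.
    if (work_queue_ == nullptr) {
      work_queue_ = std::make_unique<WorkQueue>(/*host=*/4, /*device=*/1);
      // Prepare once, right after creation, instead of in Convert().
      work_queue_->PrepareAtomicDeps();
      work_queue_->PrepareAtomicVarRef();
    }
    // ... dispatch instructions through work_queue_ ...
  }

 private:
  bool first_run_ = true;
  std::unique_ptr<WorkQueue> work_queue_;
};

The same guarded creation block is duplicated at every call site in the diff (both Run overloads and DryRun), which is why the dry_run fix is part of the same commit.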
@@ -63,8 +63,6 @@ InterpreterCore::InterpreterCore(const platform::Place& place,
stream_analyzer_(place) {
VLOG(4) << "InterpreterCore(): " << this << " on " << place_;
is_build_ = false;
async_work_queue_.reset(new interpreter::AsyncWorkQueue(
kHostNumThreads, kDeviceNumThreads, &main_thread_blocker_));
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (IsInterpretercoreFastGCEnabled()) {
@@ -127,6 +125,17 @@ paddle::framework::FetchList InterpreterCore::Run(
// add listener before run and is_build=true
global_scope_->ResetListener();
// For a program that runs only once, there is no need to
// create the work_queue, so the creation of async_work_queue_
// is deferred until the second run.
if (async_work_queue_ == nullptr) {
async_work_queue_ = std::make_unique<interpreter::AsyncWorkQueue>(
kHostNumThreads, kDeviceNumThreads, &main_thread_blocker_);
// prepare for the first time.
async_work_queue_->PrepareAtomicDeps(dependecy_count_);
async_work_queue_->PrepareAtomicVarRef(global_scope_->VecMetaInfo());
}
ExecuteInstructionList(vec_instruction_);
}
@@ -174,6 +183,17 @@ paddle::framework::FetchList InterpreterCore::Run(
// add listener before run and is_build=true
global_scope_->ResetListener();
// For a program that runs only once, there is no need to
// create the work_queue, so the creation of async_work_queue_
// is deferred until the second run.
if (async_work_queue_ == nullptr) {
async_work_queue_ = std::make_unique<interpreter::AsyncWorkQueue>(
kHostNumThreads, kDeviceNumThreads, &main_thread_blocker_);
// prepare for the first time.
async_work_queue_->PrepareAtomicDeps(dependecy_count_);
async_work_queue_->PrepareAtomicVarRef(global_scope_->VecMetaInfo());
}
ExecuteInstructionList(vec_instruction_);
}
@@ -343,10 +363,6 @@ void InterpreterCore::Convert(
if (FLAGS_new_executor_use_inplace && !inplaced) {
BuildInplace();
}
// prepare for the first time.
async_work_queue_->PrepareAtomicDeps(dependecy_count_);
async_work_queue_->PrepareAtomicVarRef(vec_meta_info);
}
bool InterpreterCore::BuildInplaceCheckVarIsOnlyInput(size_t var_index) {
@@ -940,6 +956,18 @@ interpreter::CostInfo InterpreterCore::DryRun(
interpreter::CostInfo cost_info;
{
interpreter::ProfilerGuard(place_, &cost_info);
// For a program that runs only once, there is no need to
// create the work_queue, so the creation of async_work_queue_
// is deferred until the second run.
if (async_work_queue_ == nullptr) {
async_work_queue_ = std::make_unique<interpreter::AsyncWorkQueue>(
kHostNumThreads, kDeviceNumThreads, &main_thread_blocker_);
// prepare for the first time.
async_work_queue_->PrepareAtomicDeps(dependecy_count_);
async_work_queue_->PrepareAtomicVarRef(global_scope_->VecMetaInfo());
}
ExecuteInstructionList(vec_instruction_);
platform::DeviceContextPool::Instance().Get(place_)->Wait();
}