提交 5fa535b7 编写于 作者: Y Yu Yang

Wait for all worker threads to finish

上级 7bff02b2
......@@ -699,8 +699,11 @@ void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
pending_ops.insert({op, op->inputs_.size()});
}
std::vector<std::future<void>> op_threads;
op_threads.reserve(pending_ops.size() + to_run.size());
for (auto *op : to_run) {
RunOp(pending_vars, op);
op_threads.emplace_back(RunOp(pending_vars, op));
}
while (!pending_ops.empty()) {
......@@ -731,15 +734,20 @@ void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
}
for (auto *op : to_run) {
pending_ops.erase(op);
RunOp(pending_vars, op);
op_threads.emplace_back(RunOp(pending_vars, op));
}
}
for (auto &t : op_threads) {
t.get(); // Join all workers
}
fetch_ops.clear();
*member_->global_scope_->Var(fetched_var_name)->GetMutable<LoDTensorArray>() =
fetched_data->tensors_;
}
void ParallelExecutor::RunOp(
std::future<void> ParallelExecutor::RunOp(
std::unordered_map<VarHandleBase *, GuardedBool> &pending_vars,
OpHandle *op) const {
std::vector<GuardedBool *> *ready_buffer = new std::vector<GuardedBool *>();
......@@ -760,7 +768,7 @@ void ParallelExecutor::RunOp(
LOG(FATAL) << "Unknown exception catched";
}
};
member_->pool_.enqueue(op_run);
return member_->pool_.enqueue(op_run);
}
} // namespace framework
} // namespace paddle
......@@ -14,8 +14,8 @@ limitations under the License. */
#pragma once
#include <future>
#include <unordered_set>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/program_desc.h"
......@@ -81,7 +81,8 @@ class ParallelExecutor {
void BuildNCCLCommunicator() const;
void RunOp(std::unordered_map<VarHandleBase*, GuardedBool>& pending_vars,
std::future<void> RunOp(
std::unordered_map<VarHandleBase*, GuardedBool>& pending_vars,
OpHandle* op) const;
void PolishGraphToSupportDataHarzaeds() const;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册