Sync all computation streams at the end of run

c18c2f6a · Yu Yang · c372ce28 · c18c2f6a · c18c2f6a
隐藏空白更改
内联并排

Showing 2 changed files with 10 additions and 4 deletions

paddle/fluid/framework/parallel_executor.cc +9 -3

paddle/fluid/framework/parallel_executor.h +1 -1

未找到文件。
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -482,7 +482,6 @@ void ParallelExecutor::ConstructDependencyGraph(
  bool is_forwarding = true;
  for (auto *op : main_program.Block(0).AllOps()) {
    bool change_forward = false;
    if (!is_forwarding) {
      // FIXME(yy): Do not hard code like this
      if (op->OutputArgumentNames().size() == 1 &&
@@ -573,7 +572,7 @@ void ParallelExecutor::ConstructDependencyGraph(
    Dependency graph has been constructed. However, there are still data
    hazards that need to be handled.
   */
-  PolishGraphToSupportDataHarzaeds();
+  PolishGraphToSupportDataHazards();
 }
 /**
@@ -583,7 +582,7 @@ void ParallelExecutor::ConstructDependencyGraph(
 *
 * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR)
 */
-void ParallelExecutor::PolishGraphToSupportDataHarzaeds() const {
+void ParallelExecutor::PolishGraphToSupportDataHazards() const {
  for (auto &place_pair : member_->vars_) {
    for (auto &name_pair : place_pair.second) {
      if (name_pair.second.size() <= 1) {
@@ -813,6 +812,13 @@ void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
  fetch_ops.clear();
  *member_->global_scope_->Var(fetched_var_name)->GetMutable<LoDTensorArray>() =
      fetched_data->tensors_;
+  // FIXME:
+  // This could be optimized by using multiple events within an operator.
+  // For now, manually sync the computation on every place at the end of
+  // each iteration.
+  for (auto &p : member_->places_) {
+    platform::DeviceContextPool::Instance().Get(p)->Wait();
+  }
 }
 void ParallelExecutor::RunOp(

--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -65,7 +65,7 @@ class ParallelExecutor {
      std::unordered_map<VarHandleBase*, std::atomic<bool>>& pending_vars,
      OpHandle* op) const;
-  void PolishGraphToSupportDataHarzaeds() const;
+  void PolishGraphToSupportDataHazards() const;
 };
 }  // namespace framework