diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 409cb3fbb919e535f9b66621a59db9bb27f46678..6408ecdd376494cc9988666f12037f62ce4e183c 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -813,6 +813,16 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, fetch_ops.clear(); *member_->global_scope_->Var(fetched_var_name)->GetMutable() = fetched_data->tensors_; + // FIXME: + // It could be optimized by using multiple events in an operator. + // Manually sync computation during iter. + for (auto &s : member_->communication_streams_) { + s.second.ctx_->Wait(); + } + + for (auto &p : member_->places_) { + platform::DeviceContextPool::Instance().Get(p)->Wait(); + } } void ParallelExecutor::RunOp(