diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 0f9bc869725d496ac46b8aea704269c141ba6816..f4f5ab6a6f7c87575a9429321dc974ad9b50c7e6 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -816,6 +816,10 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, // FIXME: // It could be optimized by using multiple events in an operator. // Manually sync computation during iter. + for (auto &s : member_->communication_streams_) { + s.second.ctx_->Wait(); + } + for (auto &p : member_->places_) { platform::DeviceContextPool::Instance().Get(p)->Wait(); }