diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index cb1b080eea674a06ce06cf15aca58a9be7946294..409cb3fbb919e535f9b66621a59db9bb27f46678 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -813,18 +813,6 @@ void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
   fetch_ops.clear();
   *member_->global_scope_->Var(fetched_var_name)->GetMutable<LoDTensorArray>() =
       fetched_data->tensors_;
-  VLOG(3) << "Before Wait";
-  // FIXME:
-  // It could be optimized by using multiple events in an operator.
-  // Manually sync computation during iter.
-  for (auto &s : member_->communication_streams_) {
-    s.second.ctx_->Wait();
-  }
-
-  for (auto &p : member_->places_) {
-    platform::DeviceContextPool::Instance().Get(p)->Wait();
-  }
-  VLOG(3) << "Done wait";
 }
 
 void ParallelExecutor::RunOp(