diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 1847a4dfa51111f9922ef45b4e9bd443f2758a26..d3122353aff7b1c5a5db779a9d81a7d915f74cfa 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -813,7 +813,7 @@ void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
   fetch_ops.clear();
   *member_->global_scope_->Var(fetched_var_name)->GetMutable<LoDTensorArray>() =
       fetched_data->tensors_;
-
+  VLOG(3) << "Before Wait";
   // FIXME:
   // It could be optimized by using multiple events in an operator.
   // Manually sync computation during iter.
@@ -824,6 +824,7 @@ void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
   for (auto &p : member_->places_) {
     platform::DeviceContextPool::Instance().Get(p)->Wait();
   }
+  VLOG(3) << "Done wait";
 }
 
 void ParallelExecutor::RunOp(