diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index b3bf2b8fb673edb401c5e9463cb524a3f680e9b5..c42101e21a58659b04ce2f171e89c5a82adbe8e8 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -345,6 +345,7 @@ struct NCCLAllReduceOpHandle : public OpHandle {
   }
 
   void Wait(platform::DeviceContext *waited_dev) override {
+    VLOG(3) << "Wait NCCL AllReduce";
     this->dev_ctx_.at(waited_dev->GetPlace())->Wait();
   }
 };
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
index e8976ff052b77ebbf417c571175d93237c22850d..e156d5b60e904ab005d8f593b259620033dd4906 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
@@ -72,12 +72,12 @@ class ParallelExecutor(unittest.TestCase):
         first_loss = numpy.array(fluid.global_scope().find_var('fetched_var')
                                  .get_lod_tensor_array()[0])
         print first_loss
-        #
-        # for i in xrange(10):
-        #     exe.run([], 'fetched_var')
-        # exe.run([loss.name], 'fetched_var')
-        # last_loss = numpy.array(fluid.global_scope().find_var('fetched_var')
-        #                         .get_lod_tensor_array()[0])
-        #
-        # print first_loss, last_loss
-        # self.assertGreater(first_loss[0], last_loss[0])
+
+        for i in xrange(10):
+            exe.run([], 'fetched_var')
+        exe.run([loss.name], 'fetched_var')
+        last_loss = numpy.array(fluid.global_scope().find_var('fetched_var')
+                                .get_lod_tensor_array()[0])
+
+        print first_loss, last_loss
+        self.assertGreater(first_loss[0], last_loss[0])