diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index b3bf2b8fb673edb401c5e9463cb524a3f680e9b5..c42101e21a58659b04ce2f171e89c5a82adbe8e8 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -345,6 +345,7 @@ struct NCCLAllReduceOpHandle : public OpHandle {
   }
 
   void Wait(platform::DeviceContext *waited_dev) override {
+    VLOG(3) << "Wait NCCL AllReduce";
     this->dev_ctx_.at(waited_dev->GetPlace())->Wait();
   }
 };
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
index e8976ff052b77ebbf417c571175d93237c22850d..e156d5b60e904ab005d8f593b259620033dd4906 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
@@ -72,12 +72,12 @@ class ParallelExecutor(unittest.TestCase):
         first_loss = numpy.array(fluid.global_scope().find_var('fetched_var')
                                  .get_lod_tensor_array()[0])
         print first_loss
-        #
-        # for i in xrange(10):
-        #     exe.run([], 'fetched_var')
-        # exe.run([loss.name], 'fetched_var')
-        # last_loss = numpy.array(fluid.global_scope().find_var('fetched_var')
-        #                         .get_lod_tensor_array()[0])
-        #
-        # print first_loss, last_loss
-        # self.assertGreater(first_loss[0], last_loss[0])
+
+        for i in xrange(10):
+            exe.run([], 'fetched_var')
+        exe.run([loss.name], 'fetched_var')
+        last_loss = numpy.array(fluid.global_scope().find_var('fetched_var')
+                                .get_lod_tensor_array()[0])
+
+        print first_loss, last_loss
+        self.assertGreater(first_loss[0], last_loss[0])