# Patch summary. The patch text below is kept exactly as received (collapsed onto one line).
#
# 1) paddle/fluid/framework/parallel_executor.cc -- hunk @@ -420,11 +420,11 @@,
#    located in struct NCCLAllReduceOpHandle according to the hunk header.
#    Removed: the `dev_id` lookup taken from waited_dev->GetPlace(), and the single call
#             cudaStreamWaitEvent(stream, events_[dev_id], 0).
#    Added:   a range-for over every entry of `events_` that calls
#             cudaStreamWaitEvent(stream, ev.second, 0) for each one.
#    Effect:  the stream obtained from `waited_dev` is now made to wait on every event
#             stored in `events_`, not only the one indexed by this device's id.
#             The surrounding `events_.size() > 1` guard is unchanged.
#    NOTE(review): `boost::get(...)` and `static_cast(waited_dev)` carry no template
#    arguments here -- presumably the `<...>` parts were lost when this patch text was
#    extracted; confirm against the upstream file before applying.
#
# 2) python/paddle/fluid/tests/unittests/test_parallel_executor.py
#    Hunk @@ -47,7 +47,7 @@: the loop that stacks fluid.layers.fc calls goes from
#             xrange(2) to xrange(4).
#    Hunk @@ -60,7 +60,7 @@: the list of places iterated to build `act_places` goes from
#             [fluid.CUDAPlace(0), fluid.CUDAPlace(1)] to [fluid.CUDAPlace(0)].
#    NOTE(review): with a single place the test presumably no longer reaches the
#    `events_.size() > 1` branch changed above -- verify that the multi-device wait
#    path is still covered somewhere.
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index d3122353aff7b1c5a5db779a9d81a7d915f74cfa..cb1b080eea674a06ce06cf15aca58a9be7946294 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -420,11 +420,11 @@ struct NCCLAllReduceOpHandle : public OpHandle { } } else { if (events_.size() > 1) { - int dev_id = - boost::get(waited_dev->GetPlace()).device; auto stream = static_cast(waited_dev)->stream(); - PADDLE_ENFORCE(cudaStreamWaitEvent(stream, events_[dev_id], 0)); + for (auto &ev : events_) { + PADDLE_ENFORCE(cudaStreamWaitEvent(stream, ev.second, 0)); + } } } } diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py index c0ec6442de1f0a69958b5c2fed50e34984b025c7..cabb8e769dfcad8401fcfa17d6a43fa5b3656493 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py @@ -47,7 +47,7 @@ class ParallelExecutor(unittest.TestCase): dtypes=['float32', 'int64']) img, label = fluid.layers.read_file(reader) hidden = img - for _ in xrange(2): + for _ in xrange(4): hidden = fluid.layers.fc( hidden, size=200, @@ -60,7 +60,7 @@ class ParallelExecutor(unittest.TestCase): adam = fluid.optimizer.Adam() adam.minimize(loss) act_places = [] - for each in [fluid.CUDAPlace(0), fluid.CUDAPlace(1)]: + for each in [fluid.CUDAPlace(0)]: p = fluid.core.Place() p.set_place(each) act_places.append(p)