From a3e771974be044e8a368a0dce3557df61b7d1c47 Mon Sep 17 00:00:00 2001
From: liuyuhui
Date: Thu, 29 Apr 2021 22:49:31 +0800
Subject: [PATCH] [Kunlun]fix multi xpu dygraph hang, test=kunlun (#32662)

---
 paddle/fluid/imperative/reducer.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc
index a92704ce447..bf479e0d797 100644
--- a/paddle/fluid/imperative/reducer.cc
+++ b/paddle/fluid/imperative/reducer.cc
@@ -762,10 +762,11 @@ void Reducer::MarkGroupReady(size_t group_index) {
       // TODO(liuyuhui): Add try catch to deal with exception later,
       // otherwise the main thread will continue to run when an exception is
       // thrown in comm_pool_.
-      comm_pool_->enqueue([&] {
+      auto next_group = next_group_;
+      comm_pool_->enqueue([this, run_order, next_group, &group] {
         auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place_).device;
         platform::SetXPUDeviceId(dev_id);
-        FusedAllReduceSchedule(run_order, group, next_group_);
+        FusedAllReduceSchedule(run_order, group, next_group);
         {
           std::lock_guard lock(mutex_);
           comm_op_count_ -= 1;  // lock
--
GitLab