From b4e44b0ab289c5f0da72fb6f9083b926c0ccddd9 Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Fri, 10 Dec 2021 11:46:50 +0800 Subject: [PATCH] [fleet_executor] Fix overlap hang (#38024) --- paddle/fluid/distributed/fleet_executor/carrier.h | 5 +++++ .../fluid/distributed/fleet_executor/compute_interceptor.cc | 3 +++ 2 files changed, 8 insertions(+) diff --git a/paddle/fluid/distributed/fleet_executor/carrier.h b/paddle/fluid/distributed/fleet_executor/carrier.h index 0c54201c940..f9411aa73fa 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.h +++ b/paddle/fluid/distributed/fleet_executor/carrier.h @@ -75,6 +75,11 @@ class Carrier final { bool IsInit() const; + // NOTE: This mutex will be used in interceptor's RunOps function. + // This mutex is used for avoiding forward ops and backward ops run + // simultaneously, which will lead to a random hang for some sync ops. + std::mutex run; + DISABLE_COPY_AND_ASSIGN(Carrier); private: diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc index 35905125a0a..98583de84e7 100644 --- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/distributed/fleet_executor/compute_interceptor.h" +#include "paddle/fluid/distributed/fleet_executor/carrier.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" #include "paddle/fluid/framework/executor_gc_helper.h" @@ -169,6 +170,8 @@ void ComputeInterceptor::ReplyCompletedToUpStream() { } void ComputeInterceptor::RunOps() { + Carrier& carrier_instance = Carrier::Instance(); + std::unique_lock lock(carrier_instance.run); VLOG(3) << "ComputeInterceptor " << interceptor_id_ << " running ops for the " << step_ + 1 << " time."; for (auto op : node_->ops()) { -- GitLab