未验证 提交 b4e44b0a 编写于 作者: Y Yuang Liu 提交者: GitHub

[fleet_executor] Fix overlap hang (#38024)

上级 a9bd6f0c
...@@ -75,6 +75,11 @@ class Carrier final { ...@@ -75,6 +75,11 @@ class Carrier final {
bool IsInit() const; bool IsInit() const;
// NOTE: This mutex is used in the interceptor's RunOps function.
// It prevents forward ops and backward ops from running
// simultaneously, which can lead to random hangs for some sync ops.
std::mutex run;
DISABLE_COPY_AND_ASSIGN(Carrier); DISABLE_COPY_AND_ASSIGN(Carrier);
private: private:
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/distributed/fleet_executor/compute_interceptor.h" #include "paddle/fluid/distributed/fleet_executor/compute_interceptor.h"
#include "paddle/fluid/distributed/fleet_executor/carrier.h"
#include "paddle/fluid/distributed/fleet_executor/task_node.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h"
#include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/framework/executor_gc_helper.h"
...@@ -169,6 +170,8 @@ void ComputeInterceptor::ReplyCompletedToUpStream() { ...@@ -169,6 +170,8 @@ void ComputeInterceptor::ReplyCompletedToUpStream() {
} }
void ComputeInterceptor::RunOps() { void ComputeInterceptor::RunOps() {
Carrier& carrier_instance = Carrier::Instance();
std::unique_lock<std::mutex> lock(carrier_instance.run);
VLOG(3) << "ComputeInterceptor " << interceptor_id_ << " running ops for the " VLOG(3) << "ComputeInterceptor " << interceptor_id_ << " running ops for the "
<< step_ + 1 << " time."; << step_ + 1 << " time.";
for (auto op : node_->ops()) { for (auto op : node_->ops()) {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册