From 173c22aec27b1ef2160589da2a59c59cd1448eac Mon Sep 17 00:00:00 2001
From: WangXi
Date: Thu, 26 Nov 2020 16:25:00 +0800
Subject: [PATCH] optimize fast graph executor (#28962)

---
 .../fast_threaded_ssa_graph_executor.cc      | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
index 7f1d3c9b340..18f2332b6ef 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -231,6 +231,23 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
       OpHandleBase *op_to_run = op_queue.back();
       op_queue.pop_back();
 
+      // An op that transfers data across multiple devices may block the
+      // dispatch of other computations. For example:
+      //   step 1: queue=[Share, Allreduce], where Share has high priority
+      //   step 2: Share executes, pending_op=Grad, queue=[Allreduce, Grad]
+      //   step 3: Allreduce runs synchronously. Although Allreduce and Grad
+      //           have no topological dependency, Grad must wait for
+      //           Allreduce to finish before it can be scheduled.
+      // In this scenario, computation and communication may not overlap.
+      // Therefore, dispatch the queued ops before running a multi-device op.
+      if (op_to_run->IsMultiDeviceTransfer()) {
+        while (!op_queue.empty()) {
+          OpHandleBase *post_op = op_queue.back();
+          op_queue.pop_back();
+          RunOpAsync(op_deps, post_op, complete_q);
+        }
+      }
+
       if (!RunOp(op_to_run, complete_q, &complete)) {
         return;
       }
@@ -246,6 +263,9 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
         // first without switching to another thread.
         if (pending_op->GetPriority() == OpHandleBase::Priority::kHighest) {
           op_queue.push_back(pending_op);
+        } else if (pending_op->IsMultiDeviceTransfer()) {
+          // multi-device ops should be scheduled before computing ops
+          op_queue.push_front(pending_op);
         } else {
           if (op_to_run == nullptr) {
             op_to_run = pending_op;
--
GitLab
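
To illustrate the scheduling idea outside the executor, here is a minimal, self-contained C++ sketch of the policy this patch introduces: before running an op that performs a synchronous multi-device transfer, drain the local queue and hand the remaining ops to other threads, so computation can overlap with communication. This is not PaddlePaddle's actual executor; `Op`, `RunOp`, and `RunQueue` are hypothetical stand-ins for `OpHandleBase`, `RunOp`, and `RunOpAsync`, and `std::async` stands in for the executor's thread pool.

```cpp
// Sketch only: models the drain-before-transfer policy from the patch,
// not the real FastThreadedSSAGraphExecutor.
#include <deque>
#include <future>
#include <iostream>
#include <string>
#include <vector>

struct Op {
  std::string name;
  bool is_multi_device_transfer;  // stand-in for IsMultiDeviceTransfer()
};

void RunOp(const Op &op) {
  // Stand-in for the real (possibly blocking) op execution.
  std::cout << "running " << op.name << "\n";
}

void RunQueue(std::deque<Op> queue) {
  std::vector<std::future<void>> workers;
  while (!queue.empty()) {
    Op op_to_run = queue.back();
    queue.pop_back();

    // The patch's core idea: if the next op blocks on cross-device
    // communication, first dispatch every other queued op asynchronously
    // so it is not serialized behind the transfer.
    if (op_to_run.is_multi_device_transfer) {
      while (!queue.empty()) {
        Op post_op = queue.back();
        queue.pop_back();
        workers.push_back(
            std::async(std::launch::async, [post_op] { RunOp(post_op); }));
      }
    }
    RunOp(op_to_run);
  }
  for (auto &w : workers) w.wait();
}

int main() {
  // Mirrors the example in the patch comment: Grad has no dependency on
  // Allreduce, so it should not wait behind the synchronous transfer.
  RunQueue({{"Grad", false}, {"Allreduce", true}});
  return 0;
}
```

The second hunk complements this drain: because ops are popped from the back of the deque, pushing multi-device ops to the front puts them behind high-priority ops but ahead of the compute op the thread retains for itself, so pending transfers are picked up before local compute and, via the drain above, the remaining compute ops run concurrently with the transfer.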