提交 35ebe2ec 编写于 作者: Y Yu Yang

Clean MultiDevicesGraphBuilder

上级 c3c7b7bd
...@@ -89,88 +89,60 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( ...@@ -89,88 +89,60 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
bool is_forwarding = true; bool is_forwarding = true;
for (auto *op : program.Block(0).AllOps()) { for (auto *op : program.Block(0).AllOps()) {
bool change_forward = false; if (op->Type() == "send") {
// append send op if program is distributed trainer main program.
// always use the first device
CreateSendOp(&result, *op);
} else if (IsScaleLossOp(*op)) {
CreateScaleLossGradOp(&result);
is_forwarding = false;
} else {
CreateComputationalOps(&result, *op);
if (!is_forwarding) { if (!is_forwarding) {
// FIXME(yy): Do not hard code like this // Currently, we assume that once gradient is generated, it can be
if (op->OutputArgumentNames().size() == 1 && // broadcast, and each gradient is only broadcast once. But there are no
op->OutputArgumentNames()[0] == GradVarName(loss_var_name_)) { // other cases, for example, we need to adjust the gradient according to
continue; // Drop fill 1. for backward coeff; // the input when we get the gradient, which is not considered at
// present.
for (auto &og : op->OutputArgumentNames()) {
if (IsParameterGradientOnce(og, &og_has_been_broadcast)) {
InsertNCCLAllReduceOp(&result, og);
}
}
} }
} }
// append send op if program is distributed trainer main program.
// always use the first device
if (!is_forwarding && op->Type() == "send") {
auto &p = places_[0];
auto *s = local_scopes_[0];
// FIXME(wuyi): send op always copy from GPU 0
result.ops_.emplace_back(new SendOpHandle(*op, s, p));
// Create inputs for output on original place and no ssa output
// is created for send op.
CreateOpHandleIOs(&result, *op, p, 0);
continue;
} }
for (size_t i = 0; i < places_.size(); ++i) { /*
auto &p = places_[i]; Dependency graph has been constructed. However, there are still data
auto *s = local_scopes_[i]; hazards that need to be handled.
*/
result.ops_.emplace_back(new ComputationOpHandle(*op, s, p)); PolishGraphToSupportDataHazards(&result);
auto *op_handle = result.ops_.back().get();
CreateOpHandleIOs(&result, *op, p, i);
auto var_names = op->OutputArgumentNames();
if (is_forwarding) {
if (var_names.size() == 1 && var_names[0] == loss_var_name_) {
// Insert ScaleCost OpHandle
#ifdef PADDLE_WITH_CUDA
auto *communication_dev_ctx = nccl_ctxs_->DevCtx(p);
#else
auto *communication_dev_ctx =
platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
#endif
op_handle = new ScaleLossGradOpHandle(local_scopes_.size(), s, p,
communication_dev_ctx);
result.ops_.emplace_back(op_handle);
// FIXME: Currently ScaleLossGradOp only use device_count as scale /*
// factor. So it does not depend on any other operators. * Only variables should be the leaves of graph.
// VarHandle *loss = GetVarHandle(loss_var_name, place); */
// loss->pending_ops_.emplace_back(op_handle); AddOutputToLeafOps(&result);
// op_handle->inputs_.emplace_back(loss);
CreateOpOutput(&result, op_handle, GradVarName(loss_var_name_), p, i); if (VLOG_IS_ON(10)) {
change_forward = true; std::ostringstream sout;
} PrintGraphviz(*graph, sout);
} VLOG(10) << sout.str();
} }
if (change_forward) { return std::unique_ptr<SSAGraph>(graph);
is_forwarding = false; }
}
if (!is_forwarding) { void MultiDevSSAGraphBuilder::InsertNCCLAllReduceOp(
auto var_names = op->OutputArgumentNames(); SSAGraph *result, const std::string &og) const {
// Currently, we assume that once a gradient is generated, it can be
// broadcast, and that each gradient is broadcast only once. Other cases
// (for example, adjusting the gradient according to the input when the
// gradient is produced) are not considered at present.
for (auto &og : var_names) {
if (grad_names_.count(og) != 0 &&
og_has_been_broadcast.count(og) == 0) { // is param grad
// Insert NCCL AllReduce Op
og_has_been_broadcast.insert(og);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
result.ops_.emplace_back( result->ops_.emplace_back(
new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_)); new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_));
auto *op_handle = result.ops_.back().get(); auto *op_handle = result->ops_.back().get();
for (size_t i = 0; i < places_.size(); ++i) { for (size_t i = 0; i < places_.size(); ++i) {
auto &p = places_[i]; auto &p = places_[i];
auto &vars = result.vars_[i][og]; auto &vars = result->vars_[i][og];
if (vars.empty()) { // This device has no data. continue. if (vars.empty()) { // This device has no data. continue.
continue; continue;
} }
...@@ -184,30 +156,72 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build( ...@@ -184,30 +156,72 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
#else #else
PADDLE_ENFORCE("Not implemented"); PADDLE_ENFORCE("Not implemented");
#endif #endif
}
bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
const std::string &og,
std::unordered_set<std::string> *og_has_been_broadcast) const {
bool is_pg_once =
grad_names_.count(og) != 0 && og_has_been_broadcast->count(og) == 0;
if (is_pg_once) {
// Insert NCCL AllReduce Op
og_has_been_broadcast->insert(og);
} }
} return is_pg_once;
} }
}
/* void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const {
Dependency graph has been constructed. However, there are still data for (size_t i = 0; i < places_.size(); ++i) {
hazards that need to be handled. // Insert ScaleCost OpHandle
*/ #ifdef PADDLE_WITH_CUDA
PolishGraphToSupportDataHazards(&result); auto *communication_dev_ctx = nccl_ctxs_->DevCtx(places_[i]);
#else
auto *communication_dev_ctx =
platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
#endif
/* auto *op_handle =
* Only variables should be the leaves of graph. new ScaleLossGradOpHandle(local_scopes_.size(), local_scopes_[i],
*/ places_[i], communication_dev_ctx);
AddOutputToLeafOps(&result); result->ops_.emplace_back(op_handle);
if (VLOG_IS_ON(10)) { // FIXME: Currently ScaleLossGradOp only use device_count as scale
std::ostringstream sout; // factor. So it does not depend on any other operators.
PrintGraphviz(*graph, sout); // VarHandle *loss = GetVarHandle(loss_var_name, place);
VLOG(10) << sout.str(); // loss->pending_ops_.emplace_back(op_handle);
// op_handle->inputs_.emplace_back(loss);
CreateOpOutput(result, op_handle, GradVarName(loss_var_name_), places_[i],
i);
} }
}
return std::unique_ptr<SSAGraph>(graph); void MultiDevSSAGraphBuilder::CreateComputationalOps(SSAGraph *result,
} // namespace details const OpDesc &op) const {
for (size_t scope_idx = 0; scope_idx < places_.size(); ++scope_idx) {
auto p = places_[scope_idx];
auto s = local_scopes_[scope_idx];
result->ops_.emplace_back(new ComputationOpHandle(op, s, p));
CreateOpHandleIOs(result, op, p, scope_idx);
}
}
// Appends one SendOpHandle for `op` to result->ops_.
//
// Unlike CreateComputationalOps, no per-device loop is run here: the handle is
// always built from the first entries (index 0) of places_ and local_scopes_,
// and CreateOpHandleIOs is invoked once with place index 0.
//
// result: graph under construction; its ops_ list is appended to.
// op:     description of the send operator to wrap.
void MultiDevSSAGraphBuilder::CreateSendOp(SSAGraph *result,
const OpDesc &op) const {
// Only the first place / local scope is ever used for a send op.
auto &p = places_[0];
auto *s = local_scopes_[0];
// FIXME(wuyi): send op always copy from GPU 0
result->ops_.emplace_back(new SendOpHandle(op, s, p));
// Create inputs for output on original place and no ssa output
// is created for send op.
CreateOpHandleIOs(result, op, p, 0);
}
// Returns true when `op` has exactly one output argument and that argument's
// name equals GradVarName(loss_var_name_); returns false otherwise.
//
// The check is purely name-based (see the FIXME below): it does not inspect
// the operator type.
bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const {
// FIXME(yy): Do not hard code like this
return op.OutputArgumentNames().size() == 1 &&
op.OutputArgumentNames()[0] == GradVarName(loss_var_name_);
}
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -57,6 +57,20 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { ...@@ -57,6 +57,20 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
platform::NCCLContextMap *nccl_ctxs_; platform::NCCLContextMap *nccl_ctxs_;
#endif #endif
// True when `op` has exactly one output argument and it is named
// GradVarName(loss_var_name_).
bool IsScaleLossOp(const OpDesc &op) const;
// Appends a single SendOpHandle for `op`, always built from places_[0] and
// local_scopes_[0].
void CreateSendOp(SSAGraph *result, const OpDesc &op) const;
// Appends one ComputationOpHandle for `op` per entry of places_, calling
// CreateOpHandleIOs for each.
void CreateComputationalOps(SSAGraph *result, const OpDesc &op) const;
// Appends one ScaleLossGradOpHandle per entry of places_ and creates its
// output for GradVarName(loss_var_name_) on that place.
void CreateScaleLossGradOp(SSAGraph *result) const;
// Returns true if `og` is in grad_names_ and is not yet present in
// *og_has_been_broadcast; in that case `og` is also inserted into the set, so
// a later call with the same name returns false.
bool IsParameterGradientOnce(
const std::string &og,
std::unordered_set<std::string> *og_has_been_broadcast) const;
// With PADDLE_WITH_CUDA, appends an NCCLAllReduceOpHandle for the gradient
// variable `og`; otherwise hits PADDLE_ENFORCE("Not implemented").
// NOTE(review): part of the definition is not visible in this view — confirm
// how the per-device variables of `og` are attached to the handle.
void InsertNCCLAllReduceOp(SSAGraph *result, const std::string &og) const;
}; };
} // namespace details } // namespace details
} // namespace framework } // namespace framework
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册