未验证 提交 96058384 编写于 作者: Z Zhou Wei 提交者: GitHub

fix bug of multicard grad ncclAllReduce (#30554)

cherry-pick #30553
fix bug of multi-card grad ncclAllReduce: the gradient accumulators of parameters should be kept in order, otherwise the multi-card ncclAllReduce of grads will be affected.
上级 5844dfe4
......@@ -328,9 +328,13 @@ void BasicEngine::Execute() {
"Cannot find gradient of variable %s", var->Name()));
}
// leaf_accumulators_ : hooks and accumulate-grad for leaf tensor
// leaf_accumulators_ : hooks and accumulate-grad for leaf tensor,
// it should be orderly and not repeated.
if (var->IsLeafGrad()) {
leaf_accumulators_.insert(iter->second.get());
if (std::find(leaf_accumulators_.begin(), leaf_accumulators_.end(),
iter->second.get()) == leaf_accumulators_.end()) {
leaf_accumulators_.push_back(iter->second.get());
}
if (iter->second->HasInnerVar()) {
var = iter->second->InnerVar();
......
......@@ -69,7 +69,9 @@ class BasicEngine : public Engine {
std::vector<std::pair<GradientAccumulator*, std::shared_ptr<VariableWrapper>>>
need_accu_var_list_;
// leaf_accumulators_ is only for leaf tensor(hooks/accumulate grad)
std::unordered_set<GradientAccumulator*> leaf_accumulators_;
// It should be orderly and not repeated, because multiple cards must ensure
// that the order of vars is the same.
std::vector<GradientAccumulator*> leaf_accumulators_;
bool retain_graph_;
};
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册