未验证 提交 96058384 编写于 作者: Z Zhou Wei 提交者: GitHub

fix bug of multicard grad ncclAllReduce (#30554)

cherry-pick #30553
Fix bug of multicard grad ncclAllReduce: the gradient accumulators of parameters should keep their order; otherwise, it will influence the multicard ncclAllReduce of grad.
上级 5844dfe4
...@@ -328,9 +328,13 @@ void BasicEngine::Execute() { ...@@ -328,9 +328,13 @@ void BasicEngine::Execute() {
"Cannot find gradient of variable %s", var->Name())); "Cannot find gradient of variable %s", var->Name()));
} }
// leaf_accumulators_ : hooks and accumulate-grad for leaf tensor // leaf_accumulators_ : hooks and accumulate-grad for leaf tensor,
// it should be orderly and not repeated.
if (var->IsLeafGrad()) { if (var->IsLeafGrad()) {
leaf_accumulators_.insert(iter->second.get()); if (std::find(leaf_accumulators_.begin(), leaf_accumulators_.end(),
iter->second.get()) == leaf_accumulators_.end()) {
leaf_accumulators_.push_back(iter->second.get());
}
if (iter->second->HasInnerVar()) { if (iter->second->HasInnerVar()) {
var = iter->second->InnerVar(); var = iter->second->InnerVar();
......
...@@ -69,7 +69,9 @@ class BasicEngine : public Engine { ...@@ -69,7 +69,9 @@ class BasicEngine : public Engine {
std::vector<std::pair<GradientAccumulator*, std::shared_ptr<VariableWrapper>>> std::vector<std::pair<GradientAccumulator*, std::shared_ptr<VariableWrapper>>>
need_accu_var_list_; need_accu_var_list_;
// leaf_accumulators_ is only for leaf tensor(hooks/accumulate grad) // leaf_accumulators_ is only for leaf tensor(hooks/accumulate grad)
std::unordered_set<GradientAccumulator*> leaf_accumulators_; // It should be orderly and not repeated, because multiple cards must ensure
// that the order of vars is the same.
std::vector<GradientAccumulator*> leaf_accumulators_;
bool retain_graph_; bool retain_graph_;
}; };
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册