From 9605838413d0d96657fbb0691a72f3c186dd0ac1 Mon Sep 17 00:00:00 2001
From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com>
Date: Tue, 19 Jan 2021 21:15:25 +0800
Subject: [PATCH] fix bug of multicard grad ncclAllReduce (#30554)

cherry-pick #30553
Fix a bug in multi-card gradient ncclAllReduce: the gradient accumulators
of the parameters must be kept in a fixed order; otherwise the multi-card
ncclAllReduce of the gradients is affected.
---
 paddle/fluid/imperative/basic_engine.cc | 8 ++++++--
 paddle/fluid/imperative/basic_engine.h  | 4 +++-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc
index df8c9f0bccb..94d66c5d080 100644
--- a/paddle/fluid/imperative/basic_engine.cc
+++ b/paddle/fluid/imperative/basic_engine.cc
@@ -328,9 +328,13 @@ void BasicEngine::Execute() {
                 "Cannot find gradient of variable %s", var->Name()));
           }
 
-          // leaf_accumulators_ : hooks and accumulate-grad for leaf tensor
+          // leaf_accumulators_ : hooks and accumulate-grad for leaf tensor,
+          // it should be ordered and not repeated.
           if (var->IsLeafGrad()) {
-            leaf_accumulators_.insert(iter->second.get());
+            if (std::find(leaf_accumulators_.begin(), leaf_accumulators_.end(),
+                          iter->second.get()) == leaf_accumulators_.end()) {
+              leaf_accumulators_.push_back(iter->second.get());
+            }
 
             if (iter->second->HasInnerVar()) {
               var = iter->second->InnerVar();
diff --git a/paddle/fluid/imperative/basic_engine.h b/paddle/fluid/imperative/basic_engine.h
index 87c4ea380f3..a2ad8b5f8aa 100644
--- a/paddle/fluid/imperative/basic_engine.h
+++ b/paddle/fluid/imperative/basic_engine.h
@@ -69,7 +69,9 @@ class BasicEngine : public Engine {
   std::vector<std::pair<GradientAccumulator*, std::shared_ptr<VariableWrapper>>>
       need_accu_var_list_;
   // leaf_accumulators_ is only for leaf tensor(hooks/accumulate grad)
-  std::unordered_set<GradientAccumulator*> leaf_accumulators_;
+  // It should be ordered and not repeated, because multiple cards must ensure
+  // that the order of vars is the same.
+  std::vector<GradientAccumulator*> leaf_accumulators_;
 
   bool retain_graph_;
 };
-- 
GitLab
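
For context, the change matters because std::unordered_set gives no guarantee
about iteration order, so two training processes (cards) could visit the same
set of leaf accumulators in different orders and then pair up mismatched
gradients in ncclAllReduce. Below is a minimal standalone sketch, not Paddle
code (the Accum type and the sample names are made up), of the
order-preserving, de-duplicating pattern the patch adopts with std::vector
plus std::find.

    // order_preserving_dedup.cc -- illustrative sketch only.
    #include <algorithm>
    #include <iostream>
    #include <unordered_set>
    #include <vector>

    // Stand-in for Paddle's GradientAccumulator.
    struct Accum {
      const char* name;
    };

    int main() {
      Accum a{"fc1.w"}, b{"fc1.b"}, c{"fc2.w"};
      // Traversal order of leaf grads during backward (contains a duplicate).
      std::vector<Accum*> traversal{&a, &b, &a, &c};

      // Old behavior: unordered_set de-duplicates, but its iteration order is
      // unspecified and need not match across processes (ranks).
      std::unordered_set<Accum*> unordered(traversal.begin(), traversal.end());

      // New behavior: keep insertion order and skip duplicates, so every rank
      // reduces the gradients in the same order for ncclAllReduce.
      std::vector<Accum*> ordered;
      for (Accum* acc : traversal) {
        if (std::find(ordered.begin(), ordered.end(), acc) == ordered.end()) {
          ordered.push_back(acc);
        }
      }

      std::cout << "ordered (deterministic): ";
      for (Accum* acc : ordered) std::cout << acc->name << ' ';
      std::cout << "\nunordered_set size (iteration order unspecified): "
                << unordered.size() << '\n';
      return 0;
    }

The linear std::find scan is O(n) per insertion, but the number of leaf
accumulators per backward pass is small, so the patch trades a little lookup
cost for a deterministic, duplicate-free order across cards.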