Merge pull request #9560 from chengduoZH/feature/fix_parallel_exe

Broadcast the gradient once it is generated

Merge pull request #9560 from chengduoZH/feature/fix_parallel_exe
Broadcast the gradient once it is generated
dd75fbde · chengduo · GitHub · a4e437d5 · 494bee51 · dd75fbde
显示空白变更内容
内联并排

Showing 1 changed file with 9 additions and 2 deletions

paddle/fluid/framework/details/multi_devices_graph_builder.cc +9 -2

未找到文件。
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -55,6 +55,7 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
    const ProgramDesc &program) const {
  auto graph = new SSAGraph();
  SSAGraph &result = *graph;
+  std::unordered_set<std::string> og_has_been_broadcast;
  result.vars_.resize(places_.size());

  bool is_forwarding = true;
@@ -122,9 +123,15 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(

    if (!is_forwarding) {
      auto var_names = op->OutputArgumentNames();
+      // Currently, we assume that once a gradient is generated it can be
+      // broadcast, and that each gradient is broadcast only once. Other cases
+      // are not considered at present, for example when the gradient needs to
+      // be adjusted according to the input after it has been generated.
      for (auto &og : var_names) {
-        if (grad_names_.count(og) != 0) {  // is param grad
+        if (grad_names_.count(og) != 0 &&
+            og_has_been_broadcast.count(og) == 0) {  // is param grad
                                                     // Insert NCCL AllReduce Op
+          og_has_been_broadcast.insert(og);
 #ifdef PADDLE_WITH_CUDA
          result.ops_.emplace_back(
              new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_));