[Dygraph] Fix bugs of EagerReducer for complex control flows (#43252)

* fix bugs of reducer * update * update

[Dygraph] Fix bugs of EagerReducer for complex control flows (#43252)
* fix bugs of reducer * update * update
2922985a · Haohongxiang · GitHub · 42dd0f1b · 2922985a · 2922985a
5 changed files
--- a/paddle/fluid/distributed/collective/reducer.cc
+++ b/paddle/fluid/distributed/collective/reducer.cc
@@ -775,6 +775,13 @@ void EagerReducer::ProcessUnusedDenseVars() {
        continue;
      }
+      // NOTE(haohongxiang): Calling SetFakeEmpty here is to make sure that
+      // gradient accumulation can continue normally after clear_gradients(),
+      // especially in cases including complex control flow.
+      std::static_pointer_cast<egr::GradNodeAccumulation>(
+          GetGradNodeFromTensor(&tensors_[var_index]))
+          ->SetFakeEmpty(false);
      Tensor grad_value(std::make_shared<phi::DenseTensor>(src_tensor));
      auto dest_var_base = tensors_[var_index];

--- a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py
+++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py
@@ -43,12 +43,11 @@ def _apply_collective_grads(parameters, comm_group):
    coalesced_grads_and_vars = build_groups(grad_vars, 128 * 1024 * 1024)
+    nranks = paddle.distributed.get_world_size(
+    ) if comm_group is None else comm_group.nranks
    for coalesced_grad, _, _ in coalesced_grads_and_vars:
        # need to div nranks
-        nranks = paddle.distributed.get_world_size(
-        ) if comm_group is None else comm_group.nranks
        div_factor = paddle.to_tensor(nranks, dtype=coalesced_grad.dtype)
-        paddle.distributed.all_reduce(coalesced_grad, group=comm_group)
        paddle.fluid.framework._dygraph_tracer().trace_op(
            type="elementwise_div",
            inputs={
@@ -57,6 +56,7 @@ def _apply_collective_grads(parameters, comm_group):
            },
            outputs={'Out': coalesced_grad},
            attrs={'axis': -1})
+        paddle.distributed.all_reduce(coalesced_grad, group=comm_group)
    _split_tensors(coalesced_grads_and_vars)
@@ -76,10 +76,11 @@ def _apply_collective_grads_eager(parameters, comm_group):
    coalesced_grads_and_vars = build_groups(grad_vars, 128 * 1024 * 1024)
-    div_factor = 1.0 / comm_group.nranks
+    nranks = paddle.distributed.get_world_size(
+    ) if comm_group is None else comm_group.nranks
    for coalesced_grad, _, _ in coalesced_grads_and_vars:
        # need to div nranks
-        coalesced_grad.scale_(div_factor)
+        coalesced_grad.scale_(1.0 / nranks)
        paddle.distributed.all_reduce(coalesced_grad, group=comm_group)
    _split_tensors(coalesced_grads_and_vars)

--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -1507,7 +1507,7 @@ if(WITH_DISTRIBUTE
                                                                     350)
  set_tests_properties(test_parallel_dygraph_no_sync PROPERTIES TIMEOUT 300)
  set_tests_properties(test_parallel_dygraph_no_sync_gradient_check
-                       PROPERTIES TIMEOUT 30)
+                       PROPERTIES TIMEOUT 60)
  set_tests_properties(test_parallel_dygraph_pipeline_parallel
                       PROPERTIES TIMEOUT 500)
  set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT

--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py
@@ -200,7 +200,8 @@ class TestMultipleWithGloo(unittest.TestCase):
 class TestDataParallelGradientCheck(TestMultipleGpus):
    def test_multiple_gpus_dynamic(self):
-        self.run_mnist_2gpu('parallel_dygraph_gradient_check.py')
+        self.run_mnist_2gpu('parallel_dygraph_gradient_check.py',
+                            eager_mode=False)
 class TestDataParallelWithPyLayer(TestMultipleGpus):
@@ -218,4 +219,5 @@ class TestGradientCheckInEagerMode(TestMultipleGpus):
 if __name__ == "__main__":
+    os.environ["FLAGS_enable_eager_mode"] = "1"
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync_gradient_check.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync_gradient_check.py
@@ -14,6 +14,7 @@
 from __future__ import print_function
+import os
 import unittest
 import paddle.fluid as fluid
@@ -24,7 +25,10 @@ class TestDataParallelLayer(TestMultipleGpus):
    def test_parallel_dygraph_dataparallel_no_sync(self):
        self.run_mnist_2gpu('parallel_dygraph_no_sync_gradient_check.py')
+        self.run_mnist_2gpu('parallel_dygraph_no_sync_gradient_check.py',
+                            eager_mode=False)
 if __name__ == "__main__":
+    os.environ["FLAGS_enable_eager_mode"] = "1"
    unittest.main()