Unverified commit 9840fb70, authored by Weilong Wu, committed by GitHub

[Eager] Support test_dist_hapi_model under eager mode (#42702)

* [Eager] Support test_dist_hapi_model under eager mode

* [Eager] Polish code

* Fix code-format issue, coverage-ci issue
Parent cbc5ca0f
@@ -14,7 +14,9 @@
 from __future__ import print_function

 from ..layer_helper import LayerHelper, unique_name
-from ..framework import Variable
+from ..framework import Variable, in_dygraph_mode, _in_legacy_dygraph
+import paddle
+from paddle import _C_ops


 def _allreduce(x, out=None, reduce_type="sum", sync_mode=False):
@@ -107,6 +109,21 @@ def _c_broadcast(x, root=0, ring_id=0, use_calc_stream=False):

 def _c_allgather(x, nranks, ring_id=0, use_calc_stream=False):
     op_type = 'c_allgather'
+
+    if in_dygraph_mode():
+        group = paddle.distributed.collective._get_default_group()
+        tensor_shape = list(x.shape)
+        tensor_shape[0] *= nranks
+        out = paddle.empty(tensor_shape, x.dtype)
+        task = group.process_group.all_gather(x, out)
+        task.wait()
+        return out
+
+    if _in_legacy_dygraph():
+        attrs = ('nranks', nranks, 'ring_id', ring_id, 'use_calc_stream',
+                 use_calc_stream)
+        return _C_ops.c_allgather(x, *attrs)
+
     helper = LayerHelper(op_type, **locals())
     out_shape = list(x.shape[:])
     if out_shape[0] > 0:
...
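For readers following the first hunk: the new in_dygraph_mode() branch runs all_gather through the eager ProcessGroup. It pre-allocates an output tensor whose leading dimension is scaled by nranks, issues the collective on the default group, and waits on the returned task. Below is a minimal, hedged sketch of that same pattern as a standalone script (it assumes the script is started with `python -m paddle.distributed.launch` so the parallel environment and default group exist; it is an illustration, not the library code):

    import paddle
    import paddle.distributed as dist

    # Sketch of the eager-mode all_gather pattern used in the patch.
    dist.init_parallel_env()

    # Each rank contributes a [2, 3] tensor filled with its rank id.
    x = paddle.full([2, 3], float(dist.get_rank()))

    group = paddle.distributed.collective._get_default_group()
    nranks = group.nranks

    # Pre-allocate the gathered output: the first dim grows by the number of ranks.
    out = paddle.empty([2 * nranks, 3], x.dtype)
    task = group.process_group.all_gather(x, out)
    task.wait()  # block until the collective has finished

    print(out.shape)  # [2 * nranks, 3]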
@@ -29,7 +29,7 @@ import contextlib
 import paddle
 from paddle import fluid
 from paddle.fluid import core
-from paddle.fluid.framework import _non_static_mode
+from paddle.fluid.framework import _non_static_mode, in_dygraph_mode
 from paddle.fluid.framework import Variable
 from paddle.fluid.framework import _get_paddle_place
 from paddle.fluid.framework import _current_expected_place as _get_device
@@ -761,6 +761,15 @@ class DynamicGraphAdapter(object):
         labels = [to_variable(l) for l in to_list(labels)]

         outputs = self.model.network.forward(*[to_variable(x) for x in inputs])
+
+        # Transform data to expected device
+        expected_device = paddle.device.get_device()
+        for o in to_list(outputs):
+            o._to(device=expected_device)
+
+        for l in labels:
+            l._to(device=expected_device)
+
         if self.model._loss:
             losses = self.model._loss(*(to_list(outputs) + labels))
             losses = to_list(losses)
@@ -2088,7 +2097,6 @@ class Model(object):
             callbacks.on_batch_begin(mode, step, logs)

             if mode != 'predict':
                 _inputs = [data[:len(self._inputs)], data[len(self._inputs):]]
-
                 if mode == 'train':
                     _inputs.append((step + 1) % self._accumulate == 0 or
...
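The second hunk aligns the hapi network outputs and labels with the device Paddle currently reports before the loss is computed, using the private Tensor._to(device=...) helper that appears in the patch. A small, hedged sketch of the same idea outside of hapi follows; the assignment of the return value is deliberate so the sketch behaves the same whether _to copies to a new tensor or returns the original one:

    import paddle

    # Move tensors onto the device Paddle currently reports, then combine them.
    expected_device = paddle.device.get_device()  # e.g. 'gpu:0' or 'cpu'

    logits = paddle.rand([4, 10])
    labels = paddle.randint(0, 10, [4, 1])

    # Same transfer call as the patch above, but keeping the returned tensor.
    logits = logits._to(device=expected_device)
    labels = labels._to(device=expected_device)

    loss = paddle.nn.functional.cross_entropy(logits, labels)
    print(float(loss))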
@@ -58,7 +58,7 @@ def compute_accuracy(pred, gt):
 @unittest.skipIf(not fluid.is_compiled_with_cuda(),
                  'CPU testing is not supported')
 class TestDistTraning(unittest.TestCase):
-    def test_static_multiple_gpus(self):
+    def test_dynamic_multiple_gpus(self):
         device = set_device('gpu')

         im_shape = (-1, 1, 28, 28)
...
@@ -52,6 +52,7 @@ def get_gpus(selected_gpus):
 def start_local_trainers(cluster,
                          pod,
                          training_script,
+                         eager_mode,
                          training_script_args,
                          log_dir=None):
     current_env = copy.copy(os.environ.copy())
@@ -72,6 +73,9 @@ def start_local_trainers(cluster,
             "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints())
         }

+        if not eager_mode:
+            proc_env["FLAGS_enable_eager_mode"] = "%d" % 0
+
         current_env.update(proc_env)
         print("trainer proc env:{}".format(current_env))
@@ -99,7 +103,7 @@ def start_local_trainers(cluster,


 class TestMultipleGpus(unittest.TestCase):
-    def run_mnist_2gpu(self, target_file_name):
+    def run_mnist_2gpu(self, target_file_name, eager_mode=True):
         if fluid.core.get_cuda_device_count() == 0:
             return
@@ -112,6 +116,7 @@ class TestMultipleGpus(unittest.TestCase):

         procs = start_local_trainers(
             cluster,
             pod,
+            eager_mode=eager_mode,
             training_script=target_file_name,
             training_script_args=[])
@@ -125,13 +130,17 @@ class TestMultipleGpus(unittest.TestCase):

     def test_hapi_multiple_gpus_static(self):
         self.run_mnist_2gpu('dist_hapi_mnist_static.py')
+        self.run_mnist_2gpu('dist_hapi_mnist_static.py', eager_mode=False)

     def test_hapi_multiple_gpus_dynamic(self):
         self.run_mnist_2gpu('dist_hapi_mnist_dynamic.py')
+        self.run_mnist_2gpu('dist_hapi_mnist_dynamic.py', eager_mode=False)

     def test_hapi_amp_static(self):
         self.run_mnist_2gpu('dist_hapi_pure_fp16_static.py')
+        self.run_mnist_2gpu('dist_hapi_pure_fp16_static.py', eager_mode=False)


 if __name__ == "__main__":
+    os.environ["FLAGS_enable_eager_mode"] = "1"
     unittest.main()
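The launcher changes above show how the test toggles eager versus legacy dygraph per trainer process: the module defaults FLAGS_enable_eager_mode to "1", and run_mnist_2gpu(..., eager_mode=False) makes start_local_trainers write "0" into the spawned trainer's environment. A hedged, self-contained sketch of that environment plumbing follows; the launch_trainer helper is hypothetical and only illustrates the pattern, it is not the test's actual code:

    import copy
    import os
    import subprocess
    import sys

    # Hypothetical helper: the parent decides the execution mode and the child
    # trainer inherits it through its environment variables.
    def launch_trainer(script, eager_mode=True):
        env = copy.copy(os.environ.copy())
        if not eager_mode:
            # Ask the spawned trainer to fall back to the legacy dygraph path.
            env["FLAGS_enable_eager_mode"] = "0"
        return subprocess.Popen([sys.executable, script], env=env)

    if __name__ == "__main__":
        os.environ["FLAGS_enable_eager_mode"] = "1"  # eager mode on by default
        proc = launch_trainer("dist_hapi_mnist_dynamic.py", eager_mode=False)
        proc.wait()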