Unverified commit 9840fb70, authored by Weilong Wu, committed by GitHub

[Eager] Support test_dist_hapi_model under eager mode (#42702)

* [Eager] Support test_dist_hapi_model under eager mode

* [Eager] Polish code

* Fix code-format issue, coverage-ci issue
Parent cbc5ca0f
@@ -14,7 +14,9 @@
 from __future__ import print_function

 from ..layer_helper import LayerHelper, unique_name
-from ..framework import Variable
+from ..framework import Variable, in_dygraph_mode, _in_legacy_dygraph
+import paddle
+from paddle import _C_ops


 def _allreduce(x, out=None, reduce_type="sum", sync_mode=False):
@@ -107,6 +109,21 @@ def _c_broadcast(x, root=0, ring_id=0, use_calc_stream=False):

 def _c_allgather(x, nranks, ring_id=0, use_calc_stream=False):
     op_type = 'c_allgather'
+
+    if in_dygraph_mode():
+        group = paddle.distributed.collective._get_default_group()
+        tensor_shape = list(x.shape)
+        tensor_shape[0] *= nranks
+        out = paddle.empty(tensor_shape, x.dtype)
+        task = group.process_group.all_gather(x, out)
+        task.wait()
+        return out
+
+    if _in_legacy_dygraph():
+        attrs = ('nranks', nranks, 'ring_id', ring_id, 'use_calc_stream',
+                 use_calc_stream)
+        return _C_ops.c_allgather(x, *attrs)
+
     helper = LayerHelper(op_type, **locals())
     out_shape = list(x.shape[:])
     if out_shape[0] > 0:
...
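For readers following the first hunk: the new in_dygraph_mode() branch runs all_gather through the eager ProcessGroup. It pre-allocates an output tensor whose leading dimension is scaled by nranks, issues the collective on the default group, and waits on the returned task. Below is a minimal, hedged sketch of that same pattern as a standalone script (it assumes the script is started with `python -m paddle.distributed.launch` so the parallel environment and default group exist; it is an illustration, not the library code):

    import paddle
    import paddle.distributed as dist

    # Sketch of the eager-mode all_gather pattern used in the patch.
    dist.init_parallel_env()

    # Each rank contributes a [2, 3] tensor filled with its rank id.
    x = paddle.full([2, 3], float(dist.get_rank()))

    group = paddle.distributed.collective._get_default_group()
    nranks = group.nranks

    # Pre-allocate the gathered output: the first dim grows by the number of ranks.
    out = paddle.empty([2 * nranks, 3], x.dtype)
    task = group.process_group.all_gather(x, out)
    task.wait()  # block until the collective has finished

    print(out.shape)  # [2 * nranks, 3]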
@@ -29,7 +29,7 @@ import contextlib
 import paddle
 from paddle import fluid
 from paddle.fluid import core
-from paddle.fluid.framework import _non_static_mode
+from paddle.fluid.framework import _non_static_mode, in_dygraph_mode
 from paddle.fluid.framework import Variable
 from paddle.fluid.framework import _get_paddle_place
 from paddle.fluid.framework import _current_expected_place as _get_device
@@ -761,6 +761,15 @@ class DynamicGraphAdapter(object):
         labels = [to_variable(l) for l in to_list(labels)]

         outputs = self.model.network.forward(*[to_variable(x) for x in inputs])
+
+        # Transform data to expected device
+        expected_device = paddle.device.get_device()
+        for o in to_list(outputs):
+            o._to(device=expected_device)
+
+        for l in labels:
+            l._to(device=expected_device)
+
         if self.model._loss:
             losses = self.model._loss(*(to_list(outputs) + labels))
             losses = to_list(losses)
@@ -2088,7 +2097,6 @@ class Model(object):
             callbacks.on_batch_begin(mode, step, logs)

             if mode != 'predict':
                 _inputs = [data[:len(self._inputs)], data[len(self._inputs):]]
-
                 if mode == 'train':
                     _inputs.append((step + 1) % self._accumulate == 0 or
...
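The second hunk aligns the hapi network outputs and labels with the device Paddle currently reports before the loss is computed, using the private Tensor._to(device=...) helper that appears in the patch. A small, hedged sketch of the same idea outside of hapi follows; the assignment of the return value is deliberate so the sketch behaves the same whether _to copies to a new tensor or returns the original one:

    import paddle

    # Move tensors onto the device Paddle currently reports, then combine them.
    expected_device = paddle.device.get_device()  # e.g. 'gpu:0' or 'cpu'

    logits = paddle.rand([4, 10])
    labels = paddle.randint(0, 10, [4, 1])

    # Same transfer call as the patch above, but keeping the returned tensor.
    logits = logits._to(device=expected_device)
    labels = labels._to(device=expected_device)

    loss = paddle.nn.functional.cross_entropy(logits, labels)
    print(float(loss))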
@@ -58,7 +58,7 @@ def compute_accuracy(pred, gt):
 @unittest.skipIf(not fluid.is_compiled_with_cuda(),
                  'CPU testing is not supported')
 class TestDistTraning(unittest.TestCase):
-    def test_static_multiple_gpus(self):
+    def test_dynamic_multiple_gpus(self):
         device = set_device('gpu')

         im_shape = (-1, 1, 28, 28)
...
@@ -52,6 +52,7 @@ def get_gpus(selected_gpus):
 def start_local_trainers(cluster,
                          pod,
                          training_script,
+                         eager_mode,
                          training_script_args,
                          log_dir=None):
     current_env = copy.copy(os.environ.copy())
@@ -72,6 +73,9 @@ def start_local_trainers(cluster,
             "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints())
         }

+        if not eager_mode:
+            proc_env["FLAGS_enable_eager_mode"] = "%d" % 0
+
         current_env.update(proc_env)
         print("trainer proc env:{}".format(current_env))
@@ -99,7 +103,7 @@ def start_local_trainers(cluster,


 class TestMultipleGpus(unittest.TestCase):
-    def run_mnist_2gpu(self, target_file_name):
+    def run_mnist_2gpu(self, target_file_name, eager_mode=True):
         if fluid.core.get_cuda_device_count() == 0:
             return
@@ -112,6 +116,7 @@ class TestMultipleGpus(unittest.TestCase):

         procs = start_local_trainers(
             cluster,
             pod,
+            eager_mode=eager_mode,
             training_script=target_file_name,
             training_script_args=[])
@@ -125,13 +130,17 @@ class TestMultipleGpus(unittest.TestCase):

     def test_hapi_multiple_gpus_static(self):
         self.run_mnist_2gpu('dist_hapi_mnist_static.py')
+        self.run_mnist_2gpu('dist_hapi_mnist_static.py', eager_mode=False)

     def test_hapi_multiple_gpus_dynamic(self):
         self.run_mnist_2gpu('dist_hapi_mnist_dynamic.py')
+        self.run_mnist_2gpu('dist_hapi_mnist_dynamic.py', eager_mode=False)

     def test_hapi_amp_static(self):
         self.run_mnist_2gpu('dist_hapi_pure_fp16_static.py')
+        self.run_mnist_2gpu('dist_hapi_pure_fp16_static.py', eager_mode=False)


 if __name__ == "__main__":
+    os.environ["FLAGS_enable_eager_mode"] = "1"
     unittest.main()
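The launcher changes above show how the test toggles eager versus legacy dygraph per trainer process: the module defaults FLAGS_enable_eager_mode to "1", and run_mnist_2gpu(..., eager_mode=False) makes start_local_trainers write "0" into the spawned trainer's environment. A hedged, self-contained sketch of that environment plumbing follows; the launch_trainer helper is hypothetical and only illustrates the pattern, it is not the test's actual code:

    import copy
    import os
    import subprocess
    import sys

    # Hypothetical helper: the parent decides the execution mode and the child
    # trainer inherits it through its environment variables.
    def launch_trainer(script, eager_mode=True):
        env = copy.copy(os.environ.copy())
        if not eager_mode:
            # Ask the spawned trainer to fall back to the legacy dygraph path.
            env["FLAGS_enable_eager_mode"] = "0"
        return subprocess.Popen([sys.executable, script], env=env)

    if __name__ == "__main__":
        os.environ["FLAGS_enable_eager_mode"] = "1"  # eager mode on by default
        proc = launch_trainer("dist_hapi_mnist_dynamic.py", eager_mode=False)
        proc.wait()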