From 4ace67ff4436da27d0f0a96f909322056b441d35 Mon Sep 17 00:00:00 2001
From: Megvii Engine Team
Date: Wed, 24 Jun 2020 14:13:06 +0800
Subject: [PATCH] test(mge/distributed): check gpu num for multi gpu test

GitOrigin-RevId: 78f471868219795eb9325a0632d454cf53bd6226
---
 .../test/unit/distributed/test_functional.py | 22 +++++++++----------
 .../test/unit/module/test_batchnorm.py       | 15 +++++++------
 2 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/python_module/test/unit/distributed/test_functional.py b/python_module/test/unit/distributed/test_functional.py
index a70484ba6..f34cb5e62 100644
--- a/python_module/test/unit/distributed/test_functional.py
+++ b/python_module/test/unit/distributed/test_functional.py
@@ -30,7 +30,7 @@ def test_reduce_sum():
     world_size = 2
 
     def worker(rank, data, backend, expect, port_queue):
-        if not mge.is_cuda_available():
+        if mge.get_device_count("gpu") < world_size:
             return
         _init_process_group_wrapper(world_size, rank, rank, backend, port_queue)
         inp = tensor(data)
@@ -66,7 +66,7 @@ def test_gather():
     world_size = 2
 
     def worker(rank, data, backend, expect, port_queue):
-        if not mge.is_cuda_available():
+        if mge.get_device_count("gpu") < world_size:
             return
         _init_process_group_wrapper(world_size, rank, rank, backend, port_queue)
         inp = tensor(data)
@@ -102,7 +102,7 @@ def test_broadcast():
     world_size = 2
 
     def worker(rank, data, backend, expect, port_queue):
-        if not mge.is_cuda_available():
+        if mge.get_device_count("gpu") < world_size:
             return
         _init_process_group_wrapper(world_size, rank, rank, backend, port_queue)
         inp = tensor(data)
@@ -134,7 +134,7 @@ def test_scatter():
     world_size = 2
 
     def worker(rank, data, backend, expect, port_queue):
-        if not mge.is_cuda_available():
+        if mge.get_device_count("gpu") < world_size:
             return
         _init_process_group_wrapper(world_size, rank, rank, backend, port_queue)
         inp = tensor(data)
@@ -170,7 +170,7 @@ def test_all_to_all():
     world_size = 2
 
     def worker(rank, data, backend, expect, port_queue):
-        if not mge.is_cuda_available():
+        if mge.get_device_count("gpu") < world_size:
             return
         _init_process_group_wrapper(world_size, rank, rank, backend, port_queue)
         inp = tensor(data)
@@ -204,7 +204,7 @@ def test_all_gather():
     world_size = 2
 
    def worker(rank, data, backend, expect, port_queue):
-        if not mge.is_cuda_available():
+        if mge.get_device_count("gpu") < world_size:
             return
         _init_process_group_wrapper(world_size, rank, rank, backend, port_queue)
         inp = tensor(data)
@@ -237,7 +237,7 @@ def test_reduce_scatter_sum():
     world_size = 2
 
     def worker(rank, data, backend, expect, port_queue):
-        if not mge.is_cuda_available():
+        if mge.get_device_count("gpu") < world_size:
             return
         _init_process_group_wrapper(world_size, rank, rank, backend, port_queue)
         inp = tensor(data)
@@ -274,7 +274,7 @@ def test_all_reduce_sum():
     world_size = 2
 
     def worker(rank, data, backend, expect, port_queue):
-        if not mge.is_cuda_available():
+        if mge.get_device_count("gpu") < world_size:
             return
         _init_process_group_wrapper(world_size, rank, rank, backend, port_queue)
         inp = tensor(data)
@@ -307,7 +307,7 @@ def test_all_reduce_max():
     world_size = 2
 
     def worker(rank, data, backend, expect, port_queue):
-        if not mge.is_cuda_available():
+        if mge.get_device_count("gpu") < world_size:
             return
         _init_process_group_wrapper(world_size, rank, rank, backend, port_queue)
         inp = tensor(data)
@@ -340,7 +340,7 @@ def test_all_reduce_min():
     world_size = 2
 
     def worker(rank, data, backend, expect, port_queue):
-        if not mge.is_cuda_available():
+        if mge.get_device_count("gpu") < world_size:
             return
         _init_process_group_wrapper(world_size, rank, rank, backend, port_queue)
         inp = tensor(data)
@@ -373,7 +373,7 @@ def test_bcast_param():
     world_size = 2
 
     def worker(rank, data, backend, expect, port_queue):
-        if not mge.is_cuda_available():
+        if mge.get_device_count("gpu") < world_size:
             return
         _init_process_group_wrapper(world_size, rank, rank, backend, port_queue)
         inp = Parameter(data)
diff --git a/python_module/test/unit/module/test_batchnorm.py b/python_module/test/unit/module/test_batchnorm.py
index d23a10f16..41b0fed7e 100644
--- a/python_module/test/unit/module/test_batchnorm.py
+++ b/python_module/test/unit/module/test_batchnorm.py
@@ -27,11 +27,12 @@ def test_syncbn():
     running_mean = np.zeros((1, nr_chan, 1, 1), dtype=np.float32)
     running_var = np.ones((1, nr_chan, 1, 1), dtype=np.float32)
     steps = 4
+    nr_ranks = 2
 
     def worker(rank, data, yv_expect, running_mean, running_var):
-        if not mge.is_cuda_available():
+        if mge.get_device_count("gpu") < nr_ranks:
             return
-        dist.init_process_group("localhost", 2333, 4, rank, rank)
+        dist.init_process_group("localhost", 2333, nr_ranks, rank, rank)
         bn = SyncBatchNorm(nr_chan, momentum=momentum, eps=eps)
         data_tensor = tensor()
         for i in range(steps):
@@ -61,19 +62,19 @@ def test_syncbn():
         yv_expect = (xv[i] - mean) / sd
 
     data = []
-    for i in range(4):
+    for i in range(nr_ranks):
         data.append([])
         for j in range(steps):
-            data[i].append(xv[j][:, :, :, i * 4 : i * 4 + 4])
+            data[i].append(xv[j][:, :, :, i * 8 : i * 8 + 8])
 
     procs = []
-    for rank in range(4):
+    for rank in range(nr_ranks):
         p = mp.Process(
             target=worker,
             args=(
                 rank,
                 data[rank],
-                yv_expect[:, :, :, rank * 4 : rank * 4 + 4],
+                yv_expect[:, :, :, rank * 8 : rank * 8 + 8],
                 running_mean,
                 running_var,
             ),
@@ -82,7 +83,7 @@ def test_syncbn():
         procs.append(p)
 
     for p in procs:
-        p.join()
+        p.join(10)
         assert p.exitcode == 0
--
GitLab