Commit 4ace67ff authored by Megvii Engine Team

test(mge/distributed): check gpu num for multi gpu test

GitOrigin-RevId: 78f471868219795eb9325a0632d454cf53bd6226
Parent 9d5c5c07
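
The change swaps each worker's skip condition from mge.is_cuda_available() to a device-count check: a machine can have CUDA available yet expose fewer GPUs than the test's world_size, in which case the multi-GPU test should skip rather than run with too few devices. A minimal sketch of the pattern, assuming MegEngine is installed (the worker body here is a placeholder, not the real collective test):

import multiprocessing as mp

import megengine as mge


def worker(rank, world_size):
    # Skip when the machine cannot host one process per GPU; checking only
    # mge.is_cuda_available() would let the test start with too few devices.
    if mge.get_device_count("gpu") < world_size:
        return
    # ... initialize the process group and run the collective op under test ...


if __name__ == "__main__":
    world_size = 2
    procs = [mp.Process(target=worker, args=(rank, world_size)) for rank in range(world_size)]
    for p in procs:
        p.start()
    for p in procs:
        p.join(10)  # bounded join, as in the syncbn hunk below
        assert p.exitcode == 0
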
@@ -30,7 +30,7 @@ def test_reduce_sum():
     world_size = 2
 
     def worker(rank, data, backend, expect, port_queue):
-        if not mge.is_cuda_available():
+        if mge.get_device_count("gpu") < world_size:
             return
         _init_process_group_wrapper(world_size, rank, rank, backend, port_queue)
         inp = tensor(data)
@@ -66,7 +66,7 @@ def test_gather():
     world_size = 2
 
     def worker(rank, data, backend, expect, port_queue):
-        if not mge.is_cuda_available():
+        if mge.get_device_count("gpu") < world_size:
             return
         _init_process_group_wrapper(world_size, rank, rank, backend, port_queue)
         inp = tensor(data)
@@ -102,7 +102,7 @@ def test_broadcast():
     world_size = 2
 
     def worker(rank, data, backend, expect, port_queue):
-        if not mge.is_cuda_available():
+        if mge.get_device_count("gpu") < world_size:
             return
         _init_process_group_wrapper(world_size, rank, rank, backend, port_queue)
         inp = tensor(data)
@@ -134,7 +134,7 @@ def test_scatter():
     world_size = 2
 
     def worker(rank, data, backend, expect, port_queue):
-        if not mge.is_cuda_available():
+        if mge.get_device_count("gpu") < world_size:
             return
         _init_process_group_wrapper(world_size, rank, rank, backend, port_queue)
         inp = tensor(data)
@@ -170,7 +170,7 @@ def test_all_to_all():
     world_size = 2
 
     def worker(rank, data, backend, expect, port_queue):
-        if not mge.is_cuda_available():
+        if mge.get_device_count("gpu") < world_size:
             return
         _init_process_group_wrapper(world_size, rank, rank, backend, port_queue)
         inp = tensor(data)
@@ -204,7 +204,7 @@ def test_all_gather():
     world_size = 2
 
     def worker(rank, data, backend, expect, port_queue):
-        if not mge.is_cuda_available():
+        if mge.get_device_count("gpu") < world_size:
             return
         _init_process_group_wrapper(world_size, rank, rank, backend, port_queue)
         inp = tensor(data)
@@ -237,7 +237,7 @@ def test_reduce_scatter_sum():
     world_size = 2
 
     def worker(rank, data, backend, expect, port_queue):
-        if not mge.is_cuda_available():
+        if mge.get_device_count("gpu") < world_size:
             return
         _init_process_group_wrapper(world_size, rank, rank, backend, port_queue)
         inp = tensor(data)
@@ -274,7 +274,7 @@ def test_all_reduce_sum():
     world_size = 2
 
     def worker(rank, data, backend, expect, port_queue):
-        if not mge.is_cuda_available():
+        if mge.get_device_count("gpu") < world_size:
             return
         _init_process_group_wrapper(world_size, rank, rank, backend, port_queue)
         inp = tensor(data)
@@ -307,7 +307,7 @@ def test_all_reduce_max():
     world_size = 2
 
    def worker(rank, data, backend, expect, port_queue):
-        if not mge.is_cuda_available():
+        if mge.get_device_count("gpu") < world_size:
             return
         _init_process_group_wrapper(world_size, rank, rank, backend, port_queue)
         inp = tensor(data)
@@ -340,7 +340,7 @@ def test_all_reduce_min():
     world_size = 2
 
     def worker(rank, data, backend, expect, port_queue):
-        if not mge.is_cuda_available():
+        if mge.get_device_count("gpu") < world_size:
             return
         _init_process_group_wrapper(world_size, rank, rank, backend, port_queue)
         inp = tensor(data)
@@ -373,7 +373,7 @@ def test_bcast_param():
     world_size = 2
 
     def worker(rank, data, backend, expect, port_queue):
-        if not mge.is_cuda_available():
+        if mge.get_device_count("gpu") < world_size:
             return
         _init_process_group_wrapper(world_size, rank, rank, backend, port_queue)
         inp = Parameter(data)
@@ -27,11 +27,12 @@ def test_syncbn():
     running_mean = np.zeros((1, nr_chan, 1, 1), dtype=np.float32)
     running_var = np.ones((1, nr_chan, 1, 1), dtype=np.float32)
     steps = 4
+    nr_ranks = 2
 
     def worker(rank, data, yv_expect, running_mean, running_var):
-        if not mge.is_cuda_available():
+        if mge.get_device_count("gpu") < nr_ranks:
             return
-        dist.init_process_group("localhost", 2333, 4, rank, rank)
+        dist.init_process_group("localhost", 2333, nr_ranks, rank, rank)
         bn = SyncBatchNorm(nr_chan, momentum=momentum, eps=eps)
         data_tensor = tensor()
         for i in range(steps):
@@ -61,19 +62,19 @@ def test_syncbn():
         yv_expect = (xv[i] - mean) / sd
 
     data = []
-    for i in range(4):
+    for i in range(nr_ranks):
         data.append([])
         for j in range(steps):
-            data[i].append(xv[j][:, :, :, i * 4 : i * 4 + 4])
+            data[i].append(xv[j][:, :, :, i * 8 : i * 8 + 8])
 
     procs = []
-    for rank in range(4):
+    for rank in range(nr_ranks):
         p = mp.Process(
             target=worker,
             args=(
                 rank,
                 data[rank],
-                yv_expect[:, :, :, rank * 4 : rank * 4 + 4],
+                yv_expect[:, :, :, rank * 8 : rank * 8 + 8],
                 running_mean,
                 running_var,
             ),
@@ -82,7 +83,7 @@ def test_syncbn():
         procs.append(p)
 
     for p in procs:
-        p.join()
+        p.join(10)
         assert p.exitcode == 0