Unverified commit 611933a0, authored by daquexian, committed by GitHub

fix multi machine test (#5984)

Signed-off-by: daquexian <daquexian566@gmail.com>
Co-authored-by: oneflow-ci-bot <69100618+oneflow-ci-bot@users.noreply.github.com>
Parent: fc60e225
@@ -104,7 +104,7 @@ def has_node_list():
 def node_size():
     node_num_from_env = os.getenv("ONEFLOW_TEST_NODE_NUM", None)
     if node_num_from_env:
-        return node_num_from_env
+        return int(node_num_from_env)
     elif has_node_list():
         node_list_from_env = node_list()
         return len(node_list_from_env)
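The first hunk fixes a type bug: `os.getenv` always returns a string, so `node_size()` could return `"2"` instead of `2`, and any numeric comparison against its result would silently fail. A minimal sketch of the failure mode (the env var name comes from the diff; the equality check is a hypothetical caller):

```python
import os

# os.getenv returns a string, never an int.
os.environ["ONEFLOW_TEST_NODE_NUM"] = "2"
node_num = os.getenv("ONEFLOW_TEST_NODE_NUM", None)

print(node_num == 2)       # False: "2" != 2, so a node-count check would never match
print(int(node_num) == 2)  # True: the conversion this commit adds
```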
@@ -34,30 +34,16 @@ class TestAllReduce(flow.unittest.TestCase):
             x = flow.Tensor(arr_rank2)
         else:
             raise ValueError
-        x = x.to(f"cuda:{flow.distributed.get_local_rank()}")
-        nccl_allreduce_op = (
-            flow.builtin_op("eager_nccl_all_reduce")
-            .Input("in")
-            .Output("out")
-            .Attr("parallel_conf", f'device_tag: "gpu", device_name: "0:0-1"')
-            .Build()
-        )
-        y = nccl_allreduce_op(x)[0]
+        x = x.to("cuda")
+        y = flow.F.all_reduce(x)
         test_case.assertTrue(np.allclose(y.numpy(), arr_rank1 + arr_rank2))
 
     @flow.unittest.skip_unless_2n2d()
     def test_all_reduce_2nodes(test_case):
         np_arr = np.array([1, 2])
-        x = flow.Tensor(np_arr * flow.distributed.get_rank())
-        x = x.to(f"cuda:{flow.distributed.get_local_rank()}")
-        nccl_allreduce_op = (
-            flow.builtin_op("eager_nccl_all_reduce")
-            .Input("in")
-            .Output("out")
-            .Attr("parallel_conf", f'device_tag: "gpu", device_name: "0-1:0-1"')
-            .Build()
-        )
-        y = nccl_allreduce_op(x)[0]
+        x = flow.Tensor(np_arr * (flow.distributed.get_rank() + 1))
+        x = x.to("cuda")
+        y = flow.F.all_reduce(x)
         test_case.assertTrue(np.allclose(y.numpy(), np_arr * 10))
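The second hunk replaces the hand-built `eager_nccl_all_reduce` builtin op with the functional `flow.F.all_reduce`, which takes its placement from the input tensor, so the explicit `parallel_conf` string and the per-rank `cuda:{local_rank}` device suffix are no longer needed. The input is also scaled by `rank + 1` instead of `rank`: on a 2-node × 2-device run the four ranks contribute multipliers 1 + 2 + 3 + 4 = 10, which is where the expected value `np_arr * 10` comes from. A minimal standalone sketch of the updated call pattern, assuming a 4-rank launch as in the test above:

```python
import numpy as np
import oneflow as flow

# Each rank contributes (rank + 1) * np_arr; ranks 0..3 give multipliers 1..4.
np_arr = np.array([1, 2])
x = flow.Tensor(np_arr * (flow.distributed.get_rank() + 1))
x = x.to("cuda")  # placement comes from the tensor; no parallel_conf needed

y = flow.F.all_reduce(x)  # sums across all ranks: (1 + 2 + 3 + 4) * np_arr
assert np.allclose(y.numpy(), np_arr * 10)
```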