Unverified commit 611933a0, authored by daquexian, committed by GitHub

fix multi machine test (#5984)

Signed-off-by: daquexian <daquexian566@gmail.com>
Co-authored-by: oneflow-ci-bot <69100618+oneflow-ci-bot@users.noreply.github.com>
Parent: fc60e225
@@ -104,7 +104,7 @@ def has_node_list():
 def node_size():
     node_num_from_env = os.getenv("ONEFLOW_TEST_NODE_NUM", None)
     if node_num_from_env:
-        return node_num_from_env
+        return int(node_num_from_env)
     elif has_node_list():
         node_list_from_env = node_list()
         return len(node_list_from_env)
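The first hunk fixes a type bug: `os.getenv` always returns a string, so `node_size()` could return `"2"` instead of `2`, and any numeric comparison against its result would silently fail. A minimal sketch of the failure mode (the env var name comes from the diff; the equality check is a hypothetical caller):

```python
import os

# os.getenv returns a string, never an int.
os.environ["ONEFLOW_TEST_NODE_NUM"] = "2"
node_num = os.getenv("ONEFLOW_TEST_NODE_NUM", None)

print(node_num == 2)       # False: "2" != 2, so a node-count check would never match
print(int(node_num) == 2)  # True: the conversion this commit adds
```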
@@ -34,30 +34,16 @@ class TestAllReduce(flow.unittest.TestCase):
             x = flow.Tensor(arr_rank2)
         else:
             raise ValueError
-        x = x.to(f"cuda:{flow.distributed.get_local_rank()}")
-        nccl_allreduce_op = (
-            flow.builtin_op("eager_nccl_all_reduce")
-            .Input("in")
-            .Output("out")
-            .Attr("parallel_conf", f'device_tag: "gpu", device_name: "0:0-1"')
-            .Build()
-        )
-        y = nccl_allreduce_op(x)[0]
+        x = x.to("cuda")
+        y = flow.F.all_reduce(x)
         test_case.assertTrue(np.allclose(y.numpy(), arr_rank1 + arr_rank2))
 
     @flow.unittest.skip_unless_2n2d()
     def test_all_reduce_2nodes(test_case):
         np_arr = np.array([1, 2])
-        x = flow.Tensor(np_arr * flow.distributed.get_rank())
-        x = x.to(f"cuda:{flow.distributed.get_local_rank()}")
-        nccl_allreduce_op = (
-            flow.builtin_op("eager_nccl_all_reduce")
-            .Input("in")
-            .Output("out")
-            .Attr("parallel_conf", f'device_tag: "gpu", device_name: "0-1:0-1"')
-            .Build()
-        )
-        y = nccl_allreduce_op(x)[0]
+        x = flow.Tensor(np_arr * (flow.distributed.get_rank() + 1))
+        x = x.to("cuda")
+        y = flow.F.all_reduce(x)
         test_case.assertTrue(np.allclose(y.numpy(), np_arr * 10))
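The second hunk replaces the hand-built `eager_nccl_all_reduce` builtin op with the functional `flow.F.all_reduce`, which takes its placement from the input tensor, so the explicit `parallel_conf` string and the per-rank `cuda:{local_rank}` device suffix are no longer needed. The input is also scaled by `rank + 1` instead of `rank`: on a 2-node × 2-device run the four ranks contribute multipliers 1 + 2 + 3 + 4 = 10, which is where the expected value `np_arr * 10` comes from. A minimal standalone sketch of the updated call pattern, assuming a 4-rank launch as in the test above:

```python
import numpy as np
import oneflow as flow

# Each rank contributes (rank + 1) * np_arr; ranks 0..3 give multipliers 1..4.
np_arr = np.array([1, 2])
x = flow.Tensor(np_arr * (flow.distributed.get_rank() + 1))
x = x.to("cuda")  # placement comes from the tensor; no parallel_conf needed

y = flow.F.all_reduce(x)  # sums across all ranks: (1 + 2 + 3 + 4) * np_arr
assert np.allclose(y.numpy(), np_arr * 10)
```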