Unverified commit 52c5a936 authored by Jeff Rasley, committed by GitHub

add allreduce test (#7)

* add allreduce test

* comment out set rank to cuda for now

* switched back to gloo
Parent b61a2217
@@ -11,7 +11,7 @@ import pytest
 DEEPSPEED_UNIT_WORKER_TIMEOUT = 5
-def distributed_test(world_size=2):
+def distributed_test(world_size=2, backend='gloo'):
     """A decorator for executing a function (e.g., a unit test) in a distributed manner.
     This decorator manages the spawning and joining of processes, initialization of
     torch.distributed, and catching of errors.
@@ -33,14 +33,14 @@ def distributed_test(world_size=2):
         """Initialize torch.distributed and execute the user function. """
         os.environ['MASTER_ADDR'] = '127.0.0.1'
         os.environ['MASTER_PORT'] = '29500'
-        dist.init_process_group(backend='nccl',
+        dist.init_process_group(backend=backend,
                                 init_method='env://',
                                 rank=local_rank,
                                 world_size=num_procs)
         # XXX temporarily disabled due to CUDA runtime error?
         #if torch.cuda.is_available():
         #    torch.cuda.set_device(local_rank)
         run_func(*func_args, **func_kwargs)
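
For context, here is a minimal sketch of how a decorator like distributed_test can spawn one process per rank and initialize torch.distributed with a configurable backend, in the spirit of the hunk above. It is an illustration only, not the repository's actual helper: the names _dist_worker and distributed_test_sketch are made up, and the argument forwarding (func_args/func_kwargs), error handling, and worker timeout of the real decorator are omitted.

import os
from functools import wraps

import torch.distributed as dist
import torch.multiprocessing as mp


def _dist_worker(local_rank, world_size, backend, run_func):
    # Each spawned process initializes its own torch.distributed rank.
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29500'
    dist.init_process_group(backend=backend,
                            init_method='env://',
                            rank=local_rank,
                            world_size=world_size)
    try:
        run_func()
    finally:
        dist.destroy_process_group()


def distributed_test_sketch(world_size=2, backend='gloo'):
    # Decorator factory: the wrapped function runs once per rank across `world_size` processes.
    def decorator(run_func):
        @wraps(run_func)
        def wrapper():
            # mp.spawn pickles its arguments, so run_func should be a module-level function.
            mp.spawn(_dist_worker,
                     args=(world_size, backend, run_func),
                     nprocs=world_size,
                     join=True)
        return wrapper
    return decorator
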
......
import torch
import torch.distributed as dist
from common import distributed_test
@@ -26,3 +27,11 @@ def test_dist_args(number, color):
     """Ensure that we can parse args to distributed_test decorated functions. """
     _test_dist_args_helper(number, color=color)
 
+
+@distributed_test(world_size=2)
+def test_dist_allreduce():
+    x = torch.ones(1, 3) * (dist.get_rank() + 1)
+    result = torch.ones(1, 3) * 3
+
+    dist.all_reduce(x)
+    assert torch.all(x == result)
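
The arithmetic behind the new test: with world_size=2, rank 0 contributes ones * 1 and rank 1 contributes ones * 2, and dist.all_reduce defaults to a SUM, so every rank ends up holding ones * 3. Below is a standalone sketch of the same check, assuming the gloo backend, two processes, and a free port 29501; the helper name _allreduce_demo is illustrative and not part of the repository.

import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def _allreduce_demo(rank, world_size):
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29501'
    dist.init_process_group(backend='gloo', rank=rank, world_size=world_size)

    x = torch.ones(1, 3) * (rank + 1)   # rank 0 -> ones, rank 1 -> 2 * ones
    dist.all_reduce(x)                  # default op is SUM, so x == 3 * ones on every rank
    assert torch.all(x == torch.ones(1, 3) * 3)

    dist.destroy_process_group()


if __name__ == '__main__':
    mp.spawn(_allreduce_demo, args=(2,), nprocs=2, join=True)
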