Unverified commit db15ef57, authored by noabauma, committed by GitHub

deepspeed.init_distributed() support for TCP protocols (#2905)

Co-authored-by: Jeff Rasley <jerasley@microsoft.com>
Parent 0acf7e9c
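The change below adds optional rank and world_size arguments to deepspeed.init_distributed() so that init_method URLs that require both values, such as tcp://, can be used directly. A minimal usage sketch, assuming an illustrative master address, port, and NCCL backend (in practice these normally come from your launcher or environment):

```python
import deepspeed

# Illustrative values; rank and world_size are usually supplied by the launcher.
master_addr = "10.0.0.1"   # assumed address of the rank-0 node
master_port = 29500        # assumed free TCP port on that node

deepspeed.init_distributed(dist_backend="nccl",
                           init_method=f"tcp://{master_addr}:{master_port}",
                           rank=0,          # this process's rank, specified manually
                           world_size=2)    # total number of participating processes
```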
@@ -603,7 +603,9 @@ def init_distributed(dist_backend=None,
                      timeout=default_pg_timeout,
                      init_method=None,
                      dist_init_required=None,
-                     config=None):
+                     config=None,
+                     rank=-1,
+                     world_size=-1):
     ''' Initialize dist backend, potentially performing MPI discovery if needed
 
     Arguments:
@@ -614,6 +616,8 @@ def init_distributed(dist_backend=None,
         timeout: Optional (timedelta). Timeout for operations executed against the process group. Default value equals 30 minutes.
         init_method: Optional (string). Torch distributed, URL specifying how to initialize the process group. Default is “env://” if no init_method or store is specified.
         config: Optional (dict). DeepSpeed configuration for setting up comms options (e.g. Comms profiling)
+        rank: Optional (int). The current manually specified rank. Some init_method like “tcp://” need the rank and world_size as well (see: https://pytorch.org/docs/stable/distributed.html#tcp-initialization)
+        world_size: Optional (int). Desired world_size for the TCP or Shared file-system initialization.
     '''
 
     global cdb
@@ -658,7 +662,7 @@
             'Initializing TorchBackend in DeepSpeed with backend {}'.format(
                 dist_backend))
         # Create a torch backend object, initialize torch distributed, and assign to cdb
-        cdb = TorchBackend(dist_backend, timeout, init_method)
+        cdb = TorchBackend(dist_backend, timeout, init_method, rank, world_size)
 
 
 def mpi_discovery(distributed_port=TORCH_DISTRIBUTED_DEFAULT_PORT, verbose=True):
@@ -16,7 +16,13 @@ class TorchBackend(Backend):
         so no need to wrap all the functions. We can keep adding wrappers as
         needed.
     """
-    def __init__(self, backend, timeout, init_method, name='torch'):
+    def __init__(self,
+                 backend,
+                 timeout,
+                 init_method,
+                 rank=-1,
+                 world_size=-1,
+                 name='torch'):
         super(TorchBackend, self).__init__()
         self.torch_version_before_18 = older_torch()
         self.has_allgather_base = has_allgather_base()
@@ -27,13 +33,15 @@ class TorchBackend(Backend):
         # The idea is to fake that dist backend is initialized even when
         # it is not so we can run on a single GPU without doing any init_process_group
         self.single_gpu_mode = True
-        self.init_process_group(backend, timeout, init_method)
+        self.init_process_group(backend, timeout, init_method, rank, world_size)
 
-    def init_process_group(self, backend, timeout, init_method):
+    def init_process_group(self, backend, timeout, init_method, rank, world_size):
         if not torch.distributed.is_initialized():
             torch.distributed.init_process_group(backend,
                                                  timeout=timeout,
-                                                 init_method=init_method)
+                                                 init_method=init_method,
+                                                 rank=rank,
+                                                 world_size=world_size)
         self.using_mpi = torch.distributed.get_backend() == 'mpi'
 
     def all_reduce(self,
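For context, the new rank and world_size arguments are forwarded unchanged to torch.distributed.init_process_group, so the TorchBackend path above is roughly equivalent to the following plain PyTorch TCP initialization; the address, port, backend, and process counts are illustrative assumptions:

```python
from datetime import timedelta

import torch.distributed as dist

# Hypothetical stand-alone equivalent of what TorchBackend does internally;
# DeepSpeed's default process-group timeout is 30 minutes.
dist.init_process_group(backend="gloo",                      # assumed backend
                        init_method="tcp://10.0.0.1:29500",  # assumed rank-0 address and port
                        timeout=timedelta(minutes=30),
                        rank=0,
                        world_size=2)
```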