From 7e8132832ff8b3e322b6a345c65e2ea2795169dd Mon Sep 17 00:00:00 2001
From: Jeff Rasley
Date: Thu, 27 Feb 2020 07:22:57 -0800
Subject: [PATCH] MPI 3.x support via mpi4py (#107)

* add mpirun support for openmpi 4.0
* add master addr support from args
* switch mpi detection to use mpi4py
* set constant for default distributed port
* Make sure deepspeed_mpi exits in args
---
 deepspeed/__init__.py               |  9 +++++++
 deepspeed/pt/deepspeed_constants.py |  5 ++++
 deepspeed/pt/deepspeed_launch.py    | 19 ++++++---------
 deepspeed/pt/deepspeed_light.py     | 38 ++++++++++++++++++++++++++++-
 deepspeed/pt/deepspeed_run.py       |  3 ++-
 5 files changed, 61 insertions(+), 13 deletions(-)
 mode change 100644 => 100755 deepspeed/pt/deepspeed_launch.py
 mode change 100644 => 100755 deepspeed/pt/deepspeed_light.py

diff --git a/deepspeed/__init__.py b/deepspeed/__init__.py
index b7f9072e..7166e0f8 100755
--- a/deepspeed/__init__.py
+++ b/deepspeed/__init__.py
@@ -143,6 +143,15 @@ def _add_core_arguments(parser):
                        default=None,
                        type=str,
                        help='Deprecated DeepSpeed json configuration file.')
+
+    group.add_argument(
+        '--deepspeed_mpi',
+        default=False,
+        action='store_true',
+        help=
+        "Run via MPI, this will attempt to discover the necessary variables to initialize torch "
+        "distributed from the MPI environment")
+
     return parser
 
 
diff --git a/deepspeed/pt/deepspeed_constants.py b/deepspeed/pt/deepspeed_constants.py
index dbf87c45..df8d0908 100644
--- a/deepspeed/pt/deepspeed_constants.py
+++ b/deepspeed/pt/deepspeed_constants.py
@@ -31,6 +31,11 @@ SCHEDULER_TYPE_DEFAULT = None
 SCHEDULER_PARAMS = "params"
 MAX_GRAD_NORM = 'max_grad_norm'
 
+#############################################
+# Torch distributed constants
+#############################################
+TORCH_DISTRIBUTED_DEFAULT_PORT = "29500"
+
 # Steps
 STEPS_PER_PRINT = "steps_per_print"
 STEPS_PER_PRINT_DEFAULT = 10

diff --git a/deepspeed/pt/deepspeed_launch.py b/deepspeed/pt/deepspeed_launch.py
old mode 100644
new mode 100755
index 575415fb..0984996b
--- a/deepspeed/pt/deepspeed_launch.py
+++ b/deepspeed/pt/deepspeed_launch.py
@@ -5,19 +5,16 @@ Copyright 2020 The Microsoft DeepSpeed Team: deepspeed@microsoft.com
 import sys
 import subprocess
 import os
-import socket
 import json
 import base64
 from collections import defaultdict
 from argparse import ArgumentParser, REMAINDER
 
-import torch
-
 
 def parse_args():
-    parser = ArgumentParser(description="DeepSpeed distributed training launch "
-                            "utilty that creates multiple distributed "
-                            "processes on a single node")
+    parser = ArgumentParser(description="DeepSpeed distributed training launch"
+                            " utility that creates multiple distributed"
+                            " processes on a single node")
 
     # Optional arguments for the launch helper
     parser.add_argument("--node_rank",
@@ -28,15 +25,15 @@
     parser.add_argument("--master_addr",
                         default="127.0.0.1",
                         type=str,
-                        help="Master node (rank 0)'s address, should be either "
-                        "the IP address or the hostname of node 0, for "
-                        "single node multi-proc training, the "
-                        "--master_addr can simply be 127.0.0.1")
+                        help="Master node (rank 0)'s address, should be either"
+                        " the IP address or the hostname of node 0, for"
+                        " single node multi-proc training, the"
+                        " --master_addr can simply be 127.0.0.1")
     parser.add_argument("--master_port",
                         default=29500,
                         type=int,
                         help="Master node (rank 0)'s free port that needs to "
-                        "be used for communciation during distributed "
+                        "be used for communication during distributed "
                         "training")
     parser.add_argument("--world_info",
                         default="None",
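For orientation, the new flag flows through the standard DeepSpeed argument group, so a training script only needs the existing deepspeed.add_config_arguments() and deepspeed.initialize() entry points to pick it up. A minimal sketch (the script name train.py, the toy model, and ds_config.json are hypothetical stand-ins):

    # train.py -- hypothetical user script that consumes --deepspeed_mpi
    import argparse
    import torch
    import deepspeed

    parser = argparse.ArgumentParser(description='MPI launch example')
    parser.add_argument('--local_rank', type=int, default=-1,
                        help='filled in by DeepSpeed when --deepspeed_mpi is used')
    parser = deepspeed.add_config_arguments(parser)  # adds --deepspeed_mpi, --deepspeed_config, ...
    args = parser.parse_args()

    model = torch.nn.Linear(10, 10)
    engine, optimizer, _, _ = deepspeed.initialize(args=args,
                                                   model=model,
                                                   model_parameters=model.parameters())

Such a script would then be started under MPI with something like "mpirun -n 2 python train.py --deepspeed_mpi --deepspeed_config ds_config.json"; the exact mpirun options depend on the MPI installation.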
diff --git a/deepspeed/pt/deepspeed_light.py b/deepspeed/pt/deepspeed_light.py
old mode 100644
new mode 100755
index 4d210cd9..0207489b
--- a/deepspeed/pt/deepspeed_light.py
+++ b/deepspeed/pt/deepspeed_light.py
@@ -21,7 +21,7 @@ from deepspeed.pt.deepspeed_config import DeepSpeedConfig, \
 from deepspeed.pt.deepspeed_dataloader import DeepSpeedDataLoader
 
 from deepspeed.pt.deepspeed_constants import ROUTE_TRAIN, ROUTE_PREDICT, \
-    ROUTE_EVAL
+    ROUTE_EVAL, TORCH_DISTRIBUTED_DEFAULT_PORT
 
 import deepspeed.pt.deepspeed_lr_schedules as lr_schedules
 from deepspeed.pt.deepspeed_csr_tensor import CSRTensor
 
@@ -119,6 +119,8 @@ class DeepSpeedLight(Module):
         self.gradient_average = True
         self.warn_unscaled_loss = True
 
+        self._mpi_check(args)
+
         if dist_init_required is None:
             dist_init_required = not dist.is_initialized()
 
@@ -184,6 +186,40 @@ class DeepSpeedLight(Module):
         if self.dump_state():
             print_configuration(self, 'DeepSpeedLight')
 
+    def _mpi_check(self, args):
+        if hasattr(args, 'deepspeed_mpi') and args.deepspeed_mpi:
+            from mpi4py import MPI
+            import subprocess
+            comm = MPI.COMM_WORLD
+            rank = comm.Get_rank()
+            world_size = comm.Get_size()
+
+            master_addr = None
+            if rank == 0:
+                hostname_cmd = ["hostname -I"]
+                result = subprocess.check_output(hostname_cmd, shell=True)
+                master_addr = result.decode('utf-8').split()[0]
+            master_addr = comm.bcast(master_addr, root=0)
+
+            # Determine local rank by assuming hostnames are unique
+            proc_name = MPI.Get_processor_name()
+            all_procs = comm.allgather(proc_name)
+            local_rank = sum([i == proc_name for i in all_procs[:rank]])
+
+            os.environ['RANK'] = str(rank)
+            os.environ['WORLD_SIZE'] = str(world_size)
+            args.local_rank = local_rank
+            os.environ['MASTER_ADDR'] = master_addr
+            os.environ['MASTER_PORT'] = TORCH_DISTRIBUTED_DEFAULT_PORT
+
+            logging.info(
+                "Discovered MPI settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}"
+                .format(os.environ['RANK'],
+                        args.local_rank,
+                        os.environ['WORLD_SIZE'],
+                        os.environ['MASTER_ADDR'],
+                        os.environ['MASTER_PORT']))
+
     def tensorboard_enabled(self):
         return self._config.tensorboard_enabled
 
diff --git a/deepspeed/pt/deepspeed_run.py b/deepspeed/pt/deepspeed_run.py
index 0d935a89..39074ba2 100644
--- a/deepspeed/pt/deepspeed_run.py
+++ b/deepspeed/pt/deepspeed_run.py
@@ -13,6 +13,7 @@ import argparse
 import subprocess
 import collections
 from copy import deepcopy
+from deepspeed.pt.deepspeed_constants import TORCH_DISTRIBUTED_DEFAULT_PORT
 
 DLTS_HOSTFILE = "/job/hostfile"
 EXPORT_ENVS = ["NCCL", "PYTHONPATH"]
@@ -61,7 +62,7 @@ def parse_args(args=None):
     parser.add_argument("--num_gpus", type=int, default=-1, help="")
 
     parser.add_argument("--master_port",
-                        default=29500,
+                        default=int(TORCH_DISTRIBUTED_DEFAULT_PORT),
                         type=int,
                         help="(optional) Port used by PyTorch distributed for "
                         "communication during training.")
-- 
GitLab
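The environment variables that _mpi_check() exports (RANK, WORLD_SIZE, MASTER_ADDR, MASTER_PORT) are the ones torch.distributed reads when the default env:// rendezvous is used, which is what lets the MPI-launched processes form a process group. A standalone, single-process illustration of that hand-off (gloo backend chosen only so it runs without GPUs; the values are stand-ins for what mpi4py would report):

    import os
    import torch.distributed as dist

    # Values that _mpi_check() would discover via mpi4py; the defaults here
    # are placeholders for a one-process run.
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')  # rank 0's address (from `hostname -I`)
    os.environ.setdefault('MASTER_PORT', '29500')      # TORCH_DISTRIBUTED_DEFAULT_PORT
    os.environ.setdefault('RANK', '0')                 # MPI world rank
    os.environ.setdefault('WORLD_SIZE', '1')           # MPI world size

    # torch.distributed picks these variables up with the env:// init method
    dist.init_process_group(backend='gloo', init_method='env://')
    print('initialized rank', dist.get_rank(), 'of', dist.get_world_size())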