Unverified commit 8d53ac0c, authored by mzl, committed by GitHub

Add MPICH Multinode Runner (#2839)

* MPICH support

* MPICH changes

* MPICH changes

* MPICH changes

* MPICH changes

* accelerator runtime modifications

* Accelerator runtime changes

* Accelerator runtime modifications

* Remove redundant print from single node

* Move hostfile to tmp

* Code cleanup for MPICH class

* Code cleanup, rm whitespace

* Removing mpiexec environment check details

* Tmp hostfile not needed, as the hostfile is passed directly

* Remove debugging comments

* rm print statement

* Revert comm changes, as workaround not needed

* Use MPICHRunner name for class

* Use MPICHRunner as class name

* No need to use args.force_multi and args.launcher.

This should be set in the DeepSpeedExamples gpt-3.6b.sh script as:
launcher=MPICH
run_cmd="deepspeed --hostfile=${hostfile_ds} --num_nodes ${NUM_WORKERS} --num_gpus ${NUM_GPUS_PER_WORKER} --launcher=${launcher} --force_multi pretrain_gpt2.py $@ ${gpt_options}"

* Adhere to code pattern

* Rm empty lines in MPICHRunner class

* Uncomment check for num nodes and workers when hostfile_deepspeed is used in gpt-3.6b.sh

* pass MPICH hostfile through launcher_args in gpt-3.6b.sh

* Clean code and remove args hostfile

* fix merge

* fix merge

---------
Co-authored-by: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>

* clean up and fix format

* add ut

---------
Co-authored-by: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
Co-authored-by: Ammar Ahmad Awan <ammar.awan@microsoft.com>
Co-authored-by: Olatunji Ruwase <olruwase@microsoft.com>
Parent 91d7090e
deepspeed/launcher/constants.py
@@ -4,6 +4,7 @@ PDSH_LAUNCHER = 'pdsh'
 PDSH_MAX_FAN_OUT = 1024
 OPENMPI_LAUNCHER = 'openmpi'
+MPICH_LAUNCHER = 'mpich'
 SLURM_LAUNCHER = 'slurm'
 MVAPICH_LAUNCHER = 'mvapich'
 MVAPICH_TMP_HOSTFILE = '/tmp/deepspeed_mvapich_hostfile'
deepspeed/launcher/multinode_runner.py
@@ -170,6 +170,55 @@ class OpenMPIRunner(MultiNodeRunner):
         ] + self.user_arguments


+class MPICHRunner(MultiNodeRunner):
+    def __init__(self, args, world_info_base64, resource_pool):
+        super().__init__(args, world_info_base64)
+        self.resource_pool = resource_pool
+
+    def backend_exists(self):
+        #TODO: if IB is available we should suggest mpich
+        return shutil.which('mpirun')  #mpich_info
+
+    @property
+    def name(self):
+        return "mpich"
+
+    def validate_args(self):
+        super().validate_args()
+        #TODO: Allow for include/exclude at node-level but not gpu-level
+        if self.args.include != "" or self.args.exclude != "":
+            raise ValueError(f"{self.name} backend does not support worker include/exclusion")
+
+        if self.args.num_nodes != -1 or self.args.num_gpus != -1:
+            raise ValueError(f"{self.name} backend does not support limiting num nodes/gpus")
+
+    def get_cmd(self, environment, active_resources):
+        devices_per_node = self.resource_pool.values()
+        total_process_count = sum(devices_per_node)
+        process_per_node = list(devices_per_node)[0]
+
+        mpirun_cmd = [
+            'mpirun',
+            '-n',
+            f'{total_process_count}',
+            '-ppn',
+            f'{process_per_node}',
+        ] + split(self.args.launcher_args)
+
+        export_cmd = []
+        for k, v in self.exports.items():
+            export_cmd += ['-x', "{}={}".format(k, v)]
+
+        python_exec = []
+        if not self.args.no_python:
+            python_exec = [sys.executable, "-u"]
+            if self.args.module:
+                python_exec.append("-m")
+
+        return mpirun_cmd + python_exec + [self.user_script] + self.user_arguments
+
+
 class SlurmRunner(MultiNodeRunner):
     def __init__(self, args, world_info_base64, resource_pool):
         super().__init__(args, world_info_base64)
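For concreteness, here is a minimal standalone sketch of the command that get_cmd assembles, assuming a hypothetical two-node pool with four devices per node and an assumed --launcher_args value (neither comes from the diff). Note that in this version export_cmd is built but not included in the returned command.

import sys
from shlex import split

# Hypothetical resource pool: 2 nodes with 4 slots each (assumed values).
resource_pool = {'worker-0': 4, 'worker-1': 4}
launcher_args = '-hostfile /tmp/mpich_hostfile'  # assumed --launcher_args value

devices_per_node = resource_pool.values()
total_process_count = sum(devices_per_node)   # 8 ranks in total
process_per_node = list(devices_per_node)[0]  # 4 ranks per node

# Mirrors MPICHRunner.get_cmd: mpirun -n <total> -ppn <per-node> <launcher_args> python -u <script>
mpirun_cmd = ['mpirun', '-n', f'{total_process_count}', '-ppn', f'{process_per_node}'] + split(launcher_args)
cmd = mpirun_cmd + [sys.executable, '-u', 'pretrain_gpt2.py']
print(' '.join(cmd))
# -> mpirun -n 8 -ppn 4 -hostfile /tmp/mpich_hostfile /usr/bin/python -u pretrain_gpt2.py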
deepspeed/launcher/runner.py
@@ -18,8 +18,8 @@ from copy import deepcopy
 import signal
 import time
-from .multinode_runner import PDSHRunner, OpenMPIRunner, MVAPICHRunner, SlurmRunner
-from .constants import PDSH_LAUNCHER, OPENMPI_LAUNCHER, MVAPICH_LAUNCHER, SLURM_LAUNCHER
+from .multinode_runner import PDSHRunner, OpenMPIRunner, MVAPICHRunner, SlurmRunner, MPICHRunner
+from .constants import PDSH_LAUNCHER, OPENMPI_LAUNCHER, MVAPICH_LAUNCHER, SLURM_LAUNCHER, MPICH_LAUNCHER
 from ..constants import TORCH_DISTRIBUTED_DEFAULT_PORT
 from ..nebula.constants import NEBULA_EXPORT_ENVS
 from ..utils import logger
@@ -114,7 +114,7 @@ def parse_args(args=None):
                         default=PDSH_LAUNCHER,
                         type=str,
                         help="(optional) choose launcher backend for multi-node "
-                        "training. Options currently include PDSH, OpenMPI, MVAPICH, SLURM.")
+                        "training. Options currently include PDSH, OpenMPI, MVAPICH, SLURM, MPICH.")
     parser.add_argument("--launcher_args",
                         default="",
@@ -511,6 +511,8 @@ def main(args=None):
         runner = PDSHRunner(args, world_info_base64)
     elif args.launcher == OPENMPI_LAUNCHER:
         runner = OpenMPIRunner(args, world_info_base64, resource_pool)
+    elif args.launcher == MPICH_LAUNCHER:
+        runner = MPICHRunner(args, world_info_base64, resource_pool)
     elif args.launcher == MVAPICH_LAUNCHER:
         runner = MVAPICHRunner(args, world_info_base64, resource_pool)
     elif args.launcher == SLURM_LAUNCHER:
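With this dispatch in place, the MPICH backend is selected via --launcher. A minimal sketch of a full invocation, mirroring the commit message above (hostfile paths and the training script are placeholders; --num_nodes/--num_gpus are omitted because validate_args rejects them for this backend):

import subprocess

# Hypothetical end-to-end launch; hostfile paths and script name are placeholders.
subprocess.run([
    'deepspeed',
    '--hostfile', '/job/hostfile',                       # DeepSpeed-format hostfile
    '--launcher', 'MPICH',
    '--launcher_args', '-hostfile /tmp/mpich_hostfile',  # MPICH hostfile, forwarded verbatim to mpirun
    '--force_multi',
    'pretrain_gpt2.py',
])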
tests/unit/launcher/test_multinode_runner.py
@@ -31,6 +31,13 @@ def test_openmpi_runner(runner_info):
     assert cmd[0] == 'mpirun'


+def test_mpich_runner(runner_info):
+    env, resource_pool, world_info, args = runner_info
+    runner = mnrunner.MPICHRunner(args, world_info, resource_pool)
+    cmd = runner.get_cmd(env, resource_pool)
+    assert cmd[0] == 'mpirun'
+
+
 def test_slurm_runner(runner_info):
     env, resource_pool, world_info, args = runner_info
     runner = mnrunner.SlurmRunner(args, world_info, resource_pool)
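The runner_info fixture itself is outside this hunk; a plausible sketch of what it might provide (all names and values below are assumptions, not taken from the diff):

import os
import pytest
from copy import deepcopy
from deepspeed.launcher import multinode_runner as mnrunner
from deepspeed.launcher.runner import encode_world_info, parse_args

@pytest.fixture
def runner_info():
    hosts = {'worker-0': 4, 'worker-1': 4}  # assumed 2-node resource pool
    world_info = encode_world_info(hosts)   # base64-encoded world info
    env = deepcopy(os.environ)
    args = parse_args(['test_launcher.py'])  # placeholder user script
    return env, hosts, world_info, args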