提交 24ff160f,作者:mindspore-ci-bot,提交者:Gitee

!817 remove amp setting and add RANK_TABLE_FILE

Merge pull request !817 from wandongdong/r0.2
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
"""launch train script""" """launch train script"""
import os import os
import sys import sys
import subprocess
import json import json
from argparse import ArgumentParser from argparse import ArgumentParser
...@@ -125,25 +124,19 @@ def main(): ...@@ -125,25 +124,19 @@ def main():
sys.stdout.flush() sys.stdout.flush()
# spawn the processes # spawn the processes
current_env = os.environ.copy()
current_env["RANK_SIZE"] = str(args.nproc_per_node)
if args.nproc_per_node > 1:
current_env["MINDSPORE_HCCL_CONFIG_PATH"] = table_fn
processes = []
cmds = []
for rank_id in range(0, args.nproc_per_node): for rank_id in range(0, args.nproc_per_node):
current_env["RANK_ID"] = str(rank_id) device_id = visible_devices[rank_id]
current_env["DEVICE_ID"] = visible_devices[rank_id] device_dir = os.path.join(os.getcwd(), 'device{}'.format(rank_id))
cmd = [sys.executable, "-u"] rank_process = 'export RANK_SIZE={} && export RANK_ID={} && export DEVICE_ID={} && '.format(args.nproc_per_node,
cmd.append(args.training_script) rank_id, device_id)
cmd.extend(args.training_script_args) if args.nproc_per_node > 1:
process = subprocess.Popen(cmd, env=current_env) rank_process += 'export MINDSPORE_HCCL_CONFIG_PATH={} && '.format(table_fn)
processes.append(process) rank_process += 'export RANK_TABLE_FILE={} && '.format(table_fn)
cmds.append(cmd) rank_process += 'rm -rf {dir} && mkdir {dir} && cd {dir} && python {script} '.format(dir=device_dir,
for process, cmd in zip(processes, cmds): script=args.training_script
process.wait() )
if process.returncode != 0: rank_process += ' '.join(args.training_script_args) + ' > log{}.log 2>&1 &'.format(rank_id)
raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd) os.system(rank_process)
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -135,8 +135,7 @@ if __name__ == '__main__': ...@@ -135,8 +135,7 @@ if __name__ == '__main__':
opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum, opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum,
config.weight_decay, config.loss_scale) config.weight_decay, config.loss_scale)
model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, amp_level='O0', model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale)
keep_batchnorm_fp32=False)
cb = None cb = None
if rank_id == 0: if rank_id == 0:
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请注册