launch.py 2.3 KB
Newer Older
W
wandongdong 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""launch train script"""
import os
import sys
W
wandongdong 已提交
18
import subprocess
W
wandongdong 已提交
19
import shutil
P
Payne 已提交
20
from args import launch_parse_args
W
wandongdong 已提交
21 22 23

def main():
    """Spawn one training subprocess per rank and wait for all of them.

    Launch options come from ``launch_parse_args()``. For each rank in
    ``range(args.nproc_per_node)`` this function:

    * recreates a fresh ``device<rank>`` directory under the starting
      working directory (any existing directory of that name is deleted),
    * changes into that directory so the child inherits it as its cwd,
    * starts ``sys.executable -u <train_script> <train_script_args...>``
      with ``RANK_SIZE``, ``RANK_ID`` and ``DEVICE_ID`` set in its
      environment, sending stdout and stderr to ``log<rank>.log`` inside
      the device directory.

    After all children are started, they are waited on in rank order.

    Raises:
        AssertionError: if the training script is not a file, or fewer
            visible devices are listed than processes requested.
        subprocess.CalledProcessError: for the first (in rank order) child
            that exits with a non-zero return code. Children that are still
            running at that point are not terminated.
    """
    print("start", __file__)
    args = launch_parse_args()
    print(args)
    visible_devices = args.visible_devices.split(',')
    # NOTE(review): this check reads `args.training_script`, while the command
    # below is built from `args.train_script`. Confirm which attribute
    # launch_parse_args() actually defines and make the two agree.
    assert os.path.isfile(args.training_script)
    assert len(visible_devices) >= args.nproc_per_node
    print('visible_devices:{}'.format(visible_devices))

    # spawn the processes
    processes = []
    cmds = []
    log_files = []
    env = os.environ.copy()
    env['RANK_SIZE'] = str(args.nproc_per_node)
    cur_path = os.getcwd()
    try:
        for rank_id in range(0, args.nproc_per_node):
            # Return to the starting directory before preparing the next rank;
            # the previous iteration left us inside its device directory.
            os.chdir(cur_path)
            device_id = visible_devices[rank_id]
            device_dir = os.path.join(cur_path, 'device{}'.format(rank_id))
            # Popen snapshots the environment at spawn time, so reusing and
            # overwriting the same dict for every rank is safe.
            env['RANK_ID'] = str(rank_id)
            env['DEVICE_ID'] = str(device_id)
            if os.path.exists(device_dir):
                shutil.rmtree(device_dir)
            os.mkdir(device_dir)
            # The child inherits the parent's cwd, so it runs in device_dir.
            os.chdir(device_dir)
            cmd = [sys.executable, '-u']
            cmd.append(args.train_script)
            cmd.extend(args.train_script_args)
            log_file = open('{dir}/log{id}.log'.format(dir=device_dir, id=rank_id), 'w')
            # Register the handle before spawning so it is closed even if
            # Popen itself raises.
            log_files.append(log_file)
            process = subprocess.Popen(cmd, stdout=log_file, stderr=log_file, env=env)
            processes.append(process)
            cmds.append(cmd)
        for process, cmd in zip(processes, cmds):
            process.wait()
            if process.returncode != 0:
                # Pass the integer exit status, not the Popen object.
                raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)
    finally:
        # Close every log handle on both the success and the failure path.
        for log_file in log_files:
            log_file.close()
W
wandongdong 已提交
61 62 63 64


# Run the launcher only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()