run_mp.py
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
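
# Example usage (assumes the launched command, e.g. train.py, reads the
# PADDLE_* and FLAGS_selected_gpus environment variables listed in --help):
#
#   python run_mp.py --gpus 8 --cmd "python train.py --lr 0.1" --log_dir mylog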

from __future__ import print_function

import subprocess
import os
import sys
import time
import argparse

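# Base environment passed to every worker process. The NCCL_* values below
# (socket interface, IB GID index, retry count) are cluster-specific defaults;
# matching NCCL_*/FLAGS_*/GLOG_* variables from the parent environment
# override them in start_procs().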
default_envs = {
    "PADDLE_TRAINER_ENDPOINTS":
    "127.0.0.1:6170,127.0.0.1:6171,127.0.0.1:6172,127.0.0.1:6173,127.0.0.1:6174,127.0.0.1:6175,127.0.0.1:6176,127.0.0.1:6177",
    "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
    "PATH": os.getenv("PATH"),
    "LD_PRELOAD": os.getenv("LD_PRELOAD", ""),
    "PADDLE_TRAINERS_NUM": "8",
    "NCCL_DEBUG": "INFO",
    "GLOG_v": "0",
    "NCCL_SOCKET_IFNAME": "eth0",
    "NCCL_IB_GID_INDEX": "3",
    "NCCL_IB_RETRY_CNT": "0",
}

GPUS = 8


def start_procs(gpus, cmd, log_dir):
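    """Launch `gpus` worker processes of `cmd` on this node, one per GPU.

    Each worker receives its own FLAGS_selected_gpus / PADDLE_* environment
    (trainer id, endpoint, world size) and logs to `log_dir`/workerlog.<i>.
    """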
    procs = []
    log_fns = []
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir)
    # ======== update parent envs =======
    for k, v in os.environ.items():
        if k.startswith("FLAGS_") or k.startswith("NCCL_") or \
            k.startswith("GLOG_"):
            default_envs[k] = v

    # ======== for dist training =======
    node_trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    current_ip = os.getenv("POD_IP", "127.0.0.1")
    trainer_ips = os.getenv("PADDLE_TRAINERS", current_ip).split(",")
    num_nodes = len(trainer_ips)
    all_nodes_devices_endpoints = ""
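    # Build the global endpoint list: one "ip:port" entry per GPU per node.
    # The "617%d" pattern assumes at most 10 GPUs per node (ports 6170-6179).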
    for n in trainer_ips:
        for i in range(gpus):
            if all_nodes_devices_endpoints:
                all_nodes_devices_endpoints += ","
            all_nodes_devices_endpoints += "%s:617%d" % (n, i)
    nranks = num_nodes * gpus
    # ======== for dist training =======

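    # Start one worker per GPU: each gets a unique global trainer id
    # (node index * gpus + local GPU index), its own endpoint, and a log file.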
    for i in range(gpus):
        curr_env = {}
        curr_env.update(default_envs)
        curr_env.update({
            "FLAGS_selected_gpus": "%d" % i,
            "PADDLE_TRAINER_ID": "%d" % (node_trainer_id * gpus + i),
            "PADDLE_CURRENT_ENDPOINT": "%s:617%d" % (current_ip, i),
            # world size: total number of trainer processes across all nodes
            "PADDLE_TRAINERS_NUM": "%d" % nranks,
            "PADDLE_TRAINER_ENDPOINTS": all_nodes_devices_endpoints
        })

        print("starting process ", i, cmd, curr_env)
        fn = open("%s/workerlog.%d" % (log_dir, i), "w")
        log_fns.append(fn)
        procs.append(
            subprocess.Popen(
                cmd.strip().split(" "), stdout=fn, stderr=fn, env=curr_env))

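    # Wait for each worker to exit, then close its log file; errors from
    # already-terminated processes are ignored.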
    for i in range(gpus):
        try:
            procs[i].communicate()
            procs[i].terminate()
            log_fns[i].close()
        except:
            pass


def main():
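    """Parse command-line arguments and launch one worker process per GPU."""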
    parser = argparse.ArgumentParser(
        description='''start paddle training using multi-process mode.
NOTE: your train program ***must*** run in distributed nccl2 mode,
see: http://www.paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2-
Your train program must also read the environment variables below so that each
process can initialize properly:
FLAGS_selected_gpus
PADDLE_TRAINER_ID
PADDLE_CURRENT_ENDPOINT
PADDLE_TRAINERS_NUM
PADDLE_TRAINER_ENDPOINTS
POD_IP (current node ip address, not needed for local training)
''')
    parser.add_argument(
        '--gpus',
        type=int,
        default=8,
        help='number of GPUs on this node; one worker process is started per GPU')
    parser.add_argument(
        '--cmd',
        type=str,
        default="",
        help='command to run for each process, e.g. python train.py --lr 0.1')
    parser.add_argument(
        '--log_dir',
        type=str,
        default="mylog",
        help='directory for per-process log files.')
    args = parser.parse_args()
    if args.cmd == "":
        parser.print_help()
        sys.exit(0)
    start_procs(args.gpus, args.cmd, args.log_dir)


if __name__ == "__main__":
    main()