未验证 提交 9252aa41 编写于 作者: W Wu Yi 提交者: GitHub

add multi process start script (#15381)

* add multi process start script test=develop

* refine tool test=develop
上级 530869f8
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import subprocess
import os
import sys
import time
import argparse
default_envs = {
"PADDLE_TRAINER_ENDPOINTS":
"127.0.0.1:6170,127.0.0.1:6171,127.0.0.1:6172,127.0.0.1:6173,127.0.0.1:6174,127.0.0.1:6175,127.0.0.1:6176,127.0.0.1:6177",
"LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
"PATH": os.getenv("PATH"),
"LD_PRELOAD": os.getenv("LD_PRELOAD", ""),
"PADDLE_TRAINERS_NUM": "8",
"NCCL_DEBUG": "INFO",
"GLOG_v": "0",
"NCCL_SOCKET_IFNAME": "eth0",
"NCCL_IB_GID_INDEX": "3",
"NCCL_IB_RETRY_CNT": "0",
}
GPUS = 8
def start_procs(gpus, cmd, log_dir):
procs = []
log_fns = []
os.system("mkdir -p %s" % log_dir)
# ======== update parent envs =======
for k, v in os.environ.items():
if k.startswith("FLAGS_") or k.startswith("NCCL_") or \
k.startswith("GLOG_"):
default_envs[k] = v
# ======== for dist training =======
node_trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
current_ip = os.getenv("POD_IP", "127.0.0.1")
trainer_ips = os.getenv("PADDLE_TRAINERS", current_ip).split(",")
num_nodes = len(trainer_ips)
all_nodes_devices_endpoints = ""
for n in trainer_ips:
for i in range(gpus):
if all_nodes_devices_endpoints:
all_nodes_devices_endpoints += ","
all_nodes_devices_endpoints += "%s:617%d" % (n, i)
nranks = num_nodes * gpus
# ======== for dist training =======
for i in range(gpus):
curr_env = {}
curr_env.update(default_envs)
curr_env.update({
"FLAGS_selected_gpus": "%d" % i,
"PADDLE_TRAINER_ID": "%d" % (node_trainer_id * gpus + i),
"PADDLE_CURRENT_ENDPOINT": "%s:617%d" % (current_ip, i),
# nranks
"PADDLE_TRAINERS_NUM": "%d" % nranks,
"PADDLE_TRAINER_ENDPOINTS": all_nodes_devices_endpoints
})
print("starting process ", i, cmd, curr_env)
fn = open("%s/workerlog.%d" % (log_dir, i), "w")
log_fns.append(fn)
procs.append(
subprocess.Popen(
cmd.strip().split(" "), stdout=fn, stderr=fn, env=curr_env))
for i in range(gpus):
try:
procs[i].communicate()
procs[i].terminate()
log_fns[i].close()
except:
pass
def main():
parser = argparse.ArgumentParser(
description='''start paddle training using multi-process mode.
NOTE: your train program ***must*** run as distributed nccl2 mode,
see: http://www.paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2-
And your train program must read environment variables below in order to let different
process init properly:
FLAGS_selected_gpus
PADDLE_TRAINER_ID
PADDLE_CURRENT_ENDPOINT
PADDLE_TRAINERS_NUM
PADDLE_TRAINER_ENDPOINTS
POD_IP (current node ip address, not needed for local training)
''')
parser.add_argument(
'--gpus',
type=int,
default=8,
help='start number of processes for every gpu')
parser.add_argument(
'--cmd',
type=str,
default="",
help='command to run for each process, e.g. python train.py --lr 0.1')
parser.add_argument(
'--log_dir',
type=str,
default="mylog",
help='directory to put logs per process.')
args = parser.parse_args()
if args.cmd == "":
parser.print_help()
exit(0)
start_procs(args.gpus, args.cmd, args.log_dir)
if __name__ == "__main__":
main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册