提交 0865b5a9 编写于 作者: D danleifeng 提交者: gongweibao

distribute launch : add use_paddlecloud argument (#19273)

distribute launch : add use_paddlecloud argument 
上级 76c95af0
......@@ -14,11 +14,9 @@
"""
paddle.distributed.launch is a module that spawns multiple distributed
processes on each training node for GPU training.
Usage:
In both single-node and multi-node training, this module
launches one process on each of the given GPU cards.
1. for single node training with all visible gpu cards:
python -m paddle.distributed.launch \
your_training_py (arg1 arg2 and all others)
......@@ -26,13 +24,11 @@ launch a process on each of the given gpu card.
2. for single node training with [0,4) cards
python -m paddle.distributed.launch --selected_gpus="0,1,2,3" \
your_training_py (arg1 arg2 and all others)
3. for multiple node training such as two nodes: 192.168.0.16, 192.168.0.17
on 192.168.0.16:
python -m paddle.distributed.launch --cluster_node_ips="192.168.0.16,192.168.0.17" \
--node_ip=192.168.0.16 \
your_training_py (arg1 arg2 and all others)
on 192.168.0.17:
python -m paddle.distributed.launch --cluster_node_ips="192.168.0.16,192.168.0.17" \
--node_ip=192.168.0.17 \
......@@ -44,6 +40,7 @@ import sys
from sys import version
import subprocess
import os
import warnings
import six
import copy
from argparse import ArgumentParser, REMAINDER
......@@ -76,19 +73,22 @@ PADDLE_TRAINER_ENDPOINTS
POD_IP (current node ip address, not needed for local training)
''')
# Optional arguments for the launch helper
#Optional arguments for the launch helper
parser.add_argument(
"--cluster_node_ips",
type=str,
default="127.0.0.1",
help="Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17..")
parser.add_argument(
"--node_ip",
type=str,
default="127.0.0.1",
help="The current node ip. ")
# NOTE: argparse's ``type=bool`` turns every non-empty string -- including
# "False" -- into True, and a *string* default is passed through ``type`` too,
# so the previous ``type=bool, default="False"`` always produced True.
# Parse the textual value explicitly and use a real boolean default instead;
# ``--use_paddlecloud True`` keeps working and ``--use_paddlecloud False``
# now really disables the option.
parser.add_argument(
    "--use_paddlecloud",
    type=lambda value: str(value).strip().lower() in ("true", "t", "yes", "y", "1"),
    default=False,
    help="whether to use paddlecloud platform to run your multi-process job "
    "(accepted true values: true/t/yes/y/1, case-insensitive).")
parser.add_argument(
"--started_port",
type=int,
......@@ -115,7 +115,7 @@ POD_IP (current node ip address, not needed for local training)
help="The path for each process's log.If it's not setted, the log will printed to default pipe."
)
# positional
#positional
parser.add_argument(
"training_script",
type=str,
......@@ -124,7 +124,7 @@ POD_IP (current node ip address, not needed for local training)
"followed by all the arguments for the "
"training script")
# rest from the training program
#rest from the training program
parser.add_argument('training_script_args', nargs=REMAINDER)
return parser.parse_args()
......@@ -140,6 +140,32 @@ def start_procs(args):
current_node_ip = args.node_ip
node_ips = [x.strip() for x in args.cluster_node_ips.split(',')]
node_id = node_ips.index(current_node_ip)
if args.use_paddlecloud:
trainer_nums = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
if trainer_nums != 1:
# In paddlecloud multi-node mode the IP info is read automatically from the environment.
current_node_ip = os.getenv("POD_IP")
assert current_node_ip is not None, "POD_IP should not be None"
node_ips = os.getenv("PADDLE_TRAINERS")
assert node_ips is not None, "PADDLE_TRAINERS should not be None"
node_ips = node_ips.split(",")
node_id = os.getenv("PADDLE_TRAINER_ID")
assert node_id is not None, "PADDLE_TRAINER_ID should not be None"
node_id = int(node_id)
if args.node_ip != "127.0.0.1" and current_node_ip != args.node_ip:
warnings.warn(
"Please NOTE: When using paddlecloud, current_node_ip is \
automatically got from POD_IP. Your input node_ip: {} doesn't equals to \
current_node_ip: {} from paddlecloud environment."
.format(args.node_ip, current_node_ip))
if args.cluster_node_ips != "127.0.0.1" and args.cluster_node_ips != ",".join(
node_ips):
warnings.warn(
"Please NOTE: When using paddlecloud, cluster_node_ips is \
automatically got from PADDLE_TRAINERS(multi nodes) or POD_IP(single node).\
Your input cluster_node_ips: {} doesn't equals to IPs: {} from \
paddlecloud environment.".format(args.cluster_node_ips, node_ips))
num_nodes = len(node_ips)
if args.selected_gpus is None:
......@@ -164,10 +190,10 @@ def start_procs(args):
", node_ips:", node_ips, ", nranks:", nranks)
current_env = copy.copy(default_env)
# paddle broadcast ncclUniqueId use socket, and
# proxy maybe make trainers unreachable, so delete them.
# if we set them to "", grpc will log error message "bad uri"
# so just delete them.
# Paddle broadcasts the ncclUniqueId over a socket, and a proxy
# may make trainers unreachable, so remove the proxy settings.
# Setting them to "" would make grpc log the error message "bad uri",
# so delete them outright instead.
current_env.pop("http_proxy", None)
current_env.pop("https_proxy", None)
......
......@@ -20,14 +20,14 @@ def train():
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
worker_endpoints = worker_endpoints_env.split(",")
trainers_num = len(worker_endpoints)
worker_endpoints = worker_endpoints_env
trainers_num = len(worker_endpoints.split(','))
name = "selected_gpus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"\
.format(selected_gpus, worker_endpoints, trainers_num, current_endpoint,trainer_id)
print(name)
with open("multi_process.check.log", "w") as f:
with open("multi_process.check_{}.log".format(trainer_id), "w") as f:
f.write(name)
......
#!/bin/bash
set -e
set -ex
# use default values
python -m paddle.distributed.launch multi_process.py
# use specified values
cluster_node_ips="127.0.0.1"
node_ip="127.0.0.1"
# use paddlecloud
cluster_node_ips="10.0.0.1"
node_ip="10.0.0.1"
export PADDLE_TRAINERS_NUM=2
export POD_IP=127.0.0.1
export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
export PADDLE_TRAINER_ID=0
distributed_args="--cluster_node_ips ${cluster_node_ips} --node_ip ${node_ip} --selected_gpus=0,1 --log_dir testlog"
distributed_args="--use_paddlecloud True --cluster_node_ips ${cluster_node_ips} --node_ip ${node_ip} --selected_gpus=0,1 --log_dir testlog"
python -m paddle.distributed.launch ${distributed_args} multi_process.py
str1="selected_gpus:0 worker_endpoints:['127.0.0.1:6170', '127.0.0.1:6171'] trainers_num:2 current_endpoint:127.0.0.1:6170 trainer_id:0"
str2="selected_gpus:1 worker_endpoints:['127.0.0.1:6170', '127.0.0.1:6171'] trainers_num:2 current_endpoint:127.0.0.1:6171 trainer_id:1"
file="multi_process.check.log"
str1="selected_gpus:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 trainers_num:4 current_endpoint:127.0.0.1:6170 trainer_id:0"
str2="selected_gpus:1 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 trainers_num:4 current_endpoint:127.0.0.1:6171 trainer_id:1"
file_0="multi_process.check_0.log"
file_1="multi_process.check_1.log"
if ! grep -q "$str1" "$file"; then
echo "paddlecloud params test"
if grep -q "$str1" "$file_0"; then
echo "find trainer 0"
else
echo "not find trainer 0"
exit -1
fi
if ! grep -q "$str2" "$file"; then
if grep -q "$str2" "$file_1"; then
echo "find trainer 1"
else
echo "not find trainer 0"
echo "not find trainer 1"
exit -1
fi
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册