From 0865b5a9a03fd3dcd0e43375c6c58d3265c3d542 Mon Sep 17 00:00:00 2001
From: danleifeng <52735331+danleifeng@users.noreply.github.com>
Date: Tue, 20 Aug 2019 13:04:16 +0800
Subject: [PATCH] distribute launch : add use_paddlecloud argument (#19273)

distribute launch : add use_paddlecloud argument
---
 python/paddle/distributed/launch.py            | 52 ++++++++++++++-----
 .../fluid/tests/unittests/multi_process.py     |  6 +--
 .../fluid/tests/unittests/test_launch.sh       | 29 ++++++-----
 3 files changed, 59 insertions(+), 28 deletions(-)

diff --git a/python/paddle/distributed/launch.py b/python/paddle/distributed/launch.py
index 82322c657e5..8f9d080b6bb 100644
--- a/python/paddle/distributed/launch.py
+++ b/python/paddle/distributed/launch.py
@@ -14,11 +14,9 @@
 """
 paddle.distributed.launch is a module that spawns multiple distributed
 process on each trainning node for gpu trainning.
-
 Usage:
         In both of single node training or multiple node training, this module
 launch a process on each of the given gpu card.
-
         1. for single node trainning with all visible gpu cards:
            python -m paddle.distributed.launch \
              your_training_py (arg1 arg2 and all others)
@@ -26,13 +24,11 @@ launch a process on each of the given gpu card.
         2. for single node trainning with [0,4) cards
            python -m paddle.distributed.launch --selected_gpus="0,1,2,3" \
              your_training_py (arg1 arg2 and all others)
-
         3. for mulitple node training such as two node:192.168.0.16, 192.168.0.17
             on 192.168.0.16:
                 python -m paddle.distributed.launch --cluster_node_ips="192.168.0.16,192.168.0.17" \
                     --node_ip=192.168.0.16 \
                     your_training_py (arg1 arg2 and all others)
-
             on 192.168.0.17:
                 python -m paddle.distributed.launch --cluster_node_ips="192.168.0.16,192.168.0.17" \
                     --node_ip=192.168.0.17 \
                     your_training_py (arg1 arg2 and all others)
@@ -44,6 +40,7 @@ import sys
 from sys import version
 import subprocess
 import os
+import warnings
 import six
 import copy
 from argparse import ArgumentParser, REMAINDER
@@ -76,19 +73,22 @@ PADDLE_TRAINER_ENDPOINTS
 POD_IP (current node ip address, not needed for local training)
 ''')

-    # Optional arguments for the launch helper
+    #Optional arguments for the launch helper
     parser.add_argument(
         "--cluster_node_ips",
         type=str,
         default="127.0.0.1",
         help="Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17..")
-
     parser.add_argument(
         "--node_ip",
         type=str,
         default="127.0.0.1",
         help="The current node ip. ")
-
+    parser.add_argument(
+        "--use_paddlecloud",
+        type=bool,
+        default="False",
+        help="wheter to use paddlecloud platform to run your multi-process job.")
     parser.add_argument(
         "--started_port",
         type=int,
@@ -115,7 +115,7 @@ POD_IP (current node ip address, not needed for local training)
         help="The path for each process's log.If it's not setted, the log will printed to default pipe."
     )

-    # positional
+    #positional
     parser.add_argument(
         "training_script",
         type=str,
@@ -124,7 +124,7 @@ POD_IP (current node ip address, not needed for local training)
         "followed by all the arguments for the "
         "training script")

-    # rest from the training program
+    #rest from the training program
     parser.add_argument('training_script_args', nargs=REMAINDER)
     return parser.parse_args()

@@ -140,6 +140,32 @@ def start_procs(args):
     current_node_ip = args.node_ip
     node_ips = [x.strip() for x in args.cluster_node_ips.split(',')]
     node_id = node_ips.index(current_node_ip)
+    if args.use_paddlecloud:
+        trainer_nums = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
+        if trainer_nums != 1:
+            #you can automatically get ip info while using paddlecloud multi nodes mode.
+            current_node_ip = os.getenv("POD_IP")
+            assert current_node_ip is not None, "POD_IP should not be None"
+            node_ips = os.getenv("PADDLE_TRAINERS")
+            assert node_ips is not None, "PADDLE_TRAINERS should not be None"
+            node_ips = node_ips.split(",")
+            node_id = os.getenv("PADDLE_TRAINER_ID")
+            assert node_id is not None, "PADDLE_TRAINER_ID should not be None"
+            node_id = int(node_id)
+
+            if args.node_ip != "127.0.0.1" and current_node_ip != args.node_ip:
+                warnings.warn(
+                    "Please NOTE: When using paddlecloud, current_node_ip is \
+automatically got from POD_IP. Your input node_ip: {} doesn't equals to \
+current_node_ip: {} from paddlecloud environment."
+                    .format(args.node_ip, current_node_ip))
+            if args.cluster_node_ips != "127.0.0.1" and args.cluster_node_ips != ",".join(
+                    node_ips):
+                warnings.warn(
+                    "Please NOTE: When using paddlecloud, cluster_node_ips is \
+automatically got from PADDLE_TRAINERS(multi nodes) or POD_IP(single node).\
+Your input cluster_node_ips: {} doesn't equals to IPs: {} from \
+paddlecloud environment.".format(args.cluster_node_ips, node_ips))

     num_nodes = len(node_ips)
     if args.selected_gpus is None:
@@ -164,10 +190,10 @@ def start_procs(args):
           ", node_ips:", node_ips, ", nranks:", nranks)

     current_env = copy.copy(default_env)
-    # paddle broadcast ncclUniqueId use socket, and
-    # proxy maybe make trainers unreachable, so delete them.
-    # if we set them to "", grpc will log error message "bad uri"
-    # so just delete them.
+    #paddle broadcast ncclUniqueId use socket, and
+    #proxy maybe make trainers unreachable, so delete them.
+    #if we set them to "", grpc will log error message "bad uri"
+    #so just delete them.
     current_env.pop("http_proxy", None)
     current_env.pop("https_proxy", None)

diff --git a/python/paddle/fluid/tests/unittests/multi_process.py b/python/paddle/fluid/tests/unittests/multi_process.py
index 176439626fe..f5870edf96c 100644
--- a/python/paddle/fluid/tests/unittests/multi_process.py
+++ b/python/paddle/fluid/tests/unittests/multi_process.py
@@ -20,14 +20,14 @@ def train():
     trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
     worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
     current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
-    worker_endpoints = worker_endpoints_env.split(",")
-    trainers_num = len(worker_endpoints)
+    worker_endpoints = worker_endpoints_env
+    trainers_num = len(worker_endpoints.split(','))

     name = "selected_gpus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"\
         .format(selected_gpus, worker_endpoints, trainers_num, current_endpoint,trainer_id)

     print(name)
-    with open("multi_process.check.log", "w") as f:
+    with open("multi_process.check_{}.log".format(trainer_id), "w") as f:
         f.write(name)

diff --git a/python/paddle/fluid/tests/unittests/test_launch.sh b/python/paddle/fluid/tests/unittests/test_launch.sh
index 01b620d01df..87dc9bad96f 100644
--- a/python/paddle/fluid/tests/unittests/test_launch.sh
+++ b/python/paddle/fluid/tests/unittests/test_launch.sh
@@ -1,30 +1,35 @@
 #!/bin/bash
-set -e
-
+set -ex
 # use default values
 python -m paddle.distributed.launch multi_process.py

-# use specified values
-cluster_node_ips="127.0.0.1"
-node_ip="127.0.0.1"
+# use paddlecloud
+cluster_node_ips="10.0.0.1"
+node_ip="10.0.0.1"
+export PADDLE_TRAINERS_NUM=2
+export POD_IP=127.0.0.1
+export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
+export PADDLE_TRAINER_ID=0

-distributed_args="--cluster_node_ips ${cluster_node_ips} --node_ip ${node_ip} --selected_gpus=0,1 --log_dir testlog"
+distributed_args="--use_paddlecloud True --cluster_node_ips ${cluster_node_ips} --node_ip ${node_ip} --selected_gpus=0,1 --log_dir testlog"
 python -m paddle.distributed.launch ${distributed_args} multi_process.py

-str1="selected_gpus:0 worker_endpoints:['127.0.0.1:6170', '127.0.0.1:6171'] trainers_num:2 current_endpoint:127.0.0.1:6170 trainer_id:0"
-str2="selected_gpus:1 worker_endpoints:['127.0.0.1:6170', '127.0.0.1:6171'] trainers_num:2 current_endpoint:127.0.0.1:6171 trainer_id:1"
-file="multi_process.check.log"
+str1="selected_gpus:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 trainers_num:4 current_endpoint:127.0.0.1:6170 trainer_id:0"
+str2="selected_gpus:1 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 trainers_num:4 current_endpoint:127.0.0.1:6171 trainer_id:1"
+file_0="multi_process.check_0.log"
+file_1="multi_process.check_1.log"

-if ! grep -q "$str1" "$file"; then
+echo "paddlecloud params test"
+if grep -q "$str1" "$file_0"; then
     echo "find trainer 0"
 else
     echo "not find trainer 0"
     exit -1
 fi

-if ! grep -q "$str2" "$file"; then
+if grep -q "$str2" "$file_1"; then
     echo "find trainer 1"
 else
-    echo "not find trainer 0"
+    echo "not find trainer 1"
     exit -1
 fi
-- 
GitLab
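
The sketch below isolates the environment handling that this patch adds inside start_procs, so the paddlecloud branch can be read and run on its own: when --use_paddlecloud is given and PADDLE_TRAINERS_NUM reports more than one trainer, the node IP list and the node id are taken from POD_IP, PADDLE_TRAINERS and PADDLE_TRAINER_ID rather than from the --node_ip/--cluster_node_ips arguments. This is a simplified illustration, not code from the patch; the function name resolve_cluster_from_paddlecloud, the warning wording, and the sample values are assumed for demonstration.

import os
import warnings


def resolve_cluster_from_paddlecloud(node_ip, cluster_node_ips):
    """Return (current_node_ip, node_ips, node_id), preferring paddlecloud env vars.

    Mirrors the logic this patch adds to start_procs: when PADDLE_TRAINERS_NUM
    reports more than one trainer, the topology published by paddlecloud
    overrides the command-line node_ip / cluster_node_ips settings.
    """
    node_ips = [x.strip() for x in cluster_node_ips.split(',')]
    current_node_ip = node_ip
    node_id = node_ips.index(current_node_ip)

    trainer_nums = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
    if trainer_nums != 1:
        # Multi-node paddlecloud job: read the topology from the environment.
        current_node_ip = os.environ["POD_IP"]
        node_ips = os.environ["PADDLE_TRAINERS"].split(",")
        node_id = int(os.environ["PADDLE_TRAINER_ID"])
        if node_ip != "127.0.0.1" and current_node_ip != node_ip:
            # Same situation the patch warns about: the CLI value is ignored.
            warnings.warn("node_ip {} is ignored; POD_IP {} from paddlecloud "
                          "is used instead.".format(node_ip, current_node_ip))
    return current_node_ip, node_ips, node_id


if __name__ == "__main__":
    # Simulate the two-node paddlecloud job that the updated test_launch.sh sets up.
    os.environ.update({
        "PADDLE_TRAINERS_NUM": "2",
        "POD_IP": "127.0.0.1",
        "PADDLE_TRAINERS": "127.0.0.1,127.0.0.2",
        "PADDLE_TRAINER_ID": "0",
    })
    print(resolve_cluster_from_paddlecloud("10.0.0.1", "10.0.0.1"))

Run directly, this prints ('127.0.0.1', ['127.0.0.1', '127.0.0.2'], 0) and emits the override warning; that trainer-0, two-node topology (two nodes times two selected GPUs, hence four worker endpoints) is what the grep expectations in the updated test_launch.sh are built from.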