distribute launch : add use_paddlecloud argument (#19273)

distribute launch : add use_paddlecloud argument

distribute launch : add use_paddlecloud argument (#19273)
distribute launch : add use_paddlecloud argument
0865b5a9 · danleifeng · gongweibao · 76c95af0 · 0865b5a9 · 0865b5a9
3 changed files
--- a/python/paddle/distributed/launch.py
+++ b/python/paddle/distributed/launch.py
@@ -14,11 +14,9 @@
 """
 paddle.distributed.launch is a module that spawns multiple distributed 
 process on each trainning node for gpu trainning.
 Usage:
    In both of single node training or multiple node training, this module 
 launch a process on each of the given gpu card.
    1. for single node trainning with all visible gpu cards:
       python -m paddle.distributed.launch \
         your_training_py (arg1 arg2 and all others)
@@ -26,13 +24,11 @@ launch a process on each of the given gpu card.
    2. for single node trainning with [0,4) cards
       python -m paddle.distributed.launch --selected_gpus="0,1,2,3" \
         your_training_py (arg1 arg2 and all others)
    3. for mulitple node training such as two node:192.168.0.16, 192.168.0.17
        on 192.168.0.16:
            python -m paddle.distributed.launch --cluster_node_ips="192.168.0.16,192.168.0.17" \
                --node_ip=192.168.0.16 \
                your_training_py (arg1 arg2 and all others)
        on 192.168.0.17:
            python -m paddle.distributed.launch --cluster_node_ips="192.168.0.16,192.168.0.17" \
                --node_ip=192.168.0.17 \
@@ -44,6 +40,7 @@ import sys
 from sys import version
 import subprocess
 import os
+import warnings
 import six
 import copy
 from argparse import ArgumentParser, REMAINDER
@@ -76,19 +73,22 @@ PADDLE_TRAINER_ENDPOINTS
 POD_IP (current node ip address, not needed for local training)
 ''')
-    # Optional arguments for the launch helper
+    #Optional arguments for the launch helper
    parser.add_argument(
        "--cluster_node_ips",
        type=str,
        default="127.0.0.1",
        help="Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17..")
    parser.add_argument(
        "--node_ip",
        type=str,
        default="127.0.0.1",
        help="The current node ip. ")
+    parser.add_argument(
+        "--use_paddlecloud",
+        type=bool,
+        default="False",
+        help="wheter to use paddlecloud platform to run your multi-process job.")
    parser.add_argument(
        "--started_port",
        type=int,
@@ -115,7 +115,7 @@ POD_IP (current node ip address, not needed for local training)
        help="The path for each process's log.If it's not setted, the log will printed to default pipe."
    )
-    # positional
+    #positional
    parser.add_argument(
        "training_script",
        type=str,
@@ -124,7 +124,7 @@ POD_IP (current node ip address, not needed for local training)
        "followed by all the arguments for the "
        "training script")
-    # rest from the training program
+    #rest from the training program
    parser.add_argument('training_script_args', nargs=REMAINDER)
    return parser.parse_args()
@@ -140,6 +140,32 @@ def start_procs(args):
    current_node_ip = args.node_ip
    node_ips = [x.strip() for x in args.cluster_node_ips.split(',')]
    node_id = node_ips.index(current_node_ip)
+    if args.use_paddlecloud:
+        trainer_nums = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
+        if trainer_nums != 1:
+            #you can automatically get ip info while using paddlecloud multi nodes mode.
+            current_node_ip = os.getenv("POD_IP")
+            assert current_node_ip is not None, "POD_IP should not be None"
+            node_ips = os.getenv("PADDLE_TRAINERS")
+            assert node_ips is not None, "PADDLE_TRAINERS should not be None"
+            node_ips = node_ips.split(",")
+            node_id = os.getenv("PADDLE_TRAINER_ID")
+            assert node_id is not None, "PADDLE_TRAINER_ID should not be None"
+            node_id = int(node_id)
+            if args.node_ip != "127.0.0.1" and current_node_ip != args.node_ip:
+                warnings.warn(
+                    "Please NOTE: When using paddlecloud, current_node_ip is \
+automatically got from POD_IP. Your input node_ip: {} doesn't equals to \
+current_node_ip: {} from paddlecloud environment."
+                    .format(args.node_ip, current_node_ip))
+            if args.cluster_node_ips != "127.0.0.1" and args.cluster_node_ips != ",".join(
+                    node_ips):
+                warnings.warn(
+                    "Please NOTE: When using paddlecloud, cluster_node_ips is \
+automatically got from PADDLE_TRAINERS(multi nodes) or POD_IP(single node).\
+Your input cluster_node_ips: {} doesn't equals to IPs: {} from \
+paddlecloud environment.".format(args.cluster_node_ips, node_ips))
    num_nodes = len(node_ips)
    if args.selected_gpus is None:
@@ -164,10 +190,10 @@ def start_procs(args):
              ", node_ips:", node_ips, ", nranks:", nranks)
    current_env = copy.copy(default_env)
-    # paddle broadcast ncclUniqueId use socket, and
+    #paddle broadcast ncclUniqueId use socket, and
-    # proxy maybe make trainers unreachable, so delete them.
+    #proxy maybe make trainers unreachable, so delete them.
-    # if we set them to "", grpc will log error message "bad uri"
+    #if we set them to "", grpc will log error message "bad uri"
-    # so just delete them.
+    #so just delete them.
    current_env.pop("http_proxy", None)
    current_env.pop("https_proxy", None)

--- a/python/paddle/fluid/tests/unittests/multi_process.py
+++ b/python/paddle/fluid/tests/unittests/multi_process.py
@@ -20,14 +20,14 @@ def train():
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
    worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
    current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
-    worker_endpoints = worker_endpoints_env.split(",")
+    worker_endpoints = worker_endpoints_env
-    trainers_num = len(worker_endpoints)
+    trainers_num = len(worker_endpoints.split(','))
    name = "selected_gpus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"\
        .format(selected_gpus, worker_endpoints, trainers_num, current_endpoint,trainer_id)
    print(name)
-    with open("multi_process.check.log", "w") as f:
+    with open("multi_process.check_{}.log".format(trainer_id), "w") as f:
        f.write(name)

--- a/python/paddle/fluid/tests/unittests/test_launch.sh
+++ b/python/paddle/fluid/tests/unittests/test_launch.sh
 #!/bin/bash
-set -e
+set -ex
 # use default values
 python -m paddle.distributed.launch multi_process.py
-# use specified values
+# use paddlecloud
-cluster_node_ips="127.0.0.1"
+cluster_node_ips="10.0.0.1"
-node_ip="127.0.0.1"
+node_ip="10.0.0.1"
+export PADDLE_TRAINERS_NUM=2
+export POD_IP=127.0.0.1
+export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
+export PADDLE_TRAINER_ID=0
-distributed_args="--cluster_node_ips ${cluster_node_ips} --node_ip ${node_ip} --selected_gpus=0,1 --log_dir testlog"
+distributed_args="--use_paddlecloud True --cluster_node_ips ${cluster_node_ips} --node_ip ${node_ip} --selected_gpus=0,1 --log_dir testlog"
 python -m paddle.distributed.launch ${distributed_args} multi_process.py
-str1="selected_gpus:0 worker_endpoints:['127.0.0.1:6170', '127.0.0.1:6171'] trainers_num:2 current_endpoint:127.0.0.1:6170 trainer_id:0"
+str1="selected_gpus:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 trainers_num:4 current_endpoint:127.0.0.1:6170 trainer_id:0"
-str2="selected_gpus:1 worker_endpoints:['127.0.0.1:6170', '127.0.0.1:6171'] trainers_num:2 current_endpoint:127.0.0.1:6171 trainer_id:1"
+str2="selected_gpus:1 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 trainers_num:4 current_endpoint:127.0.0.1:6171 trainer_id:1"
-file="multi_process.check.log"
+file_0="multi_process.check_0.log"
+file_1="multi_process.check_1.log"
-if ! grep -q "$str1" "$file"; then
+echo "paddlecloud params test"
+if grep -q "$str1" "$file_0"; then
    echo "find trainer 0"
 else
    echo "not find trainer 0"
    exit -1
 fi
-if ! grep -q "$str2" "$file"; then
+if grep -q "$str2" "$file_1"; then
    echo "find trainer 1"
 else
-    echo "not find trainer 0"
+    echo "not find trainer 1"
    exit -1
 fi