Unverified commit 4b40edf3, authored by gongweibao, committed by GitHub

Use available ports instead of static ports. (#22553)

Parent ad9c8f6d
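With this change, a single-node multi-GPU job no longer needs a hand-picked port range: when --started_port is not given (and the job is not running on PaddleCloud), the launcher asks the OS for free ports, and only falls back to the previous static default of 6170 if none can be found. A hypothetical invocation (train.py is a placeholder for the user's training script):

    python -m paddle.distributed.launch --selected_gpus=0,1 train.py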
@@ -46,6 +46,8 @@ import six
import copy
from argparse import ArgumentParser, REMAINDER
import paddle.fluid as fluid
from contextlib import closing
import socket
logger = logging.getLogger()
logger.setLevel(logging.INFO)
@@ -63,6 +65,32 @@ def _print_arguments(args):
print("------------------------------------------------")
def find_free_ports(num):
def __free_port():
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
s.bind(('', 0))
return s.getsockname()[1]
port_set = set()
step = 0
while True:
port = __free_port()
if port not in port_set:
port_set.add(port)
if len(port_set) >= num:
return port_set
step += 1
if step > 100:
print(
"can't find avilable port and use the specified static port now!"
)
return None
return None
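# find_free_ports leans on the standard trick of binding a TCP socket to port 0 so the
# kernel assigns an unused ephemeral port; since the socket is closed right away, a small
# race window remains before the trainer rebinds the port. A minimal sketch of standalone
# use (port numbers are illustrative and depend on the OS):
#     ports = find_free_ports(2)
#     if ports is not None:
#         print(sorted(ports))  # e.g. [51234, 51240]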
def _parse_args():
"""
Helper function parsing the command line options
@@ -101,7 +129,7 @@ POD_IP (current node ip address, not needed for local training)
parser.add_argument(
"--started_port",
type=int,
default=6170,
default=None,
help="The trainer's started port on a single node")
parser.add_argument(
@@ -212,12 +240,29 @@ paddlecloud environment.".format(args.cluster_node_ips, node_ips))
logger.warning("Use Cloud specified port:{}.".format(
cloud_paddle_port))
free_ports = None
if not args.use_paddlecloud and num_nodes <= 1 and args.started_port is None:
free_ports = find_free_ports(selected_gpus_num)
if free_ports is not None:
free_ports = list(free_ports)
args.started_port = free_ports[0]
if args.started_port is None:
args.started_port = 6170
if free_ports is None:
free_ports = [
x
for x in range(args.started_port, args.started_port +
selected_gpus_num)
]
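# If no dynamic ports were found (or --started_port was given), the old static behaviour
# is kept: ports started_port .. started_port + selected_gpus_num - 1, e.g. 6170 and 6171
# for two selected GPUs.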
trainers_endpoints = ""
for ip in node_ips:
for i in range(selected_gpus_num):
for i in range(0, selected_gpus_num):
if trainers_endpoints != "":
trainers_endpoints += ","
trainers_endpoints += "%s:%d" % (ip, args.started_port + i)
trainers_endpoints += "%s:%d" % (ip, free_ports[i])
nranks = num_nodes * selected_gpus_num
@@ -244,7 +289,7 @@ paddlecloud environment.".format(args.cluster_node_ips, node_ips))
"FLAGS_selected_gpus": "%s" % selected_gpus[i],
"PADDLE_TRAINER_ID": "%d" % rank,
"PADDLE_CURRENT_ENDPOINT":
"%s:%d" % (current_node_ip, args.started_port + i),
"%s:%d" % (current_node_ip, free_ports[i]),
"PADDLE_TRAINERS_NUM": "%d" % nranks,
"PADDLE_TRAINER_ENDPOINTS": trainers_endpoints
})
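# Illustrative only: on a single node with two selected GPUs and dynamically found ports
# 52001 and 52002, trainer 1 would receive roughly
#     FLAGS_selected_gpus=1
#     PADDLE_TRAINER_ID=1
#     PADDLE_CURRENT_ENDPOINT=127.0.0.1:52002
#     PADDLE_TRAINERS_NUM=2
#     PADDLE_TRAINER_ENDPOINTS=127.0.0.1:52001,127.0.0.1:52002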
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import time
def train():
selected_gpus = os.getenv("FLAGS_selected_gpus")
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
worker_endpoints = worker_endpoints_env
trainers_num = len(worker_endpoints.split(','))
name = "worker_endpoints:{}" \
.format(worker_endpoints)
print(name)
file_name = os.getenv("PADDLE_LAUNCH_LOG")
if file_name is None or file_name == "":
print("can't find PADDLE_LAUNCH_LOG")
sys.exit(1)
with open("{}_{}.log".format(file_name, trainer_id), "w") as f:
f.write(name)
if __name__ == '__main__':
train()
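# When run under paddle.distributed.launch, each spawned trainer writes its worker
# endpoints (the PADDLE_TRAINER_ENDPOINTS value) to "<PADDLE_LAUNCH_LOG>_<trainer_id>.log",
# e.g. test_launch_filelock_0_0.log for trainer 0, which the shell test below uses.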
@@ -69,3 +69,13 @@ else
echo "trainer 1 not terminate as planned"
exit -1
fi
# test for random ports
file_0_0="test_launch_filelock_0_0.log"
file_1_0="test_launch_filelock_1_0.log"
rm -rf $file_0_0 $file_1_0
distributed_args="--selected_gpus=0,1 --log_dir=testlog"
export PADDLE_LAUNCH_LOG="test_launch_filelock_0"
CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch ${distributed_args} find_ports.py
str_0="worker_endpoints:127.0.0.1:6070,127.0.0.1:6071"