diff --git a/python/paddle/distributed/launch.py b/python/paddle/distributed/launch.py index c4be745d524702a635f17ad6126953220a33650e..cd27b05b63f52d6cd381768387a87f649ebd9bac 100644 --- a/python/paddle/distributed/launch.py +++ b/python/paddle/distributed/launch.py @@ -46,6 +46,8 @@ import six import copy from argparse import ArgumentParser, REMAINDER import paddle.fluid as fluid +from contextlib import closing +import socket logger = logging.getLogger() logger.setLevel(logging.INFO) @@ -63,6 +65,32 @@ def _print_arguments(args): print("------------------------------------------------") +def find_free_ports(num): + def __free_port(): + with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: + s.bind(('', 0)) + return s.getsockname()[1] + + port_set = set() + step = 0 + while True: + port = __free_port() + if port not in port_set: + port_set.add(port) + + if len(port_set) >= num: + return port_set + + step += 1 + if step > 100: + print( + "can't find avilable port and use the specified static port now!" + ) + return None + + return None + + def _parse_args(): """ Helper function parsing the command line options @@ -101,7 +129,7 @@ POD_IP (current node ip address, not needed for local training) parser.add_argument( "--started_port", type=int, - default=6170, + default=None, help="The trainer's started port on a single node") parser.add_argument( @@ -212,12 +240,29 @@ paddlecloud environment.".format(args.cluster_node_ips, node_ips)) logger.warning("Use Cloud specified port:{}.".format( cloud_paddle_port)) + free_ports = None + if not args.use_paddlecloud and num_nodes <= 1 and args.started_port is None: + free_ports = find_free_ports(selected_gpus_num) + if free_ports is not None: + free_ports = list(free_ports) + args.started_port = free_ports[0] + + if args.started_port is None: + args.started_port = 6170 + + if free_ports is None: + free_ports = [ + x + for x in range(args.started_port, args.started_port + + selected_gpus_num) + ] + trainers_endpoints = "" for ip in node_ips: - for i in range(selected_gpus_num): + for i in range(0, selected_gpus_num): if trainers_endpoints != "": trainers_endpoints += "," - trainers_endpoints += "%s:%d" % (ip, args.started_port + i) + trainers_endpoints += "%s:%d" % (ip, free_ports[i]) nranks = num_nodes * selected_gpus_num @@ -244,7 +289,7 @@ paddlecloud environment.".format(args.cluster_node_ips, node_ips)) "FLAGS_selected_gpus": "%s" % selected_gpus[i], "PADDLE_TRAINER_ID": "%d" % rank, "PADDLE_CURRENT_ENDPOINT": - "%s:%d" % (current_node_ip, args.started_port + i), + "%s:%d" % (current_node_ip, free_ports[i]), "PADDLE_TRAINERS_NUM": "%d" % nranks, "PADDLE_TRAINER_ENDPOINTS": trainers_endpoints }) diff --git a/python/paddle/fluid/tests/unittests/find_ports.py b/python/paddle/fluid/tests/unittests/find_ports.py new file mode 100644 index 0000000000000000000000000000000000000000..b467a49213dd02b3160077ec63e79bda90094209 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/find_ports.py @@ -0,0 +1,41 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import time + + +def train(): + selected_gpus = os.getenv("FLAGS_selected_gpus") + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS") + current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") + worker_endpoints = worker_endpoints_env + trainers_num = len(worker_endpoints.split(',')) + + name = "worker_endpoints:{}" \ + .format(worker_endpoints) + + print(name) + file_name = os.getenv("PADDLE_LAUNCH_LOG") + if file_name is None or file_name == "": + print("can't find PADDLE_LAUNCH_LOG") + sys.exit(1) + with open("{}_{}.log".format(file_name, trainer_id), "w") as f: + f.write(name) + + +if __name__ == '__main__': + train() diff --git a/python/paddle/fluid/tests/unittests/test_launch.sh b/python/paddle/fluid/tests/unittests/test_launch.sh index cbe008e48dbb35e09f00019410f7952070ae58e0..7918c56ca936c77e2b2e00550e6cc7a8c94ffabe 100644 --- a/python/paddle/fluid/tests/unittests/test_launch.sh +++ b/python/paddle/fluid/tests/unittests/test_launch.sh @@ -69,3 +69,13 @@ else echo "trainer 1 not terminate as planned" exit -1 fi + +#test for random ports +file_0_0="test_launch_filelock_0_0.log" +file_1_0="test_launch_filelock_1_0.log" +rm -rf $file_0_0 $file_0_1 + +distributed_args="--selected_gpus=0,1 --log_dir=testlog" +export PADDLE_LAUNCH_LOG="test_launch_filelock_0" +CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch ${distributed_args} find_ports.py +str_0="worker_endpoints:127.0.0.1:6070,127.0.0.1:6071"