Unverified commit 4b40edf3, authored by gongweibao, committed by GitHub

Use available ports instead of static ports. (#22553)

Parent ad9c8f6d
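This commit teaches paddle.distributed.launch to pick free trainer ports at runtime instead of always using a fixed 6170-based range. The core of the change is the standard ephemeral-port trick: binding a TCP socket to port 0 makes the OS assign an unused port, and getsockname() reports which one was chosen. A minimal standalone sketch of that technique (pick_free_port is an illustrative name, not part of the patch):

    import socket
    from contextlib import closing

    def pick_free_port():
        # Binding to port 0 asks the OS for any unused ephemeral port.
        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
            s.bind(('', 0))
            return s.getsockname()[1]

    print(pick_free_port())  # e.g. 49731; varies per run

Note the inherent race in this approach: the port is released as soon as the probing socket closes, so another process could grab it before the trainer binds. The patch accepts that risk for single-node runs and keeps the static port in all other cases.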
@@ -46,6 +46,8 @@ import six
 import copy
 from argparse import ArgumentParser, REMAINDER
 import paddle.fluid as fluid
+from contextlib import closing
+import socket
 
 logger = logging.getLogger()
 logger.setLevel(logging.INFO)
@@ -63,6 +65,32 @@ def _print_arguments(args):
     print("------------------------------------------------")
 
 
+def find_free_ports(num):
+    def __free_port():
+        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
+            s.bind(('', 0))
+            return s.getsockname()[1]
+
+    port_set = set()
+    step = 0
+    while True:
+        port = __free_port()
+
+        if port not in port_set:
+            port_set.add(port)
+
+        if len(port_set) >= num:
+            return port_set
+
+        step += 1
+        if step > 100:
+            print(
+                "can't find available port and use the specified static port now!"
+            )
+            return None
+
+    return None
+
+
 def _parse_args():
     """
     Helper function parsing the command line options
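find_free_ports returns a set (or None after more than 100 probes), which is why the caller below converts it to a list before indexing; since Python sets are unordered, free_ports[0] is simply some free port, not necessarily the smallest one. A quick illustrative use, with port numbers that vary per run:

    ports = find_free_ports(2)      # e.g. {50212, 50213}
    if ports is not None:
        ports = list(ports)         # make it indexable, as launch.py does below
        print(ports[0], ports[1])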
@@ -101,7 +129,7 @@ POD_IP (current node ip address, not needed for local training)
     parser.add_argument(
         "--started_port",
         type=int,
-        default=6170,
+        default=None,
         help="The trainer's started port on a single node")
 
     parser.add_argument(
@@ -212,12 +240,29 @@ paddlecloud environment.".format(args.cluster_node_ips, node_ips))
             logger.warning("Use Cloud specified port:{}.".format(
                 cloud_paddle_port))
 
+    free_ports = None
+    if not args.use_paddlecloud and num_nodes <= 1 and args.started_port is None:
+        free_ports = find_free_ports(selected_gpus_num)
+        if free_ports is not None:
+            free_ports = list(free_ports)
+            args.started_port = free_ports[0]
+
+    if args.started_port is None:
+        args.started_port = 6170
+
+    if free_ports is None:
+        free_ports = [
+            x
+            for x in range(args.started_port, args.started_port +
+                           selected_gpus_num)
+        ]
+
     trainers_endpoints = ""
     for ip in node_ips:
-        for i in range(selected_gpus_num):
+        for i in range(0, selected_gpus_num):
             if trainers_endpoints != "":
                 trainers_endpoints += ","
-            trainers_endpoints += "%s:%d" % (ip, args.started_port + i)
+            trainers_endpoints += "%s:%d" % (ip, free_ports[i])
 
     nranks = num_nodes * selected_gpus_num
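For a single node with two selected GPUs, the loop above builds a comma-separated endpoint list. A self-contained rendering of the same logic with assumed values:

    node_ips = ["127.0.0.1"]     # assumed single-node setup
    free_ports = [6170, 6171]    # assumed; real runs use OS-assigned ports
    trainers_endpoints = ",".join(
        "%s:%d" % (ip, free_ports[i])
        for ip in node_ips for i in range(len(free_ports)))
    print(trainers_endpoints)    # 127.0.0.1:6170,127.0.0.1:6171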
@@ -244,7 +289,7 @@ paddlecloud environment.".format(args.cluster_node_ips, node_ips))
                 "FLAGS_selected_gpus": "%s" % selected_gpus[i],
                 "PADDLE_TRAINER_ID": "%d" % rank,
                 "PADDLE_CURRENT_ENDPOINT":
-                "%s:%d" % (current_node_ip, args.started_port + i),
+                "%s:%d" % (current_node_ip, free_ports[i]),
                 "PADDLE_TRAINERS_NUM": "%d" % nranks,
                 "PADDLE_TRAINER_ENDPOINTS": trainers_endpoints
             })
......
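Putting the hunks together, each spawned trainer ends up with an environment along these lines (values illustrative, assuming free_ports = [6170, 6171] on one node):

    # What trainer rank 0 would see under the assumptions above:
    env_rank0 = {
        "FLAGS_selected_gpus": "0",
        "PADDLE_TRAINER_ID": "0",
        "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:6170",
        "PADDLE_TRAINERS_NUM": "2",
        "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:6170,127.0.0.1:6171",
    }

The new test helper below reads exactly these variables.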
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import time
def train():
    # Rank and endpoints are injected by paddle.distributed.launch.
    selected_gpus = os.getenv("FLAGS_selected_gpus")
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
    worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
    current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
    worker_endpoints = worker_endpoints_env
    trainers_num = len(worker_endpoints.split(','))

    name = "worker_endpoints:{}" \
        .format(worker_endpoints)

    print(name)

    # Record the endpoint list so the shell test can verify it.
    file_name = os.getenv("PADDLE_LAUNCH_LOG")
    if file_name is None or file_name == "":
        print("can't find PADDLE_LAUNCH_LOG")
        sys.exit(1)

    with open("{}_{}.log".format(file_name, trainer_id), "w") as f:
        f.write(name)


if __name__ == '__main__':
    train()
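Each trainer writes worker_endpoints:<endpoint list> into <PADDLE_LAUNCH_LOG>_<trainer_id>.log, which the shell test below inspects. A purely illustrative check of what trainer 0 produces under that test's settings (hypothetical, not part of the test script; actual ports vary per run):

    with open("test_launch_filelock_0_0.log") as f:
        assert f.read().startswith("worker_endpoints:127.0.0.1:")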
@@ -69,3 +69,13 @@ else
     echo "trainer 1 not terminate as planned"
     exit -1
 fi
+
+# test for random ports
+file_0_0="test_launch_filelock_0_0.log"
+file_1_0="test_launch_filelock_1_0.log"
+rm -rf $file_0_0 $file_1_0
+
+distributed_args="--selected_gpus=0,1 --log_dir=testlog"
+export PADDLE_LAUNCH_LOG="test_launch_filelock_0"
+CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch ${distributed_args} find_ports.py
+str_0="worker_endpoints:127.0.0.1:6070,127.0.0.1:6071"