Unverified commit 4b40edf3, authored by gongweibao, committed by GitHub

Use available ports instead of static ports. (#22553)

Parent ad9c8f6d
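With this change, a single-node multi-GPU job no longer needs a hand-picked port range: when --started_port is not given (and the job is not running on PaddleCloud), the launcher asks the OS for free ports, and only falls back to the previous static default of 6170 if none can be found. A hypothetical invocation (train.py is a placeholder for the user's training script):

    python -m paddle.distributed.launch --selected_gpus=0,1 train.py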
@@ -46,6 +46,8 @@ import six
import copy
from argparse import ArgumentParser, REMAINDER
import paddle.fluid as fluid
from contextlib import closing
import socket
logger = logging.getLogger()
logger.setLevel(logging.INFO)
@@ -63,6 +65,32 @@ def _print_arguments(args):
print("------------------------------------------------")
def find_free_ports(num):
def __free_port():
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
s.bind(('', 0))
return s.getsockname()[1]
port_set = set()
step = 0
while True:
port = __free_port()
if port not in port_set:
port_set.add(port)
if len(port_set) >= num:
return port_set
step += 1
if step > 100:
print(
"can't find avilable port and use the specified static port now!"
)
return None
return None
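# find_free_ports leans on the standard trick of binding a TCP socket to port 0 so the
# kernel assigns an unused ephemeral port; since the socket is closed right away, a small
# race window remains before the trainer rebinds the port. A minimal sketch of standalone
# use (port numbers are illustrative and depend on the OS):
#     ports = find_free_ports(2)
#     if ports is not None:
#         print(sorted(ports))  # e.g. [51234, 51240]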
def _parse_args():
"""
Helper function parsing the command line options
@@ -101,7 +129,7 @@ POD_IP (current node ip address, not needed for local training)
parser.add_argument(
"--started_port",
type=int,
default=6170,
default=None,
help="The trainer's started port on a single node")
parser.add_argument(
@@ -212,12 +240,29 @@ paddlecloud environment.".format(args.cluster_node_ips, node_ips))
logger.warning("Use Cloud specified port:{}.".format(
cloud_paddle_port))
free_ports = None
if not args.use_paddlecloud and num_nodes <= 1 and args.started_port is None:
free_ports = find_free_ports(selected_gpus_num)
if free_ports is not None:
free_ports = list(free_ports)
args.started_port = free_ports[0]
if args.started_port is None:
args.started_port = 6170
if free_ports is None:
free_ports = [
x
for x in range(args.started_port, args.started_port +
selected_gpus_num)
]
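# If no dynamic ports were found (or --started_port was given), the old static behaviour
# is kept: ports started_port .. started_port + selected_gpus_num - 1, e.g. 6170 and 6171
# for two selected GPUs.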
trainers_endpoints = ""
for ip in node_ips:
for i in range(selected_gpus_num):
for i in range(0, selected_gpus_num):
if trainers_endpoints != "":
trainers_endpoints += ","
trainers_endpoints += "%s:%d" % (ip, args.started_port + i)
trainers_endpoints += "%s:%d" % (ip, free_ports[i])
nranks = num_nodes * selected_gpus_num
@@ -244,7 +289,7 @@ paddlecloud environment.".format(args.cluster_node_ips, node_ips))
"FLAGS_selected_gpus": "%s" % selected_gpus[i],
"PADDLE_TRAINER_ID": "%d" % rank,
"PADDLE_CURRENT_ENDPOINT":
"%s:%d" % (current_node_ip, args.started_port + i),
"%s:%d" % (current_node_ip, free_ports[i]),
"PADDLE_TRAINERS_NUM": "%d" % nranks,
"PADDLE_TRAINER_ENDPOINTS": trainers_endpoints
})
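# Illustrative only: on a single node with two selected GPUs and dynamically found ports
# 52001 and 52002, trainer 1 would receive roughly
#     FLAGS_selected_gpus=1
#     PADDLE_TRAINER_ID=1
#     PADDLE_CURRENT_ENDPOINT=127.0.0.1:52002
#     PADDLE_TRAINERS_NUM=2
#     PADDLE_TRAINER_ENDPOINTS=127.0.0.1:52001,127.0.0.1:52002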
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import time
def train():
selected_gpus = os.getenv("FLAGS_selected_gpus")
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
worker_endpoints = worker_endpoints_env
trainers_num = len(worker_endpoints.split(','))
name = "worker_endpoints:{}" \
.format(worker_endpoints)
print(name)
file_name = os.getenv("PADDLE_LAUNCH_LOG")
if file_name is None or file_name == "":
print("can't find PADDLE_LAUNCH_LOG")
sys.exit(1)
with open("{}_{}.log".format(file_name, trainer_id), "w") as f:
f.write(name)
if __name__ == '__main__':
train()
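# When run under paddle.distributed.launch, each spawned trainer writes its worker
# endpoints (the PADDLE_TRAINER_ENDPOINTS value) to "<PADDLE_LAUNCH_LOG>_<trainer_id>.log",
# e.g. test_launch_filelock_0_0.log for trainer 0, which the shell test below uses.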
@@ -69,3 +69,13 @@ else
echo "trainer 1 not terminate as planned"
exit -1
fi
# test for random ports
file_0_0="test_launch_filelock_0_0.log"
file_1_0="test_launch_filelock_1_0.log"
rm -rf $file_0_0 $file_1_0
distributed_args="--selected_gpus=0,1 --log_dir=testlog"
export PADDLE_LAUNCH_LOG="test_launch_filelock_0"
CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch ${distributed_args} find_ports.py
str_0="worker_endpoints:127.0.0.1:6070,127.0.0.1:6071"