From 34eb27a1d6c0412aae014d917f0dde2292d730ed Mon Sep 17 00:00:00 2001
From: danleifeng <52735331+danleifeng@users.noreply.github.com>
Date: Tue, 11 Aug 2020 14:11:44 +0800
Subject: [PATCH] ps worker-ports are optional for users for fleetrun command;
 test=develop (#26090)

---
 python/paddle/fleet/launch.py                 | 37 ++++++++++++++-----
 .../tests/unittests/test_fleet_launch.sh      | 10 ++++-
 2 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/python/paddle/fleet/launch.py b/python/paddle/fleet/launch.py
index 8b08c916c84..a6f71f2ae94 100644
--- a/python/paddle/fleet/launch.py
+++ b/python/paddle/fleet/launch.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-paddle.distributed.launch is a module that spawns multiple distributed
+fleetrun is a module that spawns multiple distributed
 process on each training node for gpu training and cpu training.
 Usage:
     In both of single node training or multiple node training, this module
@@ -31,16 +31,26 @@ launch a process on each of the given gpu card or cpu machine.
             your_training_py (arg1 arg2 and all others)
     CPU training:
     1. for single node training with multi servers and workers:
-        fleetrun --server_num=1 --worker_num=4 your_training_py (arg1 arg2 and all others)
+        fleetrun --server_num=2 --worker_num=2 your_training_py (arg1 arg2 and all others)
     2. for multiple node training such as two node:192.168.0.16, 192.168.0.17 \
-        with 2 servers and 4 workers.
+        with 2 servers and 4 workers.
         on 192.168.0.16:
-            fleetrun --servers="192.168.0.16:6170,192.168.0.17:6171" \
-                --workers="192.168.0.16:6172,192.168.0.17:6173,192.168.0.16:6174,192.168.0.17:6175" \
+            fleetrun --servers="192.168.0.16:6170,192.168.0.17:6170" \
+                --workers="192.168.0.16,192.168.0.17,192.168.0.16,192.168.0.17" \
                 your_training_py (arg1 arg2 and all others)
         on 192.168.0.17:
             fleetrun --servers="192.168.0.16:6170,192.168.0.17:6171" \
-                --workers="192.168.0.16:6172,192.168.0.17:6173,192.168.0.16:6174,192.168.0.17:6175" \
+                --workers="192.168.0.16,192.168.0.17,192.168.0.16,192.168.0.17" \
+                your_training_py (arg1 arg2 and all others)
+    3. use gloo backend for multiple node training such as two node:192.168.0.16, 192.168.0.17 \
+        with 2 servers and 4 workers. (workers should set port)
+        on 192.168.0.16:
+            fleetrun --servers="192.168.0.16:6170,192.168.0.17:6170" \
+                --workers="192.168.0.16:6171,192.168.0.17:6171,192.168.0.16:6172,192.168.0.17:6172" \
+                your_training_py (arg1 arg2 and all others)
+        on 192.168.0.17:
+            fleetrun --servers="192.168.0.16:6170,192.168.0.17:6170" \
+                --workers="192.168.0.16:6171,192.168.0.17:6171,192.168.0.16:6172,192.168.0.17:6172" \
+                your_training_py (arg1 arg2 and all others)
 """
@@ -215,6 +225,7 @@ def launch_collective(args):
 
 def launch_ps(args):
     ports = None
+    start_port = 6170
     if args.server_num:
         server_num = args.server_num
         ports = get_ports(server_num, 0)
@@ -240,11 +251,19 @@ def launch_ps(args):
         worker_endpoints_ips = [
             x.strip().split(":")[0] for x in worker_endpoints.split(",")
         ]
-        worker_endpoints_port = [
-            x.strip().split(":")[1] for x in worker_endpoints.split(",")
-        ]
         worker_num = len(worker_endpoints_ips)
         node_ips = list(set(server_endpoints_ips + worker_endpoints_ips))
+        worker_endpoints_len = [
+            len(x.strip().split(":")) for x in worker_endpoints.split(",")
+        ]
+        if 1 in worker_endpoints_len:
+            # if no port value in worker_endpoints, will set default port values.
+            worker_endpoints_port = range(start_port + server_num,
+                                          start_port + server_num + worker_num, 1)
+        else:
+            worker_endpoints_port = [
+                x.strip().split(":")[1] for x in worker_endpoints.split(",")
+            ]
 
     # local train
     if len(set(node_ips)) == 1:
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_launch.sh b/python/paddle/fluid/tests/unittests/test_fleet_launch.sh
index ebe99ffb635..c5edc969634 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_launch.sh
+++ b/python/paddle/fluid/tests/unittests/test_fleet_launch.sh
@@ -11,7 +11,15 @@ function test_launch_ps(){
         exit -1
     fi
 
-    fleetrun --servers="120.0.0.1:6780,120.0.0.1:6781" --workers="120.0.0.1:6782,120.0.0.1:6783" fleet_ps_training.py 2> ut.elog
+    fleetrun --servers="127.0.0.1:6780,127.0.0.1:6781" --workers="127.0.0.1:6782,127.0.0.1:6783" fleet_ps_training.py 2> ut.elog
+    if grep -q "server are killed" ut.elog; then
+        echo "test pserver launch succeed"
+    else
+        echo "test pserver launch failed"
+        exit -1
+    fi
+
+    fleetrun --servers="127.0.0.1:6780,127.0.0.1:6781" --workers="127.0.0.1,127.0.0.1" fleet_ps_training.py 2> ut.elog
     if grep -q "server are killed" ut.elog; then
         echo "test pserver launch succeed"
     else
-- 
GitLab