Use the PaddleCloud-assigned port range in paddle.distributed.launch.

launch.py: when --use_paddlecloud is set and num_nodes > 1, read PADDLE_PORT
and PADDLE_PORTS_NUM from the environment; if both are non-empty and
PADDLE_PORTS_NUM >= selected_gpus_num, PADDLE_PORT becomes args.started_port.
CMakeLists.txt: test_listen_and_serv_op is appended to DIST_TEST_OPS instead of
MIXED_DIST_TEST_OPS, its dedicated bash_test_modules() registration is removed,
and the per-test "set dist_ut_port=..." STATUS message is dropped.
test_launch.sh: exports PADDLE_PORT=35019 / PADDLE_PORTS_NUM=2 for the
paddlecloud case, expects endpoints on ports 35019/35020, and unsets both
variables before the async poll test.

NOTE(review): this copy was rebuilt from a whitespace-collapsed rendering.
Indentation and the placement of blank context lines are reconstructed (they
agree with the hunk line counts) -- confirm against the target tree, or apply
with `git apply --ignore-whitespace`.

diff --git a/python/paddle/distributed/launch.py b/python/paddle/distributed/launch.py
index 1e99da4615a5fb643b1c534f2ebfc176be0aa07f..b9d46f74fd9d18055886dec6bd70d2be20e37aed 100644
--- a/python/paddle/distributed/launch.py
+++ b/python/paddle/distributed/launch.py
@@ -202,6 +202,16 @@ paddlecloud environment.".format(args.cluster_node_ips, node_ips))
         ]
     selected_gpus_num = len(selected_gpus)
 
+    if args.use_paddlecloud and num_nodes > 1:
+        cloud_paddle_port = os.getenv("PADDLE_PORT", "")
+        cloud_paddle_port_num = os.getenv("PADDLE_PORTS_NUM", "")
+        if cloud_paddle_port != "" and cloud_paddle_port_num != "":
+            cloud_paddle_port_num = int(cloud_paddle_port_num)
+            if cloud_paddle_port_num >= selected_gpus_num:
+                args.started_port = int(cloud_paddle_port)
+                logger.warning("Use Cloud specified port:{}.".format(
+                    cloud_paddle_port))
+
     trainers_endpoints = ""
     for ip in node_ips:
         for i in range(selected_gpus_num):
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 7460c1e1dcd4012de86673df50be60eb528d2ca6..e7ddc7a7b8343f83c227daa4a6295932c4517e04 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -7,13 +7,13 @@ file(GLOB DIST_TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_dist_*.py")
 string(REPLACE ".py" "" DIST_TEST_OPS "${DIST_TEST_OPS}")
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_mnist)
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_se_resnext)
+list(APPEND DIST_TEST_OPS test_listen_and_serv_op)
 set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS})
 #remove distribute unittests.
 list(APPEND MIXED_DIST_TEST_OPS test_dgc_op)
 list(APPEND MIXED_DIST_TEST_OPS test_dgc_momentum_op)
 list(APPEND MIXED_DIST_TEST_OPS test_dgc_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_simple_dist_transpiler)
-list(APPEND MIXED_DIST_TEST_OPS test_listen_and_serv_op)
 list(APPEND MIXED_DIST_TEST_OPS test_nce_remote_table_op)
 list(APPEND MIXED_DIST_TEST_OPS test_recv_save_op)
 list(APPEND MIXED_DIST_TEST_OPS test_transpiler_ops)
@@ -288,7 +288,6 @@ if(WITH_DISTRIBUTE)
         list(REMOVE_ITEM DIST_TEST_OPS "test_dist_se_resnext_dgc")
     endif()
     if(NOT APPLE)
-        bash_test_modules(test_listen_and_serv_op MODULES test_listen_and_serv.sh)
         if(WITH_GPU)
             # NOTE. test_launch only work in gpu collective mode
             bash_test_modules(test_launch MODULES test_launch.sh)
@@ -297,7 +296,6 @@ if(WITH_DISTRIBUTE)
 
         set(dist_ut_port 1000)
         foreach(TEST_OP ${DIST_TEST_OPS})
-            message(STATUS "set dist_ut_port=${dist_ut_port} on ${TEST_OP}")
             bash_test_modules(${TEST_OP} MODULES dist_test.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}")
             MATH(EXPR dist_ut_port "${dist_ut_port}+50")
         endforeach(TEST_OP)
diff --git a/python/paddle/fluid/tests/unittests/test_launch.sh b/python/paddle/fluid/tests/unittests/test_launch.sh
index fae744d74326efb740906b2aadb8857fdf7e44d5..cbe008e48dbb35e09f00019410f7952070ae58e0 100644
--- a/python/paddle/fluid/tests/unittests/test_launch.sh
+++ b/python/paddle/fluid/tests/unittests/test_launch.sh
@@ -11,12 +11,14 @@ export POD_IP=127.0.0.1
 export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
 export PADDLE_TRAINER_ID=0
 
-distributed_args="--use_paddlecloud --cluster_node_ips=${cluster_node_ips} --node_ip=${node_ip}
---selected_gpus=0,1 --log_dir=testlog"
+export PADDLE_PORT=35019
+export PADDLE_PORTS_NUM=2
+
+distributed_args="--use_paddlecloud --cluster_node_ips=${cluster_node_ips} --node_ip=${node_ip} --selected_gpus=0,1 --log_dir=testlog"
 CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch ${distributed_args} multi_process.py
 
-str1="selected_gpus:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 trainers_num:4 current_endpoint:127.0.0.1:6170 trainer_id:0"
-str2="selected_gpus:1 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 trainers_num:4 current_endpoint:127.0.0.1:6171 trainer_id:1"
+str1="selected_gpus:0 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35019 trainer_id:0"
+str2="selected_gpus:1 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35020 trainer_id:1"
 file_0="multi_process.check_0.log"
 file_1="multi_process.check_1.log"
 
@@ -43,6 +45,9 @@ if [ -f $file_1 ]; then
     rm $file_1
 fi
 
+unset PADDLE_PORT
+unset PADDLE_PORTS_NUM
+
 echo ""
 echo "paddle.distributed.launch async poll process test"
 if ! CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch ${distributed_args} multi_process.py abort; then