Commit ad2bc0c3 authored by gongweibao, committed by Tao Luo

Fix a distribution bug and clean up some unneeded logs. (#22381)

Parent 7b0692a6
@@ -202,6 +202,16 @@ paddlecloud environment.".format(args.cluster_node_ips, node_ips))
     ]
     selected_gpus_num = len(selected_gpus)
 
+    if args.use_paddlecloud and num_nodes > 1:
+        cloud_paddle_port = os.getenv("PADDLE_PORT", "")
+        cloud_paddle_port_num = os.getenv("PADDLE_PORTS_NUM", "")
+        if cloud_paddle_port != "" and cloud_paddle_port_num != "":
+            cloud_paddle_port_num = int(cloud_paddle_port_num)
+            if cloud_paddle_port_num >= selected_gpus_num:
+                args.started_port = int(cloud_paddle_port)
+                logger.warning("Use Cloud specified port:{}.".format(
+                    cloud_paddle_port))
+
     trainers_endpoints = ""
     for ip in node_ips:
         for i in range(selected_gpus_num):
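In plain words: on a multi-node paddlecloud job, the launcher now prefers the cloud-assigned PADDLE_PORT over its built-in default starting port (6170 in the test expectations further below), but only when PADDLE_PORTS_NUM reserves at least one port per selected GPU. A minimal standalone sketch of that decision; the helper name pick_started_port and the example values are illustrative, not Paddle's actual API:

    import os

    def pick_started_port(default_port, use_paddlecloud, num_nodes, selected_gpus_num):
        # Prefer the cloud-assigned port range only when it is wide enough
        # to give every local GPU its own port; otherwise keep the default.
        if use_paddlecloud and num_nodes > 1:
            cloud_port = os.getenv("PADDLE_PORT", "")
            cloud_ports_num = os.getenv("PADDLE_PORTS_NUM", "")
            if cloud_port != "" and cloud_ports_num != "":
                if int(cloud_ports_num) >= selected_gpus_num:
                    return int(cloud_port)
        return default_port

    os.environ["PADDLE_PORT"] = "35019"
    os.environ["PADDLE_PORTS_NUM"] = "2"
    print(pick_started_port(6170, True, 2, 2))  # 35019: cloud range fits 2 GPUs
    print(pick_started_port(6170, True, 2, 4))  # 6170: range too narrow, default kept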
@@ -7,13 +7,13 @@ file(GLOB DIST_TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_dist_*.py")
 string(REPLACE ".py" "" DIST_TEST_OPS "${DIST_TEST_OPS}")
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_mnist)
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_se_resnext)
+list(APPEND DIST_TEST_OPS test_listen_and_serv_op)
 set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS})
 #remove distribute unittests.
 list(APPEND MIXED_DIST_TEST_OPS test_dgc_op)
 list(APPEND MIXED_DIST_TEST_OPS test_dgc_momentum_op)
 list(APPEND MIXED_DIST_TEST_OPS test_dgc_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_simple_dist_transpiler)
-list(APPEND MIXED_DIST_TEST_OPS test_listen_and_serv_op)
 list(APPEND MIXED_DIST_TEST_OPS test_nce_remote_table_op)
 list(APPEND MIXED_DIST_TEST_OPS test_recv_save_op)
 list(APPEND MIXED_DIST_TEST_OPS test_transpiler_ops)
@@ -288,7 +288,6 @@ if(WITH_DISTRIBUTE)
         list(REMOVE_ITEM DIST_TEST_OPS "test_dist_se_resnext_dgc")
     endif()
     if(NOT APPLE)
-        bash_test_modules(test_listen_and_serv_op MODULES test_listen_and_serv.sh)
         if(WITH_GPU)
             # NOTE. test_launch only work in gpu collective mode
             bash_test_modules(test_launch MODULES test_launch.sh)
@@ -297,7 +296,6 @@ if(WITH_DISTRIBUTE)
     set(dist_ut_port 1000)
     foreach(TEST_OP ${DIST_TEST_OPS})
-        message(STATUS "set dist_ut_port=${dist_ut_port} on ${TEST_OP}")
         bash_test_modules(${TEST_OP} MODULES dist_test.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}")
         MATH(EXPR dist_ut_port "${dist_ut_port}+50")
     endforeach(TEST_OP)
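This loop hands each distributed test a disjoint 50-port window through the PADDLE_DIST_UT_PORT environment variable, which is why a per-test status log adds little. A small sketch of the schedule it produces (the test names here are only illustrative):

    # Mirror of the CMake loop: start at 1000, advance by 50 per test.
    dist_ut_port = 1000
    for test_op in ["test_dist_mnist", "test_dist_word2vec", "test_listen_and_serv_op"]:
        print("{}: PADDLE_DIST_UT_PORT={}".format(test_op, dist_ut_port))
        dist_ut_port += 50
    # test_dist_mnist: PADDLE_DIST_UT_PORT=1000
    # test_dist_word2vec: PADDLE_DIST_UT_PORT=1050
    # test_listen_and_serv_op: PADDLE_DIST_UT_PORT=1100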
@@ -11,12 +11,14 @@ export POD_IP=127.0.0.1
 export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
 export PADDLE_TRAINER_ID=0
-distributed_args="--use_paddlecloud --cluster_node_ips=${cluster_node_ips} --node_ip=${node_ip}
-    --selected_gpus=0,1 --log_dir=testlog"
+export PADDLE_PORT=35019
+export PADDLE_PORTS_NUM=2
+
+distributed_args="--use_paddlecloud --cluster_node_ips=${cluster_node_ips} --node_ip=${node_ip} --selected_gpus=0,1 --log_dir=testlog"
 CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch ${distributed_args} multi_process.py
 
-str1="selected_gpus:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 trainers_num:4 current_endpoint:127.0.0.1:6170 trainer_id:0"
-str2="selected_gpus:1 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 trainers_num:4 current_endpoint:127.0.0.1:6171 trainer_id:1"
+str1="selected_gpus:0 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35019 trainer_id:0"
+str2="selected_gpus:1 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35020 trainer_id:1"
 
 file_0="multi_process.check_0.log"
 file_1="multi_process.check_1.log"
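The updated str1/str2 follow directly from the launcher's endpoint layout: each node gets consecutive ports starting at started_port, which PADDLE_PORT now pins to 35019. A standalone sketch of that computation, mirroring the trainers_endpoints loop from the first hunk (the loop body is inferred from the expected strings; the values come from this script):

    node_ips = ["127.0.0.1", "127.0.0.2"]   # PADDLE_TRAINERS
    started_port = 35019                    # PADDLE_PORT
    selected_gpus_num = 2                   # --selected_gpus=0,1

    # One endpoint per (node, local GPU): the port advances with the GPU index.
    endpoints = []
    for ip in node_ips:
        for i in range(selected_gpus_num):
            endpoints.append("{}:{}".format(ip, started_port + i))
    print(",".join(endpoints))
    # 127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020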
@@ -43,6 +45,9 @@ if [ -f $file_1 ]; then
     rm $file_1
 fi
 
+unset PADDLE_PORT
+unset PADDLE_PORTS_NUM
+
 echo ""
 echo "paddle.distributed.launch async poll process test"
 if ! CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch ${distributed_args} multi_process.py abort; then