Commit ad2bc0c3, authored by gongweibao, committed by Tao Luo

Fix a distribution bug and clean up some unneeded logs. (#22381)

Parent 7b0692a6
...
@@ -202,6 +202,16 @@ paddlecloud environment.".format(args.cluster_node_ips, node_ips))
     ]
     selected_gpus_num = len(selected_gpus)
 
+    if args.use_paddlecloud and num_nodes > 1:
+        cloud_paddle_port = os.getenv("PADDLE_PORT", "")
+        cloud_paddle_port_num = os.getenv("PADDLE_PORTS_NUM", "")
+        if cloud_paddle_port != "" and cloud_paddle_port_num != "":
+            cloud_paddle_port_num = int(cloud_paddle_port_num)
+            if cloud_paddle_port_num >= selected_gpus_num:
+                args.started_port = int(cloud_paddle_port)
+                logger.warning("Use Cloud specified port:{}.".format(
+                    cloud_paddle_port))
+
     trainers_endpoints = ""
     for ip in node_ips:
         for i in range(selected_gpus_num):
...
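The hunk above is the distribution fix: in multi-node PaddleCloud runs, the launcher now honors the port range the cloud scheduler publishes via PADDLE_PORT and PADDLE_PORTS_NUM instead of its locally chosen default. A minimal standalone sketch of that decision logic, with names mirroring the diff (the environment values in the demo are made up for illustration):

import logging
import os

logger = logging.getLogger(__name__)

def resolve_started_port(default_port, use_paddlecloud, num_nodes,
                         selected_gpus_num):
    # Multi-node PaddleCloud jobs publish a reserved port range via
    # PADDLE_PORT / PADDLE_PORTS_NUM; prefer it over the local default.
    if use_paddlecloud and num_nodes > 1:
        cloud_paddle_port = os.getenv("PADDLE_PORT", "")
        cloud_paddle_port_num = os.getenv("PADDLE_PORTS_NUM", "")
        if cloud_paddle_port != "" and cloud_paddle_port_num != "":
            # Only switch if the cloud reserved at least one port per GPU.
            if int(cloud_paddle_port_num) >= selected_gpus_num:
                logger.warning("Use Cloud specified port:%s.",
                               cloud_paddle_port)
                return int(cloud_paddle_port)
    return default_port

# Hypothetical setup: 2 nodes, 2 GPUs each, cloud reserved 35019-35020.
os.environ["PADDLE_PORT"] = "35019"
os.environ["PADDLE_PORTS_NUM"] = "2"
print(resolve_started_port(6170, True, 2, 2))  # -> 35019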
...
@@ -7,13 +7,13 @@ file(GLOB DIST_TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_dist_*.py")
 string(REPLACE ".py" "" DIST_TEST_OPS "${DIST_TEST_OPS}")
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_mnist)
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_se_resnext)
+list(APPEND DIST_TEST_OPS test_listen_and_serv_op)
 set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS})
 #remove distribute unittests.
 list(APPEND MIXED_DIST_TEST_OPS test_dgc_op)
 list(APPEND MIXED_DIST_TEST_OPS test_dgc_momentum_op)
 list(APPEND MIXED_DIST_TEST_OPS test_dgc_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_simple_dist_transpiler)
-list(APPEND MIXED_DIST_TEST_OPS test_listen_and_serv_op)
 list(APPEND MIXED_DIST_TEST_OPS test_nce_remote_table_op)
 list(APPEND MIXED_DIST_TEST_OPS test_recv_save_op)
 list(APPEND MIXED_DIST_TEST_OPS test_transpiler_ops)
...
@@ -288,7 +288,6 @@ if(WITH_DISTRIBUTE)
         list(REMOVE_ITEM DIST_TEST_OPS "test_dist_se_resnext_dgc")
     endif()
     if(NOT APPLE)
-        bash_test_modules(test_listen_and_serv_op MODULES test_listen_and_serv.sh)
         if(WITH_GPU)
             # NOTE. test_launch only work in gpu collective mode
             bash_test_modules(test_launch MODULES test_launch.sh)
...
@@ -297,7 +296,6 @@
     set(dist_ut_port 1000)
     foreach(TEST_OP ${DIST_TEST_OPS})
-        message(STATUS "set dist_ut_port=${dist_ut_port} on ${TEST_OP}")
         bash_test_modules(${TEST_OP} MODULES dist_test.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}")
         MATH(EXPR dist_ut_port "${dist_ut_port}+50")
     endforeach(TEST_OP)
...
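The two CMake hunks above move test_listen_and_serv_op into DIST_TEST_OPS, so it goes through the same dist_test.sh loop as the other distributed tests and gets a dedicated PADDLE_DIST_UT_PORT; the per-test message(STATUS ...) log is dropped as cleanup. The loop starts at port 1000 and steps by 50 so that concurrently scheduled tests never share a port window. A rough Python rendering of that spacing scheme (the test names here are illustrative, not the real DIST_TEST_OPS contents):

def assign_dist_ut_ports(test_ops, base_port=1000, stride=50):
    # Mirror of the CMake foreach: each test gets its own 50-port window,
    # so endpoints spawned inside one test cannot collide with another's.
    ports = {}
    port = base_port
    for op in test_ops:
        ports[op] = port
        port += stride
    return ports

print(assign_dist_ut_ports(["test_dist_mnist", "test_listen_and_serv_op"]))
# -> {'test_dist_mnist': 1000, 'test_listen_and_serv_op': 1050}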
...
@@ -11,12 +11,14 @@ export POD_IP=127.0.0.1
 export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
 export PADDLE_TRAINER_ID=0
 
+export PADDLE_PORT=35019
+export PADDLE_PORTS_NUM=2
+
-distributed_args="--use_paddlecloud --cluster_node_ips=${cluster_node_ips} --node_ip=${node_ip}
---selected_gpus=0,1 --log_dir=testlog"
+distributed_args="--use_paddlecloud --cluster_node_ips=${cluster_node_ips} --node_ip=${node_ip} --selected_gpus=0,1 --log_dir=testlog"
 CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch ${distributed_args} multi_process.py
-str1="selected_gpus:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 trainers_num:4 current_endpoint:127.0.0.1:6170 trainer_id:0"
-str2="selected_gpus:1 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 trainers_num:4 current_endpoint:127.0.0.1:6171 trainer_id:1"
+str1="selected_gpus:0 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35019 trainer_id:0"
+str2="selected_gpus:1 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35020 trainer_id:1"
 file_0="multi_process.check_0.log"
 file_1="multi_process.check_1.log"
...
@@ -43,6 +45,9 @@ if [ -f $file_1 ]; then
     rm $file_1
 fi
 
+unset PADDLE_PORT
+unset PADDLE_PORTS_NUM
+
 echo ""
 echo "paddle.distributed.launch async poll process test"
 if ! CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch ${distributed_args} multi_process.py abort; then
...
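The updated expectations in str1/str2 follow directly from how the launcher enumerates worker endpoints: with PADDLE_PORT=35019 and two selected GPUs, each node gets the consecutive ports 35019 and 35020. The env vars are unset afterward so the later async-poll test falls back to the default ports. A small sketch reproducing the enumeration (the function name is ours, not an API of the launcher):

def build_worker_endpoints(node_ips, started_port, selected_gpus_num):
    # One endpoint per GPU per node; ports run consecutively from
    # started_port, matching the nested loops in launch.py.
    return [
        "{}:{}".format(ip, started_port + i)
        for ip in node_ips
        for i in range(selected_gpus_num)
    ]

eps = build_worker_endpoints(["127.0.0.1", "127.0.0.2"], 35019, 2)
print(",".join(eps))
# -> 127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020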