From af0ad915ea15a98514232bd8c9176f22f977282a Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Mon, 30 Mar 2020 07:13:36 -0500
Subject: [PATCH] fix testlaunch test=develop (#23304)

---
 python/paddle/fluid/tests/unittests/CMakeLists.txt |  2 +-
 python/paddle/fluid/tests/unittests/test_launch.sh | 10 ++++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index d2b02c2768b..c4596471595 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -300,7 +300,7 @@ if(WITH_DISTRIBUTE)
     if(NOT APPLE)
         if(WITH_GPU)
             # NOTE. test_launch only work in gpu collective mode
-            bash_test_modules(test_launch MODULES test_launch.sh)
+            bash_test_modules(test_launch MODULES test_launch.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
         endif()
         bash_test_modules(test_launch_ps MODULES test_launch_ps.sh)
 
diff --git a/python/paddle/fluid/tests/unittests/test_launch.sh b/python/paddle/fluid/tests/unittests/test_launch.sh
index 7918c56ca93..5c6b0b50ad8 100644
--- a/python/paddle/fluid/tests/unittests/test_launch.sh
+++ b/python/paddle/fluid/tests/unittests/test_launch.sh
@@ -1,7 +1,9 @@
 #!/bin/bash
 set -e
 # use default values
-python -m paddle.distributed.launch multi_process.py
+# FIXME: random fails on Unknown command lines -c (or -m).
+launch_py=${PADDLE_BINARY_DIR}/python/paddle/distributed/launch.py
+python ${launch_py} multi_process.py
 
 # use paddlecloud
 cluster_node_ips="10.0.0.1"
@@ -15,7 +17,7 @@ export PADDLE_PORT=35019
 export PADDLE_PORTS_NUM=2
 
 distributed_args="--use_paddlecloud --cluster_node_ips=${cluster_node_ips} --node_ip=${node_ip} --selected_gpus=0,1 --log_dir=testlog"
-CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch ${distributed_args} multi_process.py
+CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py
 
 str1="selected_gpus:0 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35019 trainer_id:0"
 str2="selected_gpus:1 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35020 trainer_id:1"
@@ -50,7 +52,7 @@ unset PADDLE_PORTS_NUM
 
 echo ""
 echo "paddle.distributed.launch async poll process test"
-if ! CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch ${distributed_args} multi_process.py abort; then
+if ! CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py abort; then
     echo "train abort as planned"
 fi
 
@@ -77,5 +79,5 @@ rm -rf $file_0_0 $file_0_1
 
 distributed_args="--selected_gpus=0,1 --log_dir=testlog"
 export PADDLE_LAUNCH_LOG="test_launch_filelock_0"
-CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch ${distributed_args} find_ports.py
+CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} find_ports.py
 str_0="worker_endpoints:127.0.0.1:6070,127.0.0.1:6071"
--
GitLab
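
Editor's note (not part of the patch): a minimal sketch of what the change arranges. The CMake hunk passes PADDLE_BINARY_DIR into the test's environment through the ENVS argument of bash_test_modules, and test_launch.sh then invokes launch.py from the build tree directly instead of via `python -m paddle.distributed.launch`, which the FIXME above reports as flaky ("Unknown command lines -c (or -m)"). To reproduce the patched test flow by hand, one could do roughly the following; the build path is a hypothetical example, not from the patch:

    #!/bin/bash
    # Assumed local build directory; adjust to your own checkout.
    export PADDLE_BINARY_DIR=$HOME/Paddle/build
    cd python/paddle/fluid/tests/unittests
    # Runs the launcher script from the build tree, as the patched test does.
    bash test_launch.sh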