Unverified commit 0d4ce6ac, authored by danleifeng, committed by GitHub

fix test_launch and test_fleet_launch bug; test=develop (#26015)

Parent 6e7f0bb2
@@ -17,7 +17,7 @@ import sys
 import time


-def train():
+def train(prefix):
     selected_gpus = os.getenv("FLAGS_selected_gpus")
     trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
     worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
@@ -29,11 +29,12 @@ def train():
         .format(selected_gpus, worker_endpoints, trainers_num, current_endpoint,trainer_id)
     print(name)
-    with open("multi_process.check_{}.log".format(trainer_id), "w") as f:
+    with open("multi_process_{}.check_{}.log".format(prefix, trainer_id),
+              "w") as f:
         f.write(name)


-def train_abort():
+def train_abort(prefix):
     selected_gpus = os.getenv("FLAGS_selected_gpus")
     trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
     worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
@@ -49,7 +50,8 @@ def train_abort():
         name = "abort>>> selected_gpus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"\
             .format(selected_gpus, worker_endpoints, trainers_num, current_endpoint,trainer_id)
         print(name)
-        with open("multi_process.check_{}.log".format(trainer_id),
-                  "w") as f:
+        with open(
+                "multi_process_{}.check_{}.log".format(prefix, trainer_id),
+                "w") as f:
             f.write(name)
         raise
@@ -60,12 +62,15 @@ def train_abort():
             .format(selected_gpus, worker_endpoints, trainers_num, current_endpoint,trainer_id)
         print(name)
-        with open("multi_process.check_{}.log".format(trainer_id), "w") as f:
+        with open("multi_process_{}.check_{}.log".format(prefix, trainer_id),
+                  "w") as f:
             f.write(name)


 if __name__ == '__main__':
-    if len(sys.argv) == 2 and sys.argv[1] == "abort":
-        train_abort()
+    if len(sys.argv) == 3 and sys.argv[2] == "abort":
+        prefix = sys.argv[1]
+        train_abort(prefix)
     else:
-        train()
+        prefix = sys.argv[1]
+        train(prefix)
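With prefix threaded through train() and train_abort(), each test script can namespace the check logs it produces. A minimal sketch of the resulting filename scheme; the prefix and trainer id below are illustrative values, not taken from a real run:

    # Sketch of the new log-naming scheme; "fleetrun" and trainer_id 0 are
    # example values only.
    prefix = "fleetrun"
    trainer_id = 0
    log_name = "multi_process_{}.check_{}.log".format(prefix, trainer_id)
    print(log_name)  # -> multi_process_fleetrun.check_0.log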
@@ -4,7 +4,6 @@ set -e

 function test_launch_ps(){
     fleetrun --server_num=2 --worker_num=2 fleet_ps_training.py 2> ut.elog
-
     if grep -q "server are killed" ut.elog; then
         echo "test pserver launch succeed"
     else
@@ -20,7 +19,7 @@ fi

 test_launch_ps
 # use default values
-fleetrun multi_process.py
+fleetrun multi_process.py fleetrun

 # use paddlecloud
 echo "begin test use paddlecloud"
@@ -30,16 +29,16 @@ export POD_IP=127.0.0.1
 export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
 export PADDLE_TRAINER_ID=0
-export PADDLE_PORT=35019
+export PADDLE_PORT=35789
 export TRAINER_PORTS_NUM=2

 distributed_args="--ips=${cluster_node_ips} --gpus=0,1 --log_dir=testlog"
-CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py
+CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py fleetrun
-str1="selected_gpus:0 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35019 trainer_id:0"
-str2="selected_gpus:1 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35020 trainer_id:1"
-file_0="multi_process.check_0.log"
-file_1="multi_process.check_1.log"
+str1="selected_gpus:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0"
+str2="selected_gpus:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1"
+file_0="multi_process_fleetrun.check_0.log"
+file_1="multi_process_fleetrun.check_1.log"

 echo "paddlecloud params test"
 if grep -q "$str1" "$file_0"; then
@@ -70,7 +69,7 @@ unset TRAINER_PORTS_NUM
 echo ""
 echo "paddle.distributed.launch async poll process test"
-if ! CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py abort; then
+if ! CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py fleetrun abort; then
     echo "train abort as planned"
 fi
......
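This script now passes fleetrun as the prefix and switches PADDLE_PORT from 35019 to 35789, so its endpoints and check logs no longer overlap with those of the test_launch.sh script below. The shell test asserts the expected line with grep; a hypothetical Python equivalent of that check, where the function name and the example path/substring are illustrative only:

    # Hypothetical Python version of the shell-side grep check; the file name
    # and expected substring mirror the variables defined in the script above.
    def log_contains(path, expected):
        with open(path) as f:
            return expected in f.read()

    # e.g. log_contains("multi_process_fleetrun.check_0.log", "trainer_id:0")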
@@ -3,7 +3,7 @@ set -e
 # use default values
 # FIXME: random fails on Unknown command lines -c (or -m).
 launch_py=${PADDLE_BINARY_DIR}/python/paddle/distributed/launch.py
-python ${launch_py} multi_process.py
+python ${launch_py} multi_process.py launch

 # use paddlecloud
 echo "begin test use paddlecloud"
@@ -18,12 +18,12 @@ export PADDLE_PORT=35019
 export TRAINER_PORTS_NUM=2

 distributed_args="--use_paddlecloud --cluster_node_ips=${cluster_node_ips} --node_ip=${node_ip} --selected_gpus=0,1 --log_dir=testlog"
-CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py
+CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py launch
 str1="selected_gpus:0 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35019 trainer_id:0"
 str2="selected_gpus:1 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35020 trainer_id:1"
-file_0="multi_process.check_0.log"
-file_1="multi_process.check_1.log"
+file_0="multi_process_launch.check_0.log"
+file_1="multi_process_launch.check_1.log"

 echo "paddlecloud params test"
 if grep -q "$str1" "$file_0"; then
@@ -54,7 +54,7 @@ unset TRAINER_PORTS_NUM
 echo ""
 echo "paddle.distributed.launch async poll process test"
-if ! CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py abort; then
+if ! CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py launch abort; then
     echo "train abort as planned"
 fi
......
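This script passes launch as the prefix, so the two test scripts now write disjoint sets of check logs instead of clobbering each other's multi_process.check_*.log files. An illustrative check of that disjointness, with filenames constructed from the two prefixes above rather than from a real run:

    # The "fleetrun" and "launch" prefixes yield non-overlapping log names.
    fleet_logs = {"multi_process_fleetrun.check_{}.log".format(i) for i in (0, 1)}
    launch_logs = {"multi_process_launch.check_{}.log".format(i) for i in (0, 1)}
    assert fleet_logs.isdisjoint(launch_logs)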