Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
0d4ce6ac
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
0d4ce6ac
编写于
8月 07, 2020
作者:
D
danleifeng
提交者:
GitHub
8月 07, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix test_launch and test_fleet_launch bug; test=develop (#26015)
上级
6e7f0bb2
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
27 addition
and
23 deletion
+27
-23
python/paddle/fluid/tests/unittests/multi_process.py
python/paddle/fluid/tests/unittests/multi_process.py
+14
-9
python/paddle/fluid/tests/unittests/test_fleet_launch.sh
python/paddle/fluid/tests/unittests/test_fleet_launch.sh
+8
-9
python/paddle/fluid/tests/unittests/test_launch.sh
python/paddle/fluid/tests/unittests/test_launch.sh
+5
-5
未找到文件。
python/paddle/fluid/tests/unittests/multi_process.py
浏览文件 @
0d4ce6ac
...
@@ -17,7 +17,7 @@ import sys
...
@@ -17,7 +17,7 @@ import sys
import
time
import
time
def
train
():
def
train
(
prefix
):
selected_gpus
=
os
.
getenv
(
"FLAGS_selected_gpus"
)
selected_gpus
=
os
.
getenv
(
"FLAGS_selected_gpus"
)
trainer_id
=
int
(
os
.
getenv
(
"PADDLE_TRAINER_ID"
))
trainer_id
=
int
(
os
.
getenv
(
"PADDLE_TRAINER_ID"
))
worker_endpoints_env
=
os
.
getenv
(
"PADDLE_TRAINER_ENDPOINTS"
)
worker_endpoints_env
=
os
.
getenv
(
"PADDLE_TRAINER_ENDPOINTS"
)
...
@@ -29,11 +29,12 @@ def train():
...
@@ -29,11 +29,12 @@ def train():
.
format
(
selected_gpus
,
worker_endpoints
,
trainers_num
,
current_endpoint
,
trainer_id
)
.
format
(
selected_gpus
,
worker_endpoints
,
trainers_num
,
current_endpoint
,
trainer_id
)
print
(
name
)
print
(
name
)
with
open
(
"multi_process.check_{}.log"
.
format
(
trainer_id
),
"w"
)
as
f
:
with
open
(
"multi_process_{}.check_{}.log"
.
format
(
prefix
,
trainer_id
),
"w"
)
as
f
:
f
.
write
(
name
)
f
.
write
(
name
)
def
train_abort
():
def
train_abort
(
prefix
):
selected_gpus
=
os
.
getenv
(
"FLAGS_selected_gpus"
)
selected_gpus
=
os
.
getenv
(
"FLAGS_selected_gpus"
)
trainer_id
=
int
(
os
.
getenv
(
"PADDLE_TRAINER_ID"
))
trainer_id
=
int
(
os
.
getenv
(
"PADDLE_TRAINER_ID"
))
worker_endpoints_env
=
os
.
getenv
(
"PADDLE_TRAINER_ENDPOINTS"
)
worker_endpoints_env
=
os
.
getenv
(
"PADDLE_TRAINER_ENDPOINTS"
)
...
@@ -49,8 +50,9 @@ def train_abort():
...
@@ -49,8 +50,9 @@ def train_abort():
name
=
"abort>>> selected_gpus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"
\
name
=
"abort>>> selected_gpus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"
\
.
format
(
selected_gpus
,
worker_endpoints
,
trainers_num
,
current_endpoint
,
trainer_id
)
.
format
(
selected_gpus
,
worker_endpoints
,
trainers_num
,
current_endpoint
,
trainer_id
)
print
(
name
)
print
(
name
)
with
open
(
"multi_process.check_{}.log"
.
format
(
trainer_id
),
with
open
(
"w"
)
as
f
:
"multi_process_{}.check_{}.log"
.
format
(
prefix
,
trainer_id
),
"w"
)
as
f
:
f
.
write
(
name
)
f
.
write
(
name
)
raise
raise
else
:
else
:
...
@@ -60,12 +62,15 @@ def train_abort():
...
@@ -60,12 +62,15 @@ def train_abort():
.
format
(
selected_gpus
,
worker_endpoints
,
trainers_num
,
current_endpoint
,
trainer_id
)
.
format
(
selected_gpus
,
worker_endpoints
,
trainers_num
,
current_endpoint
,
trainer_id
)
print
(
name
)
print
(
name
)
with
open
(
"multi_process.check_{}.log"
.
format
(
trainer_id
),
"w"
)
as
f
:
with
open
(
"multi_process_{}.check_{}.log"
.
format
(
prefix
,
trainer_id
),
"w"
)
as
f
:
f
.
write
(
name
)
f
.
write
(
name
)
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
if
len
(
sys
.
argv
)
==
2
and
sys
.
argv
[
1
]
==
"abort"
:
if
len
(
sys
.
argv
)
==
3
and
sys
.
argv
[
2
]
==
"abort"
:
train_abort
()
prefix
=
sys
.
argv
[
1
]
train_abort
(
prefix
)
else
:
else
:
train
()
prefix
=
sys
.
argv
[
1
]
train
(
prefix
)
python/paddle/fluid/tests/unittests/test_fleet_launch.sh
浏览文件 @
0d4ce6ac
...
@@ -4,7 +4,6 @@ set -e
...
@@ -4,7 +4,6 @@ set -e
function
test_launch_ps
(){
function
test_launch_ps
(){
fleetrun
--server_num
=
2
--worker_num
=
2 fleet_ps_training.py 2> ut.elog
fleetrun
--server_num
=
2
--worker_num
=
2 fleet_ps_training.py 2> ut.elog
if
grep
-q
"server are killed"
ut.elog
;
then
if
grep
-q
"server are killed"
ut.elog
;
then
echo
"test pserver launch succeed"
echo
"test pserver launch succeed"
else
else
...
@@ -20,7 +19,7 @@ fi
...
@@ -20,7 +19,7 @@ fi
test_launch_ps
test_launch_ps
# use default values
# use default values
fleetrun multi_process.py
fleetrun multi_process.py
fleetrun
# use paddlecloud
# use paddlecloud
echo
"begin test use paddlecloud"
echo
"begin test use paddlecloud"
...
@@ -30,16 +29,16 @@ export POD_IP=127.0.0.1
...
@@ -30,16 +29,16 @@ export POD_IP=127.0.0.1
export
PADDLE_TRAINERS
=
127.0.0.1,127.0.0.2
export
PADDLE_TRAINERS
=
127.0.0.1,127.0.0.2
export
PADDLE_TRAINER_ID
=
0
export
PADDLE_TRAINER_ID
=
0
export
PADDLE_PORT
=
35
01
9
export
PADDLE_PORT
=
35
78
9
export
TRAINER_PORTS_NUM
=
2
export
TRAINER_PORTS_NUM
=
2
distributed_args
=
"--ips=
${
cluster_node_ips
}
--gpus=0,1 --log_dir=testlog"
distributed_args
=
"--ips=
${
cluster_node_ips
}
--gpus=0,1 --log_dir=testlog"
CUDA_VISIBLE_DEVICES
=
0,1 fleetrun
${
distributed_args
}
multi_process.py
CUDA_VISIBLE_DEVICES
=
0,1 fleetrun
${
distributed_args
}
multi_process.py
fleetrun
str1
=
"selected_gpus:0 worker_endpoints:127.0.0.1:35
019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:3501
9 trainer_id:0"
str1
=
"selected_gpus:0 worker_endpoints:127.0.0.1:35
789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:3578
9 trainer_id:0"
str2
=
"selected_gpus:1 worker_endpoints:127.0.0.1:35
019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:3502
0 trainer_id:1"
str2
=
"selected_gpus:1 worker_endpoints:127.0.0.1:35
789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:3579
0 trainer_id:1"
file_0
=
"multi_process.check_0.log"
file_0
=
"multi_process
_fleetrun
.check_0.log"
file_1
=
"multi_process.check_1.log"
file_1
=
"multi_process
_fleetrun
.check_1.log"
echo
"paddlecloud params test"
echo
"paddlecloud params test"
if
grep
-q
"
$str1
"
"
$file_0
"
;
then
if
grep
-q
"
$str1
"
"
$file_0
"
;
then
...
@@ -70,7 +69,7 @@ unset TRAINER_PORTS_NUM
...
@@ -70,7 +69,7 @@ unset TRAINER_PORTS_NUM
echo
""
echo
""
echo
"paddle.distributed.launch async poll process test"
echo
"paddle.distributed.launch async poll process test"
if
!
CUDA_VISIBLE_DEVICES
=
0,1 fleetrun
${
distributed_args
}
multi_process.py abort
;
then
if
!
CUDA_VISIBLE_DEVICES
=
0,1 fleetrun
${
distributed_args
}
multi_process.py
fleetrun
abort
;
then
echo
"train abort as planned"
echo
"train abort as planned"
fi
fi
...
...
python/paddle/fluid/tests/unittests/test_launch.sh
浏览文件 @
0d4ce6ac
...
@@ -3,7 +3,7 @@ set -e
...
@@ -3,7 +3,7 @@ set -e
# use default values
# use default values
# FIXME: random fails on Unknown command lines -c (or -m).
# FIXME: random fails on Unknown command lines -c (or -m).
launch_py
=
${
PADDLE_BINARY_DIR
}
/python/paddle/distributed/launch.py
launch_py
=
${
PADDLE_BINARY_DIR
}
/python/paddle/distributed/launch.py
python
${
launch_py
}
multi_process.py
python
${
launch_py
}
multi_process.py
launch
# use paddlecloud
# use paddlecloud
echo
"begin test use paddlecloud"
echo
"begin test use paddlecloud"
...
@@ -18,12 +18,12 @@ export PADDLE_PORT=35019
...
@@ -18,12 +18,12 @@ export PADDLE_PORT=35019
export
TRAINER_PORTS_NUM
=
2
export
TRAINER_PORTS_NUM
=
2
distributed_args
=
"--use_paddlecloud --cluster_node_ips=
${
cluster_node_ips
}
--node_ip=
${
node_ip
}
--selected_gpus=0,1 --log_dir=testlog"
distributed_args
=
"--use_paddlecloud --cluster_node_ips=
${
cluster_node_ips
}
--node_ip=
${
node_ip
}
--selected_gpus=0,1 --log_dir=testlog"
CUDA_VISIBLE_DEVICES
=
0,1 python
${
launch_py
}
${
distributed_args
}
multi_process.py
CUDA_VISIBLE_DEVICES
=
0,1 python
${
launch_py
}
${
distributed_args
}
multi_process.py
launch
str1
=
"selected_gpus:0 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35019 trainer_id:0"
str1
=
"selected_gpus:0 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35019 trainer_id:0"
str2
=
"selected_gpus:1 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35020 trainer_id:1"
str2
=
"selected_gpus:1 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35020 trainer_id:1"
file_0
=
"multi_process.check_0.log"
file_0
=
"multi_process
_launch
.check_0.log"
file_1
=
"multi_process.check_1.log"
file_1
=
"multi_process
_launch
.check_1.log"
echo
"paddlecloud params test"
echo
"paddlecloud params test"
if
grep
-q
"
$str1
"
"
$file_0
"
;
then
if
grep
-q
"
$str1
"
"
$file_0
"
;
then
...
@@ -54,7 +54,7 @@ unset TRAINER_PORTS_NUM
...
@@ -54,7 +54,7 @@ unset TRAINER_PORTS_NUM
echo
""
echo
""
echo
"paddle.distributed.launch async poll process test"
echo
"paddle.distributed.launch async poll process test"
if
!
CUDA_VISIBLE_DEVICES
=
0,1 python
${
launch_py
}
${
distributed_args
}
multi_process.py abort
;
then
if
!
CUDA_VISIBLE_DEVICES
=
0,1 python
${
launch_py
}
${
distributed_args
}
multi_process.py
launch
abort
;
then
echo
"train abort as planned"
echo
"train abort as planned"
fi
fi
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录