Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
524c81e5
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
524c81e5
编写于
6月 04, 2018
作者:
Y
Yancey
提交者:
GitHub
6月 04, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #11126 from Yancey1989/polish_test_listen_and_serv_op
speedup test_listen_and_serv_op
上级
f7a60017
7f5eb9f6
变更
2
隐藏空白更改
内联
并排
Showing
2 changed file
with
35 addition
and
36 deletion
+35
-36
python/paddle/fluid/tests/unittests/CMakeLists.txt
python/paddle/fluid/tests/unittests/CMakeLists.txt
+4
-2
python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
...n/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
+31
-34
未找到文件。
python/paddle/fluid/tests/unittests/CMakeLists.txt
浏览文件 @
524c81e5
...
@@ -48,5 +48,7 @@ foreach(TEST_OP ${TEST_OPS})
...
@@ -48,5 +48,7 @@ foreach(TEST_OP ${TEST_OPS})
endforeach
(
TEST_OP
)
endforeach
(
TEST_OP
)
py_test_modules
(
test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=
${
WARPCTC_LIB_DIR
}
SERIAL
)
py_test_modules
(
test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=
${
WARPCTC_LIB_DIR
}
SERIAL
)
py_test_modules
(
test_dist_train MODULES test_dist_train SERIAL
)
py_test_modules
(
test_dist_train MODULES test_dist_train SERIAL
)
# tests that need to be done in fixed timeout
# FIXME(Yancey1989): this test would cost much more time on CUDAPlace
set_tests_properties
(
test_listen_and_serv_op PROPERTIES TIMEOUT 20
)
# since load cudnn libraries, so we use a longer timeout to make this
# unit test stability.
set_tests_properties
(
test_listen_and_serv_op PROPERTIES TIMEOUT 30
)
python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
浏览文件 @
524c81e5
...
@@ -23,7 +23,7 @@ from multiprocessing import Process
...
@@ -23,7 +23,7 @@ from multiprocessing import Process
from
op_test
import
OpTest
from
op_test
import
OpTest
def
run_pserver
(
use_cuda
,
sync_mode
,
ip
,
port
,
trainer
_count
,
trainer_id
):
def
run_pserver
(
use_cuda
,
sync_mode
,
ip
,
port
,
trainer
s
,
trainer_id
):
x
=
fluid
.
layers
.
data
(
name
=
'x'
,
shape
=
[
1
],
dtype
=
'float32'
)
x
=
fluid
.
layers
.
data
(
name
=
'x'
,
shape
=
[
1
],
dtype
=
'float32'
)
y_predict
=
fluid
.
layers
.
fc
(
input
=
x
,
size
=
1
,
act
=
None
)
y_predict
=
fluid
.
layers
.
fc
(
input
=
x
,
size
=
1
,
act
=
None
)
y
=
fluid
.
layers
.
data
(
name
=
'y'
,
shape
=
[
1
],
dtype
=
'float32'
)
y
=
fluid
.
layers
.
data
(
name
=
'y'
,
shape
=
[
1
],
dtype
=
'float32'
)
...
@@ -39,15 +39,8 @@ def run_pserver(use_cuda, sync_mode, ip, port, trainer_count, trainer_id):
...
@@ -39,15 +39,8 @@ def run_pserver(use_cuda, sync_mode, ip, port, trainer_count, trainer_id):
place
=
fluid
.
CUDAPlace
(
0
)
if
use_cuda
else
fluid
.
CPUPlace
()
place
=
fluid
.
CUDAPlace
(
0
)
if
use_cuda
else
fluid
.
CPUPlace
()
exe
=
fluid
.
Executor
(
place
)
exe
=
fluid
.
Executor
(
place
)
port
=
os
.
getenv
(
"PADDLE_INIT_PORT"
,
port
)
pserver_endpoints
=
ip
+
":"
+
port
pserver_ips
=
os
.
getenv
(
"PADDLE_INIT_PSERVERS"
,
ip
)
# ip,ip...
current_endpoint
=
ip
+
":"
+
port
eplist
=
[]
for
ip
in
pserver_ips
.
split
(
","
):
eplist
.
append
(
':'
.
join
([
ip
,
port
]))
pserver_endpoints
=
","
.
join
(
eplist
)
# ip:port,ip:port...
trainers
=
int
(
os
.
getenv
(
"TRAINERS"
,
trainer_count
))
current_endpoint
=
os
.
getenv
(
"POD_IP"
,
ip
)
+
":"
+
port
trainer_id
=
int
(
os
.
getenv
(
"PADDLE_INIT_TRAINER_ID"
,
trainer_id
))
t
=
fluid
.
DistributeTranspiler
()
t
=
fluid
.
DistributeTranspiler
()
t
.
transpile
(
t
.
transpile
(
trainer_id
,
trainer_id
,
...
@@ -62,47 +55,51 @@ def run_pserver(use_cuda, sync_mode, ip, port, trainer_count, trainer_id):
...
@@ -62,47 +55,51 @@ def run_pserver(use_cuda, sync_mode, ip, port, trainer_count, trainer_id):
class
TestListenAndServOp
(
OpTest
):
class
TestListenAndServOp
(
OpTest
):
def
setUp
(
self
):
def
setUp
(
self
):
self
.
sleep_time
=
5
self
.
ps_timeout
=
5
self
.
ip
=
"127.0.0.1"
self
.
ip
=
"127.0.0.1"
self
.
port
=
"6173"
self
.
port
=
"6173"
self
.
trainer
_count
=
1
self
.
trainer
s
=
1
self
.
trainer_id
=
1
self
.
trainer_id
=
1
def
_raise_signal
(
self
,
parent_pid
,
raised_signal
):
time
.
sleep
(
self
.
sleep_time
)
ps_command
=
subprocess
.
Popen
(
"ps -o pid --ppid %d --noheaders"
%
parent_pid
,
shell
=
True
,
stdout
=
subprocess
.
PIPE
)
ps_output
=
ps_command
.
stdout
.
read
()
retcode
=
ps_command
.
wait
()
assert
retcode
==
0
,
"ps command returned %d"
%
retcode
for
pid_str
in
ps_output
.
split
(
"
\n
"
)[:
-
1
]:
try
:
os
.
kill
(
int
(
pid_str
),
raised_signal
)
except
Exception
:
continue
def
_start_pserver
(
self
,
use_cuda
,
sync_mode
):
def
_start_pserver
(
self
,
use_cuda
,
sync_mode
):
p
=
Process
(
p
=
Process
(
target
=
run_pserver
,
target
=
run_pserver
,
args
=
(
use_cuda
,
sync_mode
,
self
.
ip
,
self
.
port
,
self
.
trainer
_count
,
args
=
(
use_cuda
,
sync_mode
,
self
.
ip
,
self
.
port
,
self
.
trainer
s
,
self
.
trainer_id
))
self
.
trainer_id
))
p
.
start
()
p
.
start
()
return
p
.
pid
def
_wait_ps_ready
(
self
,
pid
):
retry_times
=
self
.
ps_timeout
while
True
:
assert
retry_times
>=
0
,
"wait ps ready failed"
time
.
sleep
(
0.5
)
try
:
# the listen_and_serv_op would touch a file which contains the listen port
# on the /tmp directory until it was ready to process all the RPC call.
os
.
stat
(
"/tmp/paddle.%d.port"
%
pid
)
return
except
os
.
error
:
retry_times
-=
1
def
test_rpc_interfaces
(
self
):
# TODO(Yancey1989): need to make sure the rpc interface correctly.
pass
def
test_handle_signal_in_serv_op
(
self
):
def
test_handle_signal_in_serv_op
(
self
):
# run pserver on CPU in sync mode
# run pserver on CPU in sync mode
self
.
_start_pserver
(
False
,
True
)
pid
=
self
.
_start_pserver
(
False
,
True
)
self
.
_wait_ps_ready
(
pid
)
# raise SIG
INT
to pserver
# raise SIG
TERM
to pserver
self
.
_raise_signal
(
os
.
getpid
(),
signal
.
SIGINT
)
os
.
kill
(
pid
,
signal
.
SIGTERM
)
# run pserver on CPU in async mode
# run pserver on CPU in async mode
self
.
_start_pserver
(
False
,
False
)
pid
=
self
.
_start_pserver
(
False
,
False
)
self
.
_wait_ps_ready
(
pid
)
# raise SIGTERM to pserver
# raise SIGTERM to pserver
self
.
_raise_signal
(
os
.
getpid
()
,
signal
.
SIGTERM
)
os
.
kill
(
pid
,
signal
.
SIGTERM
)
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录