Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
5ea039b3
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 1 年 前同步成功
通知
2298
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
未验证
提交
5ea039b3
编写于
6月 15, 2018
作者:
T
Tao Luo
提交者:
GitHub
6月 15, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #11470 from typhoonzero/fix_unitests
Fix dist ut
上级
916e863f
40c631e5
变更
4
显示空白变更内容
内联
并排
Showing
4 changed file
with
59 addition
and
43 deletion
+59
-43
paddle/fluid/operators/listen_and_serv_op.cc
paddle/fluid/operators/listen_and_serv_op.cc
+2
-1
python/paddle/fluid/layers/io.py
python/paddle/fluid/layers/io.py
+22
-27
python/paddle/fluid/tests/unittests/test_dist_train.py
python/paddle/fluid/tests/unittests/test_dist_train.py
+23
-6
python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
...n/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
+12
-9
未找到文件。
paddle/fluid/operators/listen_and_serv_op.cc
浏览文件 @
5ea039b3
...
...
@@ -348,7 +348,8 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
};
void
SignalHandler
::
StopAndExit
(
int
signal_num
)
{
VLOG
(
3
)
<<
"Catch interrupt signal: "
<<
signal_num
<<
", program will exit"
;
// Do not use VLOG here for the device for printing maybe already released.
// exit will release interal allocated resoureces.
exit
(
0
);
}
...
...
python/paddle/fluid/layers/io.py
浏览文件 @
5ea039b3
...
...
@@ -22,9 +22,9 @@ from ..executor import global_scope
from
layer_function_generator
import
generate_layer_fn
,
templatedoc
__all__
=
[
'data'
,
'BlockGuardServ'
,
'ListenAndServ'
,
'Send'
,
'
open_recordio_file
'
,
'open_
files'
,
'read_file'
,
'shuffle'
,
'batch'
,
'double_buffer
'
,
'random_data_generator'
,
'Preprocessor'
,
'load'
'data'
,
'BlockGuardServ'
,
'ListenAndServ'
,
'Send'
,
'
Recv
'
,
'open_
recordio_file'
,
'open_files'
,
'read_file'
,
'shuffle'
,
'batch
'
,
'
double_buffer'
,
'
random_data_generator'
,
'Preprocessor'
,
'load'
]
...
...
@@ -177,18 +177,17 @@ class ListenAndServ(object):
})
def
Send
(
endpoints
,
send_vars
,
get_vars
=
Non
e
):
def
Send
(
endpoints
,
send_vars
,
sync
=
Tru
e
):
"""
Send layer
Send variables to the server side, and get vars from server
side when server have finished running server side program.
Args:
endpoints: comma seperated IP:PORT pairs in the order
endpoints
(str)
: comma seperated IP:PORT pairs in the order
of send_vars to send
send_vars
: vars to send
get_vars: vars to get from server after send completes.
send_vars
(list): variables to send to server
sync (bool): whether to wait the request finish
Send variables to the server side, and get vars from server
side when server have finished running server side program.
"""
assert
(
type
(
send_vars
)
==
list
)
...
...
@@ -196,40 +195,33 @@ def Send(endpoints, send_vars, get_vars=None):
endpoints
=
list
(
set
(
epmap
))
helper
=
LayerHelper
(
"Send"
,
**
locals
())
if
not
get_vars
:
get_vars
=
[]
for
s
in
send_vars
:
v
=
helper
.
create_tmp_variable
(
dtype
=
s
.
dtype
,
stop_gradient
=
True
)
get_vars
.
append
(
v
)
rpc_op_role_name
=
core
.
op_proto_and_checker_maker
.
kOpRoleAttrName
()
helper
.
append_op
(
type
=
"send"
,
inputs
=
{
"X"
:
send_vars
},
outputs
=
{
"Out"
:
get_vars
},
attrs
=
{
"endpoints"
:
endpoints
,
"epmap"
:
epmap
,
rpc_op_role_name
:
core
.
op_proto_and_checker_maker
.
OpRole
.
RPC
})
return
get_vars
if
sync
:
helper
.
append_op
(
type
=
"send_barrier"
,
attrs
=
{
"endpoints"
:
endpoints
})
def
Recv
(
endpoints
,
get_vars
):
def
Recv
(
endpoints
,
get_vars
,
sync
=
True
):
"""
Rec
v layer
Rec
eive variables from server side
Args:
endpoints: comma seperated IP:PORT pairs in the order
endpoints
(str)
: comma seperated IP:PORT pairs in the order
of send_vars to send
send_vars: vars to send
get_vars: vars to get from server after send completes.
get_vars (list): vars to get from server after send completes.
sync (bool): whether to wait the request finish
Send variables to the server side, and get vars from server
side when server have finished running server side program.
Returns:
list: list of received variables
"""
assert
(
type
(
send_vars
)
==
list
)
assert
(
type
(
get_vars
)
==
list
)
epmap
=
endpoints
.
split
(
","
)
...
...
@@ -242,6 +234,9 @@ def Recv(endpoints, get_vars):
outputs
=
{
"Out"
:
get_vars
},
attrs
=
{
"endpoints"
:
endpoints
,
"epmap"
:
epmap
})
if
sync
:
helper
.
append_op
(
type
=
"fetch_barrier"
,
attrs
=
{
"endpoints"
:
endpoints
})
return
get_vars
def
monkey_patch_reader_methods
(
reader
):
...
...
python/paddle/fluid/tests/unittests/test_dist_train.py
浏览文件 @
5ea039b3
...
...
@@ -16,6 +16,7 @@ import os
import
time
import
unittest
from
multiprocessing
import
Process
import
signal
import
numpy
...
...
@@ -24,9 +25,6 @@ import paddle.fluid.layers as layers
class
TestSendOp
(
unittest
.
TestCase
):
@
unittest
.
skip
(
"This test is buggy. We cannot use time.sleep to sync processes, the connection may fail in unittest."
)
def
test_send
(
self
):
# Run init_serv in a thread
place
=
fluid
.
CPUPlace
()
...
...
@@ -35,7 +33,9 @@ class TestSendOp(unittest.TestCase):
p
.
daemon
=
True
p
.
start
()
time
.
sleep
(
10
)
self
.
ps_timeout
=
5
self
.
_wait_ps_ready
(
p
.
pid
)
with
open
(
"/tmp/paddle.%d.port"
%
p
.
pid
,
"r"
)
as
fn
:
selected_port
=
int
(
fn
.
readlines
()[
0
])
self
.
init_client
(
place
,
selected_port
)
...
...
@@ -44,9 +44,23 @@ class TestSendOp(unittest.TestCase):
self
.
assertTrue
(
numpy
.
allclose
(
self
.
local_out
,
self
.
dist_out
))
# FIXME(typhoonzero): find a way to gracefully shutdown the server.
os
.
system
(
"kill -9 %d"
%
p
.
pid
)
os
.
kill
(
p
.
pid
,
signal
.
SIGKILL
)
p
.
join
()
def
_wait_ps_ready
(
self
,
pid
):
start_left_time
=
self
.
ps_timeout
sleep_time
=
0.5
while
True
:
assert
start_left_time
>=
0
,
"wait ps ready failed"
time
.
sleep
(
sleep_time
)
try
:
# the listen_and_serv_op would touch a file which contains the listen port
# on the /tmp directory until it was ready to process all the RPC call.
os
.
stat
(
"/tmp/paddle.%d.port"
%
pid
)
return
except
os
.
error
:
start_left_time
-=
sleep_time
def
init_serv
(
self
,
place
):
main
=
fluid
.
Program
()
...
...
@@ -84,7 +98,10 @@ class TestSendOp(unittest.TestCase):
dtype
=
"float32"
,
persistable
=
False
,
shape
=
[
32
,
32
])
o
=
layers
.
Send
(
"127.0.0.1:%d"
%
port
,
[
x
],
[
get_var
])
fluid
.
initializer
.
Constant
(
value
=
2.3
)(
get_var
,
main
.
global_block
())
layers
.
Send
(
"127.0.0.1:%d"
%
port
,
[
x
])
o
=
layers
.
Recv
(
"127.0.0.1:%d"
%
port
,
[
get_var
])
exe
=
fluid
.
Executor
(
place
)
self
.
dist_out
=
exe
.
run
(
main
,
fetch_list
=
o
)
# o is a list
...
...
python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
浏览文件 @
5ea039b3
...
...
@@ -57,17 +57,18 @@ class TestListenAndServOp(OpTest):
def
setUp
(
self
):
self
.
ps_timeout
=
5
self
.
ip
=
"127.0.0.1"
self
.
port
=
"
6173
"
self
.
port
=
"
0
"
self
.
trainers
=
1
self
.
trainer_id
=
1
self
.
trainer_id
=
0
def
_start_pserver
(
self
,
use_cuda
,
sync_mode
):
p
=
Process
(
target
=
run_pserver
,
args
=
(
use_cuda
,
sync_mode
,
self
.
ip
,
self
.
port
,
self
.
trainers
,
self
.
trainer_id
))
p
.
daemon
=
True
p
.
start
()
return
p
.
pid
return
p
def
_wait_ps_ready
(
self
,
pid
):
start_left_time
=
self
.
ps_timeout
...
...
@@ -89,18 +90,20 @@ class TestListenAndServOp(OpTest):
def
test_handle_signal_in_serv_op
(
self
):
# run pserver on CPU in sync mode
p
id
=
self
.
_start_pserver
(
False
,
True
)
self
.
_wait_ps_ready
(
pid
)
p
1
=
self
.
_start_pserver
(
False
,
True
)
self
.
_wait_ps_ready
(
p
1
.
p
id
)
# raise SIGTERM to pserver
os
.
kill
(
pid
,
signal
.
SIGTERM
)
os
.
kill
(
p1
.
pid
,
signal
.
SIGKILL
)
p1
.
join
()
# run pserver on CPU in async mode
p
id
=
self
.
_start_pserver
(
False
,
False
)
self
.
_wait_ps_ready
(
pid
)
p
2
=
self
.
_start_pserver
(
False
,
False
)
self
.
_wait_ps_ready
(
p
2
.
p
id
)
# raise SIGTERM to pserver
os
.
kill
(
pid
,
signal
.
SIGTERM
)
os
.
kill
(
p2
.
pid
,
signal
.
SIGKILL
)
p2
.
join
()
if
__name__
==
'__main__'
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录