Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
5ea039b3
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 1 年 前同步成功
通知
2298
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
未验证
提交
5ea039b3
编写于
6月 15, 2018
作者:
T
Tao Luo
提交者:
GitHub
6月 15, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #11470 from typhoonzero/fix_unitests
Fix dist ut
上级
916e863f
40c631e5
变更
4
显示空白变更内容
内联
并排
Showing
4 changed file
with
59 addition
and
43 deletion
+59
-43
paddle/fluid/operators/listen_and_serv_op.cc
paddle/fluid/operators/listen_and_serv_op.cc
+2
-1
python/paddle/fluid/layers/io.py
python/paddle/fluid/layers/io.py
+22
-27
python/paddle/fluid/tests/unittests/test_dist_train.py
python/paddle/fluid/tests/unittests/test_dist_train.py
+23
-6
python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
...n/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
+12
-9
未找到文件。
paddle/fluid/operators/listen_and_serv_op.cc
浏览文件 @
5ea039b3
...
@@ -348,7 +348,8 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
...
@@ -348,7 +348,8 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
};
};
void
SignalHandler
::
StopAndExit
(
int
signal_num
)
{
void
SignalHandler
::
StopAndExit
(
int
signal_num
)
{
VLOG
(
3
)
<<
"Catch interrupt signal: "
<<
signal_num
<<
", program will exit"
;
// Do not use VLOG here, because the device used for printing may already have been released.
// exit will release internally allocated resources.
exit
(
0
);
exit
(
0
);
}
}
...
...
python/paddle/fluid/layers/io.py
浏览文件 @
5ea039b3
...
@@ -22,9 +22,9 @@ from ..executor import global_scope
...
@@ -22,9 +22,9 @@ from ..executor import global_scope
from
layer_function_generator
import
generate_layer_fn
,
templatedoc
from
layer_function_generator
import
generate_layer_fn
,
templatedoc
__all__
=
[
__all__
=
[
'data'
,
'BlockGuardServ'
,
'ListenAndServ'
,
'Send'
,
'
open_recordio_file
'
,
'data'
,
'BlockGuardServ'
,
'ListenAndServ'
,
'Send'
,
'
Recv
'
,
'open_
files'
,
'read_file'
,
'shuffle'
,
'batch'
,
'double_buffer
'
,
'open_
recordio_file'
,
'open_files'
,
'read_file'
,
'shuffle'
,
'batch
'
,
'random_data_generator'
,
'Preprocessor'
,
'load'
'
double_buffer'
,
'
random_data_generator'
,
'Preprocessor'
,
'load'
]
]
...
@@ -177,18 +177,17 @@ class ListenAndServ(object):
...
@@ -177,18 +177,17 @@ class ListenAndServ(object):
})
})
def
Send
(
endpoints
,
send_vars
,
get_vars
=
Non
e
):
def
Send
(
endpoints
,
send_vars
,
sync
=
Tru
e
):
"""
"""
Send layer
Send variables to the server side, and get vars from server
side when server have finished running server side program.
Args:
Args:
endpoints: comma separated IP:PORT pairs in the order
endpoints
(str)
: comma separated IP:PORT pairs in the order
of send_vars to send
of send_vars to send
send_vars
: vars to send
send_vars
(list): variables to send to server
get_vars: vars to get from server after send completes.
sync (bool): whether to wait for the request to finish
Send variables to the server side, and get vars from server
side when server have finished running server side program.
"""
"""
assert
(
type
(
send_vars
)
==
list
)
assert
(
type
(
send_vars
)
==
list
)
...
@@ -196,40 +195,33 @@ def Send(endpoints, send_vars, get_vars=None):
...
@@ -196,40 +195,33 @@ def Send(endpoints, send_vars, get_vars=None):
endpoints
=
list
(
set
(
epmap
))
endpoints
=
list
(
set
(
epmap
))
helper
=
LayerHelper
(
"Send"
,
**
locals
())
helper
=
LayerHelper
(
"Send"
,
**
locals
())
if
not
get_vars
:
get_vars
=
[]
for
s
in
send_vars
:
v
=
helper
.
create_tmp_variable
(
dtype
=
s
.
dtype
,
stop_gradient
=
True
)
get_vars
.
append
(
v
)
rpc_op_role_name
=
core
.
op_proto_and_checker_maker
.
kOpRoleAttrName
()
rpc_op_role_name
=
core
.
op_proto_and_checker_maker
.
kOpRoleAttrName
()
helper
.
append_op
(
helper
.
append_op
(
type
=
"send"
,
type
=
"send"
,
inputs
=
{
"X"
:
send_vars
},
inputs
=
{
"X"
:
send_vars
},
outputs
=
{
"Out"
:
get_vars
},
attrs
=
{
attrs
=
{
"endpoints"
:
endpoints
,
"endpoints"
:
endpoints
,
"epmap"
:
epmap
,
"epmap"
:
epmap
,
rpc_op_role_name
:
core
.
op_proto_and_checker_maker
.
OpRole
.
RPC
rpc_op_role_name
:
core
.
op_proto_and_checker_maker
.
OpRole
.
RPC
})
})
if
sync
:
return
get_vars
helper
.
append_op
(
type
=
"send_barrier"
,
attrs
=
{
"endpoints"
:
endpoints
})
def
Recv
(
endpoints
,
get_vars
):
def
Recv
(
endpoints
,
get_vars
,
sync
=
True
):
"""
"""
Rec
v layer
Rec
eive variables from server side
Args:
Args:
endpoints: comma separated IP:PORT pairs in the order
endpoints
(str)
: comma separated IP:PORT pairs in the order
of send_vars to send
of send_vars to send
send_vars: vars to send
get_vars (list): vars to get from server after send completes.
get_vars: vars to get from server after send completes.
sync (bool): whether to wait for the request to finish
Send variables to the server side, and get vars from server
Returns:
side when server have finished running server side program.
list: list of received variables
"""
"""
assert
(
type
(
send_vars
)
==
list
)
assert
(
type
(
get_vars
)
==
list
)
assert
(
type
(
get_vars
)
==
list
)
epmap
=
endpoints
.
split
(
","
)
epmap
=
endpoints
.
split
(
","
)
...
@@ -242,6 +234,9 @@ def Recv(endpoints, get_vars):
...
@@ -242,6 +234,9 @@ def Recv(endpoints, get_vars):
outputs
=
{
"Out"
:
get_vars
},
outputs
=
{
"Out"
:
get_vars
},
attrs
=
{
"endpoints"
:
endpoints
,
attrs
=
{
"endpoints"
:
endpoints
,
"epmap"
:
epmap
})
"epmap"
:
epmap
})
if
sync
:
helper
.
append_op
(
type
=
"fetch_barrier"
,
attrs
=
{
"endpoints"
:
endpoints
})
return
get_vars
def
monkey_patch_reader_methods
(
reader
):
def
monkey_patch_reader_methods
(
reader
):
...
...
python/paddle/fluid/tests/unittests/test_dist_train.py
浏览文件 @
5ea039b3
...
@@ -16,6 +16,7 @@ import os
...
@@ -16,6 +16,7 @@ import os
import
time
import
time
import
unittest
import
unittest
from
multiprocessing
import
Process
from
multiprocessing
import
Process
import
signal
import
numpy
import
numpy
...
@@ -24,9 +25,6 @@ import paddle.fluid.layers as layers
...
@@ -24,9 +25,6 @@ import paddle.fluid.layers as layers
class
TestSendOp
(
unittest
.
TestCase
):
class
TestSendOp
(
unittest
.
TestCase
):
@
unittest
.
skip
(
"This test is buggy. We cannot use time.sleep to sync processes, the connection may fail in unittest."
)
def
test_send
(
self
):
def
test_send
(
self
):
# Run init_serv in a thread
# Run init_serv in a thread
place
=
fluid
.
CPUPlace
()
place
=
fluid
.
CPUPlace
()
...
@@ -35,7 +33,9 @@ class TestSendOp(unittest.TestCase):
...
@@ -35,7 +33,9 @@ class TestSendOp(unittest.TestCase):
p
.
daemon
=
True
p
.
daemon
=
True
p
.
start
()
p
.
start
()
time
.
sleep
(
10
)
self
.
ps_timeout
=
5
self
.
_wait_ps_ready
(
p
.
pid
)
with
open
(
"/tmp/paddle.%d.port"
%
p
.
pid
,
"r"
)
as
fn
:
with
open
(
"/tmp/paddle.%d.port"
%
p
.
pid
,
"r"
)
as
fn
:
selected_port
=
int
(
fn
.
readlines
()[
0
])
selected_port
=
int
(
fn
.
readlines
()[
0
])
self
.
init_client
(
place
,
selected_port
)
self
.
init_client
(
place
,
selected_port
)
...
@@ -44,9 +44,23 @@ class TestSendOp(unittest.TestCase):
...
@@ -44,9 +44,23 @@ class TestSendOp(unittest.TestCase):
self
.
assertTrue
(
numpy
.
allclose
(
self
.
local_out
,
self
.
dist_out
))
self
.
assertTrue
(
numpy
.
allclose
(
self
.
local_out
,
self
.
dist_out
))
# FIXME(typhoonzero): find a way to gracefully shutdown the server.
# FIXME(typhoonzero): find a way to gracefully shutdown the server.
os
.
system
(
"kill -9 %d"
%
p
.
pid
)
os
.
kill
(
p
.
pid
,
signal
.
SIGKILL
)
p
.
join
()
p
.
join
()
def
_wait_ps_ready
(
self
,
pid
):
start_left_time
=
self
.
ps_timeout
sleep_time
=
0.5
while
True
:
assert
start_left_time
>=
0
,
"wait ps ready failed"
time
.
sleep
(
sleep_time
)
try
:
# the listen_and_serv_op creates a file containing the listen port
# in the /tmp directory once it is ready to process RPC calls.
os
.
stat
(
"/tmp/paddle.%d.port"
%
pid
)
return
except
os
.
error
:
start_left_time
-=
sleep_time
def
init_serv
(
self
,
place
):
def
init_serv
(
self
,
place
):
main
=
fluid
.
Program
()
main
=
fluid
.
Program
()
...
@@ -84,7 +98,10 @@ class TestSendOp(unittest.TestCase):
...
@@ -84,7 +98,10 @@ class TestSendOp(unittest.TestCase):
dtype
=
"float32"
,
dtype
=
"float32"
,
persistable
=
False
,
persistable
=
False
,
shape
=
[
32
,
32
])
shape
=
[
32
,
32
])
o
=
layers
.
Send
(
"127.0.0.1:%d"
%
port
,
[
x
],
[
get_var
])
fluid
.
initializer
.
Constant
(
value
=
2.3
)(
get_var
,
main
.
global_block
())
layers
.
Send
(
"127.0.0.1:%d"
%
port
,
[
x
])
o
=
layers
.
Recv
(
"127.0.0.1:%d"
%
port
,
[
get_var
])
exe
=
fluid
.
Executor
(
place
)
exe
=
fluid
.
Executor
(
place
)
self
.
dist_out
=
exe
.
run
(
main
,
fetch_list
=
o
)
# o is a list
self
.
dist_out
=
exe
.
run
(
main
,
fetch_list
=
o
)
# o is a list
...
...
python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
浏览文件 @
5ea039b3
...
@@ -57,17 +57,18 @@ class TestListenAndServOp(OpTest):
...
@@ -57,17 +57,18 @@ class TestListenAndServOp(OpTest):
def
setUp
(
self
):
def
setUp
(
self
):
self
.
ps_timeout
=
5
self
.
ps_timeout
=
5
self
.
ip
=
"127.0.0.1"
self
.
ip
=
"127.0.0.1"
self
.
port
=
"
6173
"
self
.
port
=
"
0
"
self
.
trainers
=
1
self
.
trainers
=
1
self
.
trainer_id
=
1
self
.
trainer_id
=
0
def
_start_pserver
(
self
,
use_cuda
,
sync_mode
):
def
_start_pserver
(
self
,
use_cuda
,
sync_mode
):
p
=
Process
(
p
=
Process
(
target
=
run_pserver
,
target
=
run_pserver
,
args
=
(
use_cuda
,
sync_mode
,
self
.
ip
,
self
.
port
,
self
.
trainers
,
args
=
(
use_cuda
,
sync_mode
,
self
.
ip
,
self
.
port
,
self
.
trainers
,
self
.
trainer_id
))
self
.
trainer_id
))
p
.
daemon
=
True
p
.
start
()
p
.
start
()
return
p
.
pid
return
p
def
_wait_ps_ready
(
self
,
pid
):
def
_wait_ps_ready
(
self
,
pid
):
start_left_time
=
self
.
ps_timeout
start_left_time
=
self
.
ps_timeout
...
@@ -89,18 +90,20 @@ class TestListenAndServOp(OpTest):
...
@@ -89,18 +90,20 @@ class TestListenAndServOp(OpTest):
def
test_handle_signal_in_serv_op
(
self
):
def
test_handle_signal_in_serv_op
(
self
):
# run pserver on CPU in sync mode
# run pserver on CPU in sync mode
p
id
=
self
.
_start_pserver
(
False
,
True
)
p
1
=
self
.
_start_pserver
(
False
,
True
)
self
.
_wait_ps_ready
(
pid
)
self
.
_wait_ps_ready
(
p
1
.
p
id
)
# raise SIGTERM to pserver
# raise SIGTERM to pserver
os
.
kill
(
pid
,
signal
.
SIGTERM
)
os
.
kill
(
p1
.
pid
,
signal
.
SIGKILL
)
p1
.
join
()
# run pserver on CPU in async mode
# run pserver on CPU in async mode
p
id
=
self
.
_start_pserver
(
False
,
False
)
p
2
=
self
.
_start_pserver
(
False
,
False
)
self
.
_wait_ps_ready
(
pid
)
self
.
_wait_ps_ready
(
p
2
.
p
id
)
# raise SIGTERM to pserver
# raise SIGTERM to pserver
os
.
kill
(
pid
,
signal
.
SIGTERM
)
os
.
kill
(
p2
.
pid
,
signal
.
SIGKILL
)
p2
.
join
()
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录