Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
4cade607
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
4cade607
编写于
12月 26, 2017
作者:
武
武毅
提交者:
GitHub
12月 26, 2017
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #6983 from typhoonzero/fix_sendrecv_ut
Fix sendrecv ut
上级
49437f1a
d2ded51a
变更
6
隐藏空白更改
内联
并排
Showing
6 changed file
with
55 addition
and
31 deletion
+55
-31
paddle/framework/operator.h
paddle/framework/operator.h
+3
-0
paddle/operators/detail/send_recv_impl.h
paddle/operators/detail/send_recv_impl.h
+2
-0
paddle/operators/recv_op.cc
paddle/operators/recv_op.cc
+19
-6
paddle/operators/send_op.cc
paddle/operators/send_op.cc
+12
-6
paddle/operators/send_recv_op_test.cc
paddle/operators/send_recv_op_test.cc
+16
-18
python/paddle/v2/fluid/distribute_transpiler.py
python/paddle/v2/fluid/distribute_transpiler.py
+3
-1
未找到文件。
paddle/framework/operator.h
浏览文件 @
4cade607
...
...
@@ -89,6 +89,9 @@ class OperatorBase {
/// Net will call this function to Run an op.
virtual
void
Run
(
const
Scope
&
scope
,
const
platform
::
Place
&
place
)
const
=
0
;
// FIXME(typhoonzero): this is only used for recv_op to stop event_loop.
virtual
void
Stop
()
{}
virtual
bool
IsNetOp
()
const
{
return
false
;
}
virtual
bool
SupportGPU
()
const
{
return
false
;
}
...
...
paddle/operators/detail/send_recv_impl.h
浏览文件 @
4cade607
...
...
@@ -62,6 +62,8 @@ class SendRecvServerImpl final : public SendRecvService::Service {
const
TensorWithName
Get
()
{
return
this
->
var_recv_queue_
.
Pop
();
}
void
Push
(
const
TensorWithName
&
msg
)
{
this
->
var_recv_queue_
.
Push
(
msg
);
}
private:
// received variable from RPC, operators fetch variable from this queue.
SimpleBlockQueue
<
TensorWithName
>
var_recv_queue_
;
...
...
paddle/operators/recv_op.cc
浏览文件 @
4cade607
...
...
@@ -28,6 +28,8 @@ limitations under the License. */
#include "paddle/operators/detail/send_recv_impl.h"
#include "paddle/operators/detail/simple_block_queue.h"
#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
namespace
paddle
{
namespace
operators
{
...
...
@@ -39,7 +41,7 @@ void RunServer(Server **rpc_server,
builder
.
RegisterService
(
service
.
get
());
std
::
unique_ptr
<
Server
>
server
(
builder
.
BuildAndStart
());
*
rpc_server
=
server
.
get
();
LOG
(
INFO
)
<<
"Server listening on "
<<
server_address
<<
std
::
endl
;
LOG
(
INFO
)
<<
"Server listening on "
<<
server_address
;
server
->
Wait
();
}
...
...
@@ -57,7 +59,10 @@ class RecvOp : public framework::OperatorBase {
}
}
virtual
~
RecvOp
()
{
void
Stop
()
override
{
detail
::
TensorWithName
term_msg
;
term_msg
.
first
=
LISTEN_TERMINATE_MESSAGE
;
rpc_service_
->
Push
(
term_msg
);
rpc_server_
->
Shutdown
();
server_thread_
->
join
();
}
...
...
@@ -83,13 +88,18 @@ class RecvOp : public framework::OperatorBase {
size_t
param_count
=
param_list
.
size
();
rpc_service_
->
Reset
();
// TODO(typhoonzero): change this to a while_op for every cluster-batch.
while
(
true
)
{
bool
exit_flag
=
false
;
while
(
!
exit_flag
)
{
// Get from multiple trainers, we don't care about order in which
// the gradient arrives, just add suffix 0~n then average the gradient.
for
(
size_t
i
=
0
;
i
<
param_count
*
trainer_count
;
++
i
)
{
// blocking get one var from client.
const
detail
::
TensorWithName
&
v
=
rpc_service_
->
Get
();
auto
grad_var_name
=
v
.
first
;
if
(
grad_var_name
==
LISTEN_TERMINATE_MESSAGE
)
{
exit_flag
=
true
;
break
;
}
auto
it
=
std
::
find
(
grad_list
.
begin
(),
grad_list
.
end
(),
grad_var_name
);
std
::
string
param_var_name
;
if
(
it
!=
grad_list
.
end
())
{
...
...
@@ -114,8 +124,11 @@ class RecvOp : public framework::OperatorBase {
auto
*
tensor
=
var
->
GetMutable
<
framework
::
LoDTensor
>
();
// FIXME(typhoonzero): do not copy
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Get
();
auto
&
dev_ctx
=
*
pool
.
Borrow
(
place
);
framework
::
CopyFrom
(
v
.
second
,
place
,
dev_ctx
,
tensor
);
auto
&
dev_ctx
=
*
pool
.
Borrow
(
dev_place
);
framework
::
CopyFrom
(
v
.
second
,
dev_place
,
dev_ctx
,
tensor
);
}
if
(
exit_flag
)
{
break
;
}
rpc_service_
->
Reset
();
...
...
@@ -123,7 +136,7 @@ class RecvOp : public framework::OperatorBase {
framework
::
proto
::
ProgramDesc
program_desc
;
program_desc
.
ParseFromString
(
program_str
);
framework
::
ProgramDesc
program
(
program_desc
);
framework
::
Executor
executor
(
place
);
framework
::
Executor
executor
(
dev_
place
);
// Run sub graph to get optimized tensor
try
{
executor
.
Run
(
program
,
&
recv_scope
,
0
,
/*global_block*/
...
...
paddle/operators/send_op.cc
浏览文件 @
4cade607
...
...
@@ -41,9 +41,11 @@ class SendOp : public framework::OperatorBase {
grpc
::
CreateChannel
(
ep
,
grpc
::
InsecureChannelCredentials
())));
}
}
void
Run
(
const
framework
::
Scope
&
scope
,
const
platform
::
DeviceContext
&
dev_ctx
)
const
override
{
const
platform
::
Place
&
dev_place
)
const
override
{
auto
ins
=
Inputs
(
"X"
);
auto
outs
=
Outputs
(
"Out"
);
std
::
vector
<
std
::
string
>
epmap
=
Attr
<
std
::
vector
<
std
::
string
>>
(
"epmap"
);
// TODO(typhoonzero): use async calls to send multiple variables asynchronously.
for
(
size_t
i
=
0
;
i
<
ins
.
size
();
++
i
)
{
...
...
@@ -54,10 +56,10 @@ class SendOp : public framework::OperatorBase {
}
// TODO(typhoonzero): support async optimization
client_map_
[
epmap
[
0
]]
->
Wait
();
for
(
size_t
i
=
0
;
i
<
in
s
.
size
();
++
i
)
{
bool
ret
=
client_map_
[
epmap
[
i
]]
->
GetVariable
(
scope
,
in
s
[
i
]);
for
(
size_t
i
=
0
;
i
<
out
s
.
size
();
++
i
)
{
bool
ret
=
client_map_
[
epmap
[
i
]]
->
GetVariable
(
scope
,
out
s
[
i
]);
if
(
!
ret
)
{
LOG
(
ERROR
)
<<
"GetVariable error: "
<<
in
s
[
i
];
LOG
(
ERROR
)
<<
"GetVariable error: "
<<
out
s
[
i
];
}
}
}
...
...
@@ -72,6 +74,8 @@ class SendOpMaker : public framework::OpProtoAndCheckerMaker {
SendOpMaker
(
OpProto
*
proto
,
OpAttrChecker
*
op_checker
)
:
OpProtoAndCheckerMaker
(
proto
,
op_checker
)
{
AddInput
(
"X"
,
"(Tensor) Input tensor to be send"
).
AsDuplicable
();
AddOutput
(
"Out"
,
"(Tensor) Output tensor to get from server"
)
.
AsDuplicable
();
AddComment
(
R"DOC(
Recv operator
...
...
@@ -79,11 +83,13 @@ This operator will recv tensor from send_op
)DOC"
);
AddAttr
<
std
::
vector
<
std
::
string
>>
(
"endpoints"
,
"(string vector, default 127.0.0.1:6164)"
"Server endpoints to send variables to."
);
"Server endpoints to send variables to."
)
.
SetDefault
({});
AddAttr
<
std
::
vector
<
std
::
string
>>
(
"epmap"
,
"(string vector, default 127.0.0.1:6164)"
"Server endpoints in the order of input "
"variables for mapping"
);
"variables for mapping"
)
.
SetDefault
({});
}
};
...
...
paddle/operators/send_recv_op_test.cc
浏览文件 @
4cade607
...
...
@@ -12,9 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// TODO(typhoonzero): add python bindings for this test as
// a RemoteOptimizer.
#include <unistd.h>
#include <string>
#include <thread>
...
...
@@ -86,18 +83,19 @@ void StartServerNet() {
paddle
::
framework
::
ProgramDesc
program
;
paddle
::
framework
::
BlockDesc
*
block
=
program
.
MutableBlock
(
0
);
// X for server side tensors, RX for received tensors, must be of same shape.
AddOp
(
"sum"
,
{{
"X"
,
{
"x0"
,
"x1"
}}},
{{
"Out"
,
{
"
Out
"
}}},
{},
block
);
AddOp
(
"sum"
,
{{
"X"
,
{
"x0"
,
"x1"
}}},
{{
"Out"
,
{
"
x0
"
}}},
{},
block
);
paddle
::
framework
::
AttributeMap
attrs
;
attrs
.
insert
({
"endpoint"
,
std
::
string
(
"127.0.0.1:6174"
)});
attrs
.
insert
({
"ParamList"
,
std
::
vector
<
std
::
string
>
({
"x0"
})});
attrs
.
insert
({
"GradList"
,
std
::
vector
<
std
::
string
>
({
"x1"
})});
std
::
string
program_proto
;
PADDLE_ENFORCE
(
program
.
Proto
()
->
SerializeToString
(
&
program_proto
));
attrs
.
insert
({
"OptimizeProgram"
,
program_proto
});
recv_op
=
paddle
::
framework
::
OpRegistry
::
CreateOp
(
"recv"
,
{{
"RX"
,
{
"x0"
,
"x1"
}}},
{{
"Out"
,
{
"Out"
}}},
attrs
);
paddle
::
platform
::
CPUDeviceContext
ctx
(
place
);
recv_op
->
Run
(
scope
,
ctx
);
recv_op
=
paddle
::
framework
::
OpRegistry
::
CreateOp
(
"recv"
,
{{
"RX"
,
{
"x1"
}}},
{},
attrs
);
recv_op
->
Run
(
scope
,
place
);
}
TEST
(
SendRecvOp
,
CPU
)
{
...
...
@@ -109,25 +107,25 @@ TEST(SendRecvOp, CPU) {
InitTensorsInScope
(
scope
,
place
);
paddle
::
framework
::
AttributeMap
attrs
;
attrs
.
insert
({
"endpoint
"
,
std
::
string
(
"127.0.0.1:6174"
)});
attrs
.
insert
({
"endpoint
s"
,
std
::
vector
<
std
::
string
>
({
"127.0.0.1:6174"
}
)});
attrs
.
insert
({
"epmap"
,
std
::
vector
<
std
::
string
>
({
"127.0.0.1:6174"
})});
auto
send_op
=
paddle
::
framework
::
OpRegistry
::
CreateOp
(
"send"
,
{{
"X"
,
{
"x0"
,
"x1"
}}},
{{
"Out"
,
{
"Out"
}}},
attrs
);
paddle
::
platform
::
CPUDeviceContext
ctx
(
place
);
send_op
->
Run
(
scope
,
ctx
);
"send"
,
{{
"X"
,
{
"x1"
}}},
{{
"Out"
,
{
"x0"
}}},
attrs
);
send_op
->
Run
(
scope
,
place
);
auto
in_var
=
scope
.
Var
(
"x
0
"
);
auto
in_var
=
scope
.
Var
(
"x
1
"
);
auto
tensor
=
in_var
->
GetMutable
<
paddle
::
framework
::
LoDTensor
>
();
float
*
expected
=
tensor
->
data
<
float
>
();
auto
out_var
=
scope
.
Var
(
"Out"
);
auto
out_var
=
scope
.
Var
(
"x0"
);
auto
target
=
out_var
->
GetMutable
<
paddle
::
framework
::
LoDTensor
>
();
//
send fail cause output is none.
//
x1 * 2 == x0
EXPECT_NE
(
target
->
memory_size
(),
size_t
(
0
));
float
*
actual
=
target
->
data
<
float
>
();
for
(
int64_t
i
=
0
;
i
<
target
->
numel
();
++
i
)
{
EXPECT_EQ
(
expected
[
i
]
*
2
,
actual
[
i
]);
}
recv_op
.
reset
();
// dtor can shutdown and join server thread.
recv_op
->
Stop
();
server_thread
.
join
();
// recv_op.reset();
}
python/paddle/v2/fluid/distribute_transpiler.py
浏览文件 @
4cade607
...
...
@@ -141,16 +141,18 @@ class DistributeTranspiler:
self
.
param_grad_map
=
split_method
(
params_and_grads
,
pserver_endpoints
)
send_op_ordered_inputs
=
[]
send_op_ordered_outputs
=
[]
epmap
=
[]
for
ep
,
v
in
self
.
param_grad_map
.
iteritems
():
send_op_ordered_inputs
.
extend
(
v
[
"grads"
])
send_op_ordered_outputs
.
extend
(
v
[
"params"
])
for
i
in
v
[
"grads"
]:
epmap
.
append
(
ep
)
send_op
=
program
.
global_block
().
append_op
(
type
=
"send"
,
inputs
=
{
"X"
:
send_op_ordered_inputs
},
# inputs is a list of tensors to be send
outputs
=
{},
outputs
=
{
"Out"
:
send_op_ordered_outputs
},
attrs
=
{
"endpoints"
:
pserver_endpoints
,
"epmap"
:
epmap
})
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录