机器未来 / Paddle (forked from PaddlePaddle / Paddle)
Commit 489b9695
Authored Dec 11, 2017 by typhoonzero
Parent: 308491a9

wip for testing

Showing 6 changed files with 81 additions and 33 deletions (+81 -33)
paddle/operators/detail/recv_impl.cc                                   +10  -6
paddle/operators/detail/send_recv.proto                                 +1  -0
paddle/operators/detail/send_recv_impl.h                                +8  -8
paddle/operators/recv_op.cc                                            +39  -8
python/paddle/v2/fluid/executor.py                                     +21 -10
python/paddle/v2/fluid/tests/book/test_recognize_digits_conv_dist.py    +2  -1
paddle/operators/detail/recv_impl.cc

```diff
@@ -21,16 +21,20 @@ namespace detail {
 Status SendRecvServerImpl::SendVariable(ServerContext *context,
                                         const VariableMessage *in_var,
                                         VariableMessage *out_var) {
-  framework::LoDTensor t;
-  // TODO(typhoonzero): desirealize in_tensor and run pserver network.
+  // TODO(typhoonzero): support different variable types.
   std::istringstream iss(in_var->serialized());
+  framework::LoDTensor t;
   framework::DeserializeFromStream(iss, &t);
-  lodtensor_queue_.Push(std::move(t));
+  TensorWithName tensor_with_name =
+      std::make_pair(in_var->varname(), std::move(t));
+  var_recv_queue_.Push(std::move(tensor_with_name));
   // Block util the sub graph is done.
-  t = lodtensor_return_queue_.Pop();
+  auto out_tensor_with_name = var_return_queue_.Pop();
   std::ostringstream oss;
-  // FIXME(typhoonzero): get context from op.
-  framework::SerializeToStream(oss, t, platform::CPUDeviceContext());
+  framework::SerializeToStream(oss, out_tensor_with_name.second,
+                               platform::CPUDeviceContext());
   std::string *varname = out_var->mutable_varname();
   *varname = in_var->varname();
   std::string *serialized = out_var->mutable_serialized();
```
paddle/operators/detail/send_recv.proto

```diff
@@ -19,6 +19,7 @@ package sendrecv;
 service SendRecvService {
+  // For parameter server round-robin like hashing, do not split tensors.
   // Send and recv only one tensor
   // TODO(typhoonzero): add streaming API
   rpc SendVariable(VariableMessage) returns (VariableMessage) {}
 }
```
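The handler in recv_impl.cc above serves this unary SendVariable RPC. For orientation, here is a minimal client-side sketch using the stub that protoc/gRPC would generate from this proto; the generated header name, the gRPC include path, the endpoint address, and the payload values are assumptions for illustration, not code from this commit.

```cpp
// Sketch only: assumes headers generated from send_recv.proto and a pserver
// listening on 127.0.0.1:6174; payload contents are made up.
#include <grpcpp/grpcpp.h>  // older gRPC releases use <grpc++/grpc++.h>
#include <iostream>
#include <memory>
#include <string>

#include "send_recv.grpc.pb.h"  // assumed name of the generated header

int main() {
  auto channel = grpc::CreateChannel("127.0.0.1:6174",
                                     grpc::InsecureChannelCredentials());
  std::unique_ptr<sendrecv::SendRecvService::Stub> stub =
      sendrecv::SendRecvService::NewStub(channel);

  sendrecv::VariableMessage in_var;
  in_var.set_varname("fc_0.w_0@GRAD");                    // illustrative name
  in_var.set_serialized("<serialized LoDTensor bytes>");  // placeholder payload

  sendrecv::VariableMessage out_var;
  grpc::ClientContext ctx;
  // Blocks until the server-side SendVariable handler pops the updated
  // variable from its return queue and serializes the reply.
  grpc::Status status = stub->SendVariable(&ctx, in_var, &out_var);
  if (status.ok()) {
    std::cout << "got updated var: " << out_var.varname() << std::endl;
  } else {
    std::cout << "rpc failed: " << status.error_message() << std::endl;
  }
  return 0;
}
```

The call blocks for the whole round trip, which is exactly the behavior the queue changes in this commit rely on.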
paddle/operators/detail/send_recv_impl.h

```diff
@@ -48,6 +48,8 @@ namespace paddle {
 namespace operators {
 namespace detail {
 
+typedef std::pair<std::string, framework::LoDTensor> TensorWithName;
+
 class SendRecvServerImpl final : public SendRecvService::Service {
  public:
   explicit SendRecvServerImpl() {}
@@ -55,17 +57,15 @@ class SendRecvServerImpl final : public SendRecvService::Service {
   Status SendVariable(ServerContext *context, const VariableMessage *in_var,
                       VariableMessage *out_var) override;
 
-  const framework::LoDTensor Get() { return this->lodtensor_queue_.Pop(); }
+  const TensorWithName Get() { return this->var_recv_queue_.Pop(); }
 
-  void Push(const framework::LoDTensor &tensor) {
-    this->lodtensor_return_queue_.Push(tensor);
-  }
+  void Push(const TensorWithName &var) { this->var_return_queue_.Push(var); }
 
  private:
-  SimpleBlockQueue<framework::LoDTensor> lodtensor_queue_;
-  SimpleBlockQueue<framework::LoDTensor> lodtensor_return_queue_;
-  SimpleBlockQueue<framework::SelectedRows> selected_rows_queue_;
-  SimpleBlockQueue<framework::SelectedRows> selected_rows_return_queue_;
+  // received variable from RPC, operators fetch variable from this queue.
+  SimpleBlockQueue<TensorWithName> var_recv_queue_;
+  // calculated variable should push to this queue.
+  SimpleBlockQueue<TensorWithName> var_return_queue_;
 };
 
 // RPCClient is a class to send tensors to pserver sub-network
```
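SendRecvServerImpl hands data between the gRPC thread and the recv_op thread through two SimpleBlockQueue<TensorWithName> members: SendVariable pushes the incoming (name, tensor) pair into var_recv_queue_ and then blocks on var_return_queue_, while RecvOp::Run pops from the first and pushes the updated parameter into the second. The sketch below reproduces that hand-off with a self-contained blocking queue; it assumes nothing about Paddle's actual SimpleBlockQueue beyond blocking Push/Pop, and the BlockingQueue and NamedBlob names plus the fake optimizer step are illustrative only.

```cpp
// Minimal sketch of the request/return hand-off between an RPC handler thread
// and an operator thread. Not Paddle code; types and names are stand-ins.
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <queue>
#include <string>
#include <thread>
#include <utility>
#include <vector>

template <typename T>
class BlockingQueue {
 public:
  void Push(T v) {
    {
      std::lock_guard<std::mutex> lock(mu_);
      q_.push(std::move(v));
    }
    cv_.notify_one();
  }
  T Pop() {
    std::unique_lock<std::mutex> lock(mu_);
    cv_.wait(lock, [this] { return !q_.empty(); });
    T v = std::move(q_.front());
    q_.pop();
    return v;
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  std::queue<T> q_;
};

// Stand-in for TensorWithName: a variable name plus its values.
using NamedBlob = std::pair<std::string, std::vector<float>>;

int main() {
  BlockingQueue<NamedBlob> var_recv_queue;    // RPC thread -> op thread
  BlockingQueue<NamedBlob> var_return_queue;  // op thread -> RPC thread

  // Plays the role of RecvOp::Run: pop a received gradient, "optimize" it,
  // then push the updated parameter back.
  std::thread op_thread([&] {
    NamedBlob grad = var_recv_queue.Pop();
    for (float &x : grad.second) x *= 0.5f;  // pretend optimizer step
    var_return_queue.Push({"param_for_" + grad.first, grad.second});
  });

  // Plays the role of SendVariable: push the request, block for the reply.
  var_recv_queue.Push({"fc_0.w_0@GRAD", {1.f, 2.f, 3.f}});
  NamedBlob reply = var_return_queue.Pop();
  std::cout << reply.first << " first value: " << reply.second[0] << std::endl;

  op_thread.join();
  return 0;
}
```

Keeping request and reply traffic in separate queues is what lets the RPC handler stay blocked until the optimize sub-program has produced the value it should send back.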
paddle/operators/recv_op.cc

```diff
@@ -14,6 +14,7 @@
 #include <stdint.h>
 #include <sys/stat.h>
 #include <iostream>
+#include <ostream>
 #include <thread>
@@ -63,14 +64,32 @@ class RecvOp : public framework::OperatorBase {
   void Run(const framework::Scope &scope,
            const platform::DeviceContext &dev_ctx) const override {
-    // blocking get one var from client.
-    const framework::LoDTensor &t = rpc_service_->Get();
-    framework::Scope &recv_scope = scope.NewScope();
+    // blocking get one var from client.
+    const detail::TensorWithName &v = rpc_service_->Get();
+    auto grad_var_name = v.first;
+    // framework::Scope &recv_scope = scope.NewScope();
+    auto param_list = Attr<std::vector<std::string>>("ParamList");
+    auto grad_list = Attr<std::vector<std::string>>("GradList");
+    auto it = std::find(grad_list.begin(), grad_list.end(), grad_var_name);
+    std::string param_var_name;
+    if (it != grad_list.end()) {
+      param_var_name = param_list[it - grad_list.begin()];
+    }
     // set graph input var
-    auto *var = recv_scope.Var(Input("RX"));
+    auto input_grad = Input("RX");
+    // FIXME(typhoonzero): Find the parameter name from input grad name
+    // rename X -> Param
+    // rename RX -> Grad
+    auto *var = recv_scope.FindVar(input_grad);
     auto *tensor = var->GetMutable<framework::LoDTensor>();
+    recv_scope.Rename(param_var_name, "Param");
+    recv_scope.Rename("RX", "Grad");
     // FIXME(typhoonzero): do not copy
-    framework::CopyFrom(t, dev_ctx.GetPlace(), dev_ctx, tensor);
+    framework::CopyFrom(v.second, dev_ctx.GetPlace(), dev_ctx, tensor);
 
     std::string program_str = Attr<std::string>("OptimizeProgram");
     framework::ProgramDesc program_desc;
@@ -81,9 +100,14 @@ class RecvOp : public framework::OperatorBase {
     executor.Run(program, &recv_scope, 0, /*global_block*/
                  false /*create_local_scope*/);
-    auto *out_var = recv_scope.FindVar("Out");
     // push back
-    rpc_service_->Push(out_var->Get<framework::LoDTensor>());
+    auto *out_var = recv_scope.FindVar("Param");
+    detail::TensorWithName out;
+    out.first = param_var_name;
+    out.second = out_var->Get<framework::LoDTensor>();
+    rpc_service_->Push(out);
+    // rename back the params
+    recv_scope.Rename("Param", param_var_name);
+    recv_scope.Rename("Grad", "RX");
   }
 
  protected:
@@ -93,13 +117,14 @@ class RecvOp : public framework::OperatorBase {
   // grpc send/recv service implement to register.
   std::shared_ptr<detail::SendRecvServerImpl> rpc_service_;
   std::shared_ptr<std::thread> server_thread_;
+  framework::Scope const *recv_scope_{nullptr};
 };
 
 class RecvOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   RecvOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("RX", "(Tensor) Input tensor to be saved");
+    AddInput("RX", "(Tensor) Input tensor to be optimized").AsDuplicable();
     AddComment(R"DOC(
 Recv operator
@@ -112,6 +137,12 @@ This operator will recv tensor from send_op
         .AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
     AddAttr<std::string>("OptimizeProgram", "type string",
                          "Serialized ProgramDesc string for recv to run.");
+    AddAttr<std::vector<std::string>>(
+        "ParamList", "type list of string",
+        "grad->param name mapping to find which param to optimize.");
+    AddAttr<std::vector<std::string>>(
+        "GradList", "type list of string",
+        "grad->param name mapping to find which param to optimize.");
   }
 };
```
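RecvOp::Run now resolves which parameter an incoming gradient belongs to by looking the gradient name up in the parallel ParamList/GradList attributes. The standalone sketch below shows just that lookup; the variable names are made up for illustration.

```cpp
// Sketch of the grad-name -> param-name lookup that RecvOp::Run performs with
// its "ParamList"/"GradList" attributes. Names below are illustrative only.
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

int main() {
  // Parallel lists: grad_list[i] is the gradient of param_list[i].
  std::vector<std::string> param_list = {"fc_0.w_0", "fc_0.b_0"};
  std::vector<std::string> grad_list = {"fc_0.w_0@GRAD", "fc_0.b_0@GRAD"};

  std::string grad_var_name = "fc_0.b_0@GRAD";  // e.g. v.first from the queue

  auto it = std::find(grad_list.begin(), grad_list.end(), grad_var_name);
  std::string param_var_name;
  if (it != grad_list.end()) {
    param_var_name = param_list[it - grad_list.begin()];
  }
  std::cout << grad_var_name << " updates " << param_var_name << std::endl;
  return 0;
}
```

Because the two lists are index-aligned, the offset of the gradient name in GradList is also the offset of the matching parameter in ParamList.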
python/paddle/v2/fluid/executor.py

```diff
 import numpy as np
 from . import core
-from framework import Program, default_main_program
+from framework import Program, default_main_program, Parameter, Variable
 import distribute_planner
 
 __all__ = ['Executor', 'g_scope']
@@ -91,7 +91,7 @@ class Executor(object):
         # FIXME(typhoonzero): send to different servers can run in parrallel.
         send_op = program.global_block().append_op(
             type="send",
-            inputs={"X": self.param_grad_map[ep]["params"]
+            inputs={"X": self.param_grad_map[ep]["grads"]
                     },  # inputs is a list of tensors to be send
             outputs={},
             attrs={"endpoint": ep})
@@ -102,9 +102,20 @@ class Executor(object):
     def get_pserver_program(self, endpoint):
         pserver_program = Program()
-        for param in self.param_grad_map[endpoint]["params"]:
-            pserver_program.global_block().create_parameter(**param.__dict__)
+        for v in self.param_grad_map[endpoint]["params"]:
+            assert isinstance(v, Parameter)
+            new_p = Parameter(
+                block=pserver_program.global_block(),
+                shape=v.shape,
+                dtype=v.dtype,
+                type=v.type,
+                lod_level=v.lod_level,
+                stop_gradient=v.stop_gradient,
+                trainable=v.trainable,
+                optimize_attr=v.optimize_attr,
+                regularizer=v.regularizer,
+                name=v.name)
+            pserver_program.global_block().vars[new_p.name] = new_p
         pserver_program.global_block().append_op(
             type="recv",
@@ -112,12 +123,12 @@ class Executor(object):
                 self.param_grad_map[endpoint]["grads"]},  # grads to recv
             outputs={},
             attrs={
-                "OptimizeProgram": self.optimize_sub_program.to_string(),
-                "endpoint": endpoint
+                "OptimizeProgram": self.optimize_sub_program.to_string(True),
+                "endpoint": endpoint,
+                "ParamList": self.param_grad_map[endpoint]["params"],
+                "GradList": self.param_grad_map[endpoint]["grads"]
             })
-    def get_trainer_program(self):
-        return default_main_program()
+        return pserver_program
 
     def aslodtensor(self, data):
         def accumulate(data):
```
python/paddle/v2/fluid/tests/book/test_recognize_digits_conv_dist.py

```diff
@@ -45,7 +45,8 @@ pserver_endpoint = os.getenv("PSERVER")
 if pserver_endpoint:
     pserver_prog = exe.get_pserver_program(pserver_endpoint)
     exe.run(fluid.default_startup_program())
-    exe.run(pserver_prog)
+    while True:
+        exe.run(pserver_prog)
 else:
     feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
     exe.run(fluid.default_startup_program())
```