Crayon鑫 / Paddle (forked from PaddlePaddle / Paddle; in sync with the fork source)
Commit b4cd7f3d
Authored Dec 12, 2017 by typhoonzero

wip need ut

Parent 489b9695
Showing 8 changed files with 87 additions and 38 deletions (+87 -38)
paddle/operators/detail/send_impl.cc                                   +1   -0
paddle/operators/recv_op.cc                                            +16  -10
paddle/operators/send_op.cc                                            +1   -0
paddle/pybind/protobuf.cc                                              +6   -0
python/paddle/v2/fluid/distribute_planner.py                           +4   -4
python/paddle/v2/fluid/executor.py                                     +49  -23
python/paddle/v2/fluid/framework.py                                    +8   -0
python/paddle/v2/fluid/tests/book/test_recognize_digits_conv_dist.py   +2   -1
paddle/operators/detail/send_impl.cc

@@ -37,6 +37,7 @@ bool RPCClient::SendVariable(const framework::Scope& scope,
   msg.set_serialized(oss.str());
   Status status = stub_->SendVariable(&context, msg, &out_msg);
   if (!status.ok()) {
+    LOG(ERROR) << "gRPC error: " << status.error_message();
     return false;
   }
   std::istringstream iss(out_msg.serialized());
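The added LOG line makes a failed RPC observable: before it, SendVariable discarded the reason carried by grpc::Status and simply returned false. For comparison, a minimal Python sketch of the same check-and-log pattern (illustrative only; `stub_method` stands in for any generated unary stub call and is not Paddle API):

import logging

import grpc


def call_with_error_log(stub_method, request):
    # Invoke a unary gRPC method; surface the failure reason before giving
    # up, mirroring: LOG(ERROR) << "gRPC error: " << status.error_message();
    try:
        return stub_method(request, timeout=5.0)
    except grpc.RpcError as err:
        # err.details() / err.code() carry what Status::error_message() /
        # Status::error_code() carry on the C++ side.
        logging.error("gRPC error: %s (%s)", err.details(), err.code())
        return None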
paddle/operators/recv_op.cc

@@ -64,12 +64,12 @@ class RecvOp : public framework::OperatorBase {
   void Run(const framework::Scope& scope,
            const platform::DeviceContext& dev_ctx) const override {
+    // FIXME(typhoonzero): no new scopes for every run.
     framework::Scope& recv_scope = scope.NewScope();
     // blocking get one var from client.
     const detail::TensorWithName& v = rpc_service_->Get();
     auto grad_var_name = v.first;
-    // framework::Scope &recv_scope = scope.NewScope();
     auto param_list = Attr<std::vector<std::string>>("ParamList");
     auto grad_list = Attr<std::vector<std::string>>("GradList");
     auto it = std::find(grad_list.begin(), grad_list.end(), grad_var_name);

@@ -77,16 +77,23 @@ class RecvOp : public framework::OperatorBase {
     if (it != grad_list.end()) {
       param_var_name = param_list[it - grad_list.begin()];
     }
-    // set graph input var
-    auto input_grad = Input("RX");
+    // find input by "grad_var_name"
+    // auto inputs = Inputs("RX");
     // FIXME(typhoonzero): Find the parameter name from input grad name
     // rename X -> Param
     // rename RX -> Grad
-    auto* var = recv_scope.FindVar(input_grad);
+    LOG(ERROR) << "recved grad: " << grad_var_name
+               << " param: " << param_var_name;
+    auto* var = recv_scope.Var(grad_var_name);
     auto* tensor = var->GetMutable<framework::LoDTensor>();
-    recv_scope.Rename(param_var_name, "Param");
-    recv_scope.Rename("RX", "Grad");
+    // Param is in parent scope, put it in current scope.
+    auto* param_var = recv_scope.FindVar(param_var_name);
+    auto param_scope = recv_scope.FindScope(param_var);
+    param_scope->Rename(param_var_name, "Param");
+    recv_scope.Rename(grad_var_name, "Grad");
     // FIXME(typhoonzero): do not copy
     framework::CopyFrom(v.second, dev_ctx.GetPlace(), dev_ctx, tensor);

@@ -100,14 +107,14 @@ class RecvOp : public framework::OperatorBase {
     executor.Run(program, &recv_scope, 0, /*global_block*/
                  false /*create_local_scope*/);
-    auto* out_var = recv_scope.FindVar("Param");
+    auto* out_var = recv_scope.FindVar("ParamOut");
     detail::TensorWithName out;
     out.first = param_var_name;
     out.second = out_var->Get<framework::LoDTensor>();
     rpc_service_->Push(out);
     // rename back the params
-    recv_scope.Rename("Param", param_var_name);
-    recv_scope.Rename("Grad", "RX");
+    param_scope.Rename("Param", param_var_name);
+    recv_scope.Rename("Grad", grad_var_name);
   }

 protected:

@@ -117,7 +124,6 @@ class RecvOp : public framework::OperatorBase {
   // grpc send/recv service implement to register.
   std::shared_ptr<detail::SendRecvServerImpl> rpc_service_;
   std::shared_ptr<std::thread> server_thread_;
-  framework::Scope const* recv_scope_{nullptr};
 };

 class RecvOpMaker : public framework::OpProtoAndCheckerMaker {
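The rename choreography exists because the attached optimize sub-program refers to its variables by generic names ("Param", "Grad", "ParamOut"), while the received tensor arrives under a concrete gradient name. RecvOp renames the real variables to the generic names, runs the program, reads back "ParamOut", and renames everything back. A small Python simulation of that idea, using a plain dict as a stand-in for framework::Scope (names and helpers here are illustrative, not Paddle API):

def run_optimize_with_renames(scope, param_name, grad_name, optimize_fn):
    # Expose concrete variables under the generic names the optimize
    # sub-program expects (RecvOp uses Scope::Rename for this) ...
    scope["Param"] = scope.pop(param_name)
    scope["Grad"] = scope.pop(grad_name)
    try:
        optimize_fn(scope)  # stands in for executor.Run(OptimizeProgram, ...)
    finally:
        # ... then rename back, as the tail of RecvOp::Run does.
        scope[param_name] = scope.pop("Param")
        scope[grad_name] = scope.pop("Grad")


# Usage sketch: an SGD-like update writes "ParamOut", which RecvOp then
# pushes back to the trainer via rpc_service_->Push(out).
scope = {"fc.w": [1.0, 2.0], "fc.w@GRAD": [0.1, 0.2]}
run_optimize_with_renames(
    scope, "fc.w", "fc.w@GRAD",
    lambda s: s.update(
        ParamOut=[p - 0.01 * g for p, g in zip(s["Param"], s["Grad"])]))
print(scope["ParamOut"])  # approximately [0.999, 1.998]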
paddle/operators/send_op.cc

@@ -47,6 +47,7 @@ class SendOp : public framework::OperatorBase {
     // TODO(typhoonzero): currently it's non-blocking,
     // should block until server responds.
     for (auto in : ins) {
+      LOG(ERROR) << "sending grad: " << in;
       bool ret = client_->SendVariable(scope, in, in);
       if (!ret) {
         LOG(ERROR) << "send variable error";
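The TODO notes that this send loop is fire-and-forget. One conventional way to make such a loop block until every server has responded, sketched in Python (an idea sketch only, not the planned Paddle implementation; `client.send_variable` is a hypothetical stand-in):

from concurrent.futures import ThreadPoolExecutor


def send_all_blocking(client, scope, ins):
    # Issue every send concurrently, then block until each one completes;
    # leaving the with-block joins the pool, so all RPCs have finished here.
    with ThreadPoolExecutor(max_workers=max(len(ins), 1)) as pool:
        futures = {name: pool.submit(client.send_variable, scope, name)
                   for name in ins}
    for name, future in futures.items():
        if not future.result():
            print("send variable error:", name)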
paddle/pybind/protobuf.cc

@@ -250,6 +250,12 @@ void BindOpDesc(py::module &m) {
       .def("set_attr", &OpDescBind::SetAttr)
       .def("attr", &OpDescBind::GetAttr)
       .def("set_block_attr", &OpDescBind::SetBlockAttr)
+      .def("set_serialized_attr",
+           [](OpDescBind &self, const std::string &name,
+              const py::bytes &seriralized) {
+             std::string ser(seriralized);
+             self.SetAttr(name, ser);
+           })
      .def("block_attr", &OpDescBind::GetBlockAttr)
       .def("check_attrs", &OpDescBind::CheckAttrs)
       .def("infer_shape", &OpDescBind::InferShape)
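The new binding accepts py::bytes rather than str so that an arbitrary binary protobuf string survives the trip into the attribute map unmangled; internally it is stored through the ordinary SetAttr string path. The intended Python-side call, matching how framework.py below uses it (a sketch; `op_desc` is a bound OpDescBind and `program_desc` a core.ProgramDesc):

# Serialize a ProgramDesc to raw protobuf bytes, then stash those bytes as
# a string attribute on the op.
serialized = program_desc.serialize_to_string()
op_desc.set_serialized_attr("OptimizeProgram", serialized)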
python/paddle/v2/fluid/distribute_planner.py

@@ -29,19 +29,19 @@ def hash_name_to_server(params_grads, pserver_endpoints):
     return param_grad_map


-def round_robin(parameters, pserver_endpoints):
-    assert (len(parameters) > len(pserver_endpoints))
+def round_robin(params_grads, pserver_endpoints):
+    assert (len(params_grads) > len(pserver_endpoints))

     param_grad_map = dict()
     pserver_idx = 0
-    for param in parameters:
+    for param, grad in params_grads:
         if param.trainable is True:
             server_for_param = pserver_endpoints[pserver_idx]
             if not param_grad_map.has_key(server_for_param):
                 param_grad_map[server_for_param] = {"params": [], "grads": []}

             param_grad_map[server_for_param]["params"].append(param)
-            param_grad_map[server_for_param]["grads"].append(param)
+            param_grad_map[server_for_param]["grads"].append(grad)

             pserver_idx += 1
             if pserver_idx >= len(pserver_endpoints):
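The fix makes round_robin consume the same (param, grad) pairs as hash_name_to_server and stops appending the parameter itself into the "grads" bucket. A worked, self-contained sketch of the corrected logic (with named-tuple stand-ins for fluid Parameters, `not in` instead of the Python-2-only has_key, and assuming the truncated tail resets pserver_idx to 0):

from collections import namedtuple

FakeVar = namedtuple("FakeVar", ["name", "trainable"])


def round_robin(params_grads, pserver_endpoints):
    assert len(params_grads) > len(pserver_endpoints)
    param_grad_map = dict()
    pserver_idx = 0
    for param, grad in params_grads:
        if param.trainable is True:
            server_for_param = pserver_endpoints[pserver_idx]
            if server_for_param not in param_grad_map:
                param_grad_map[server_for_param] = {"params": [], "grads": []}
            param_grad_map[server_for_param]["params"].append(param)
            param_grad_map[server_for_param]["grads"].append(grad)
            pserver_idx += 1
            if pserver_idx >= len(pserver_endpoints):
                pserver_idx = 0
    return param_grad_map


params_grads = [
    (FakeVar("fc.w", True), FakeVar("fc.w@GRAD", True)),
    (FakeVar("fc.b", True), FakeVar("fc.b@GRAD", True)),
    (FakeVar("conv.w", True), FakeVar("conv.w@GRAD", True)),
]
mapping = round_robin(params_grads, ["127.0.0.1:6174", "127.0.0.1:6175"])
for ep, pg in mapping.items():
    print(ep, [p.name for p in pg["params"]])
# 127.0.0.1:6174 ['fc.w', 'conv.w']
# 127.0.0.1:6175 ['fc.b']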
python/paddle/v2/fluid/executor.py

@@ -70,6 +70,31 @@ class Executor(object):
             return self._optimize_distributed(optimize_ops, program,
                                               params_grads, **kwargs)

+    def _clone_param(self, block, v):
+        assert isinstance(v, Parameter)
+        new_p = Parameter(
+            block=block,
+            shape=v.shape,
+            dtype=v.dtype,
+            type=v.type,
+            lod_level=v.lod_level,
+            stop_gradient=v.stop_gradient,
+            trainable=v.trainable,
+            optimize_attr=v.optimize_attr,
+            regularizer=v.regularizer,
+            name=v.name)
+        block.vars[new_p.name] = new_p
+
+    def _clone_var(self, block, var):
+        assert isinstance(var, Variable)
+        return block.create_var(
+            name=var.name,
+            shape=var.shape,
+            dtype=var.dtype,
+            type=var.type,
+            lod_level=var.lod_level,
+            persistable=True)
+
     def _optimize_distributed(self, optimize_ops, program, params_and_grads,
                               **kwargs):
         # remove optimize ops and add a send op to main_program

@@ -84,8 +109,7 @@ class Executor(object):
         assert (callable(split_method))
         pserver_endpoints = kwargs["pservers"].split(",")
-        params = program.global_block().all_parameters()
-        self.param_grad_map = split_method(params, pserver_endpoints)
+        self.param_grad_map = split_method(params_and_grads, pserver_endpoints)

         for ep in pserver_endpoints:
             # FIXME(typhoonzero): send to different servers can run in parrallel.

@@ -95,27 +119,26 @@ class Executor(object):
                 },  # inputs is a list of tensors to be send
                 outputs={},
                 attrs={"endpoint": ep})
-        # -------------- generate optimize sub program --------------
-        self.optimize_sub_program = Program()
-        for opt_op in optimize_ops:
-            self.optimize_sub_program.global_block().ops.append(opt_op)

-    def get_pserver_program(self, endpoint):
+    def get_pserver_program(self, endpoint, optimize_ops):
         pserver_program = Program()
         for v in self.param_grad_map[endpoint]["params"]:
-            assert isinstance(v, Parameter)
-            new_p = Parameter(
-                block=pserver_program.global_block(),
-                shape=v.shape,
-                dtype=v.dtype,
-                type=v.type,
-                lod_level=v.lod_level,
-                stop_gradient=v.stop_gradient,
-                trainable=v.trainable,
-                optimize_attr=v.optimize_attr,
-                regularizer=v.regularizer,
-                name=v.name)
-            pserver_program.global_block().vars[new_p.name] = new_p
+            self._clone_param(pserver_program.global_block(), v)
+
+        optimize_sub_program = Program()
+        for opt_op in optimize_ops:
+            for varname, var in opt_op.inputs.iteritems():
+                optimize_sub_program.global_block().create_var(
+                    name=var.name,
+                    persistable=var.persistable,
+                    dtype=var.dtype,
+                    shape=var.shape)
+            optimize_sub_program.global_block().append_op(
+                type=opt_op.type,
+                inputs=opt_op.inputs,
+                outputs=opt_op.outputs,
+                attrs=opt_op.attrs)
+        print("optimize program: ", optimize_sub_program)

         pserver_program.global_block().append_op(
             type="recv",

@@ -123,11 +146,14 @@ class Executor(object):
                 self.param_grad_map[endpoint]["grads"]},  # grads to recv
             outputs={},
             attrs={
-                "OptimizeProgram": self.optimize_sub_program.to_string(True),
+                "OptimizeProgram": optimize_sub_program.desc,
                 "endpoint": endpoint,
-                "ParamList": self.param_grad_map[endpoint]["params"],
-                "GradList": self.param_grad_map[endpoint]["grads"]
+                "ParamList":
+                [p.name for p in self.param_grad_map[endpoint]["params"]],
+                "GradList":
+                [p.name for p in self.param_grad_map[endpoint]["grads"]]
             })
+        pserver_program.sync_with_cpp()
         return pserver_program

     def aslodtensor(self, data):
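Two things change for callers: get_pserver_program now takes the optimize_ops returned by optimize(), and the recv op's ParamList/GradList attributes carry variable names, since plain strings serialize cleanly as op attributes while Parameter objects do not. A usage sketch matching the updated test program below (assuming `exe` is a fluid Executor and optimize_ops/params_grads come from the optimizer):

exe.optimize(optimize_ops, params_grads, pservers="127.0.0.1:6174", trainers=1)
pserver_prog = exe.get_pserver_program("127.0.0.1:6174", optimize_ops)
exe.run(fluid.default_startup_program())
while True:
    # one recv -> run optimize sub-program -> push updated param back
    exe.run(pserver_prog)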
python/paddle/v2/fluid/framework.py

@@ -227,6 +227,10 @@ class Operator(object):
                  attrs=None):
         self.block = block
         self.desc = desc
+        # for clone a new operator
+        self.inputs = inputs
+        self.outputs = outputs
+        self.attrs = attrs
         if len(self.desc.type()) != 0:
             return
         if type is None:

@@ -298,6 +302,10 @@ class Operator(object):
                     continue
                 if isinstance(attrs[attr_name], Block):
                     self.desc.set_block_attr(attr_name, attrs[attr_name].desc)
+                elif isinstance(attrs[attr_name], core.BlockDesc) or \
+                        isinstance(attrs[attr_name], core.ProgramDesc):
+                    self.desc.set_serialized_attr(
+                        attr_name, attrs[attr_name].serialize_to_string())
                 else:
                     self.desc.set_attr(attr_name, attrs[attr_name])
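The retained inputs/outputs/attrs are what let an existing Python Operator be re-created elsewhere, which is exactly how get_pserver_program above rebuilds each optimize op inside a fresh Program; the new elif routes BlockDesc/ProgramDesc attribute values through the serialized path bound in protobuf.cc. In sketch form (`opt_op` and `target_block` are assumed to exist):

# Re-append a cloned op into another block, using the fields saved above.
target_block.append_op(
    type=opt_op.type,
    inputs=opt_op.inputs,
    outputs=opt_op.outputs,
    attrs=opt_op.attrs)  # a ProgramDesc value here hits set_serialized_attr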
python/paddle/v2/fluid/tests/book/test_recognize_digits_conv_dist.py

@@ -43,10 +43,11 @@ exe.optimize(optimize_ops, params_grads, pservers="127.0.0.1:6174", trainers=1)
 pserver_endpoint = os.getenv("PSERVER")
 if pserver_endpoint:
-    pserver_prog = exe.get_pserver_program(pserver_endpoint)
+    pserver_prog = exe.get_pserver_program(pserver_endpoint, optimize_ops)
     exe.run(fluid.default_startup_program())
     while True:
         exe.run(pserver_prog)
+        print("Run pserver once end...")
 else:
     feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
     exe.run(fluid.default_startup_program())