Commit 71655334
Authored on Dec 08, 2017 by typhoonzero

update

Parent: dd46d95f
Showing 5 changed files with 80 additions and 200 deletions (+80 −200)
paddle/operators/recv_op.cc                                             +7   −4
paddle/operators/send_recv_op_test.cc                                   +1   −1
python/paddle/v2/fluid/distribute_planner.py                            +22  −148
python/paddle/v2/fluid/executor.py                                      +23  −29
python/paddle/v2/fluid/tests/book/test_recognize_digits_conv_dist.py    +27  −18
paddle/operators/recv_op.cc
@@ -72,8 +72,10 @@ class RecvOp : public framework::OperatorBase {
    // FIXME(typhoonzero): do not copy
    framework::CopyFrom(t, dev_ctx.GetPlace(), dev_ctx, tensor);

    auto *block = Attr<framework::BlockDescBind *>("OptimizeBlock");
    auto *program = block->Program();
    std::string program_str = Attr<std::string>("OptimizeProgram");
    framework::Program program_desc;
    program_desc.ParseFromString(program_str);
    framework::ProgramDescBind program(program_desc);
    framework::Executor executor(dev_ctx);
    // Run sub graph to get optimized tensor
    executor.Run(*program, &recv_scope, block->ID(),
@@ -108,8 +110,9 @@ This operator will recv tensor from send_op
"IP address to listen on."
)
.
SetDefault
(
"127.0.0.1:6164"
)
.
AddCustomChecker
([](
const
std
::
string
&
ip
)
{
return
!
ip
.
empty
();
});
AddAttr
<
framework
::
BlockDescBind
*>
(
"OptimizeBlock"
,
"type BlockDescBind*"
,
"optimize network run in server"
);
AddAttr
<
framework
::
BlockDescBind
*>
(
"OptimizeProgram"
,
"type string"
,
"Serialized ProgramDesc string for recv to run."
);
}
};
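
The hunk above switches the recv op from a block-pointer attribute to a serialized "OptimizeProgram" string that is parsed and executed on the server side. A minimal Python sketch of that round trip, with JSON standing in for the protobuf ProgramDesc and all names (serialize_program, run_recv_side) purely illustrative rather than Paddle APIs:

import json


def serialize_program(op_list):
    # Stands in for ProgramDesc serialization on the sending side.
    return json.dumps(op_list)


def run_recv_side(program_str):
    # Stands in for parsing the string attribute and running the program on the server.
    ops = json.loads(program_str)
    for op in ops:
        print("running optimize op:", op["type"])


# sender side: build the optimize program and serialize it into a string that
# could be attached to the recv op as its "OptimizeProgram" attribute
optimize_program = [{"type": "sgd"}, {"type": "increment"}]
program_str = serialize_program(optimize_program)

# server side: read the string attribute back and run the contained ops
run_recv_side(program_str)
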
paddle/operators/send_recv_op_test.cc
@@ -85,7 +85,7 @@ void StartServerNet() {
  paddle::framework::AttributeMap attrs;
  attrs.insert({"endpoint", std::string("127.0.0.1:6174")});
  attrs.insert({"OptimizeBlock", block});
  attrs.insert({"OptimizeProgram", program.Proto()->SerializeToString()});
  recv_op = paddle::framework::OpRegistry::CreateOp(
      "recv", {{"RX", {"RX"}}}, {{"Out", {"Out"}}}, attrs);
  paddle::platform::CPUDeviceContext ctx(place);
python/paddle/v2/fluid/distribute_planner.py
@@ -4,172 +4,46 @@ from regularizer import append_regularization_ops
import optimizer
from layer_helper import LayerHelper

__all__ = ['SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad']


def hash_name_to_server(params_grads, pserver_endpoints):
    """
    :param param_grads:
    :return: a map of pserver endpoint ->
        params -> [param list]
        grads -> [grad list]
    """


def hash_name_to_server(parameters, pserver_endpoints):
    def _hash_param(param_name, total):
        return hash(param_name) % total

    param_map = dict()
    for param in parameters:
        if param.trainable is True:
    param_grad_map = dict()
    for param, grad in params_grads:
        if param.trainable is True and grad is not None:
            server_id = _hash_param(param.name, len(pserver_endpoints))
            server_for_param = pserver_endpoints[server_id]
            if param_map.has_key(server_for_param):
                param_map[server_for_param].append(param)
            else:
                param_map[server_for_param] = [param]
            if not param_grad_map.has_key(server_for_param):
                param_grad_map[server_for_param] = {"params": [], "grads": []}
            param_grad_map[server_for_param]["params"].append(param)
            param_grad_map[server_for_param]["grads"].append(grad)

    return param_map
    return param_grad_map


def round_robin(parameters, pserver_endpoints):
    assert (len(parameters) < len(pserver_endpoints))

    param_map = dict()
    param_grad_map = dict()
    pserver_idx = 0
    for param in parameters:
        if param.trainable is True:
            server_for_param = pserver_endpoints[pserver_idx]
            if param_map.has_key(server_for_param):
                param_map[server_for_param].append(param)
            else:
                param_map[server_for_param] = [param]
            if not param_grad_map.has_key(server_for_param):
                param_grad_map[server_for_param] = {"params": [], "grads": []}
            param_grad_map[server_for_param]["params"].append(param)
            param_grad_map[server_for_param]["grads"].append(param)

            pserver_idx += 1
            if pserver_idx > len(pserver_endpoints):
                pserver_idx = 0
    return param_map


def _append_sendop_for_trainer(loss,
                               parameters_and_grads,
                               pserver_endpoints,
                               split_method=round_robin):
    assert (callable(split_method))
    param_map, grad_map = \
        split_method(parameters_and_grads, pserver_endpoints)

    for ep in pserver_endpoints:
        # FIXME(typhoonzero): send to different servers can run in parrallel.
        send_op = loss.block.append_op(
            type="send",
            inputs={"X": param_map[ep]},
            outputs={"Out": param_map[ep]},
            attrs={"endpoint": ep})

    return send_op


class DistributedPlanner(optimizer.Optimizer):
    def __init__(self, global_step=None, parallelism_type='dp'):
        """
        parallelism_type:
            dp: data parallelism
            mp: model parallelism
        """
        super(DistributedPlanner).__init__(self, global_step)
        if parallelism_type == "mp":
            raise NotImplementedError("model parallelism not implemented")
        elif parallelism_type == "dp":
            self.parameter_server_program_map = dict()
            self.worker_program = None
        else:
            raise NameError("parallelism_type %s not supported" % parallelism_type)

    def create_optimization_pass(self,
                                 parameters_and_grads,
                                 program,
                                 startup_program=None):
        # Create any accumulators
        self.helper = LayerHelper(
            self.__class__.__name__,
            main_program=program,
            startup_program=startup_program)
        self._create_accumulators(program.global_block(),
                                  [p[0] for p in parameters_and_grads])

        optimize_ops = []
        for param_and_grad in parameters_and_grads:
            if param_and_grad[0].trainable is True and param_and_grad[1] is not None:
                optimize_op = self._append_optimize_op(program.global_block(),
                                                       param_and_grad)
                optimize_ops.append(optimize_op)

        # Returned list of ops can include more ops in addition
        # to optimization ops
        return_ops = optimize_ops

        # Get custom finish ops for subclasses
        # FIXME: Need to fix this once we figure out how to handle dependencies
        finish_ops = self._finish_update(program.global_block())
        if finish_ops is not None:
            return_ops += finish_ops

        if self._global_step is not None:
            return_ops.append(self._increment_global_step(program.global_block()))
        return return_ops

    def minimize(self,
                 loss,
                 startup_program=None,
                 parameter_list=None,
                 no_grad_set=None,
                 split_method=round_robin):
        """
        For distributed case, this call append backward ops and then
        append sevaral send_ops at the end for each parameter server.
        Then call get_pserver_program(idx/endpoint) will return the program of
        coresponding pserver program to run.
        """
        params_grads = append_backward_ops(loss, parameter_list, no_grad_set)
        # Add regularization if any
        params_grads = append_regularization_ops(params_grads)
        _append_sendop_for_trainer(loss, params_grads, self.pserver_endpoints,
                                   split_method)
        self.worker_program = loss.block.program

        optimize_sub_program = framework.Program()
        optimize_ops = self.create_optimization_pass(
            params_grads, optimize_sub_program, startup_program)
        param_list = []
        for param_and_grad in params_grads:
            if param_and_grad[0].trainable is True and param_and_grad[1] is not None:
                param_list.append(param_and_grad[0])

        param_map, grad_map = \
            split_method(params_grads, self.pserver_endpoints)

        for ep in self.pserver_endpoints:
            pserver_program = framework.Program()
            self.parameter_server_program_map[ep] = pserver_program
            pserver_program.global_block().append_op(
                type="recv",
                inputs={"RX": param_map[ep]},
                outputs={},
                attrs={
                    "OptimizeBlock": optimize_sub_program.global_block(),
                    "endpoint": ep
                })
        # FIXME(typhoonzero): when to use this return value?
        return None

    def get_pserver_program(self, endpoint):
        return self.parameter_server_program_map.get(endpoint)


SGD = optimizer.SGDOptimizer
Momentum = optimizer.MomentumOptimizer
Adagrad = optimizer.AdagradOptimizer
Adam = optimizer.AdamOptimizer
Adamax = optimizer.AdamaxOptimizer
DecayedAdagrad = optimizer.DecayedAdagradOptimizer

for optcls in __all__:
    eval(optcls).__base__ = DistributedPlanner
    return param_grad_map
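
The hunk above moves the splitting helpers from a flat endpoint -> [param] map toward an endpoint -> {"params": [...], "grads": [...]} map. A self-contained sketch of that scheme, using plain strings in place of Parameter objects, omitting the trainable check, and with the hypothetical name round_robin_split (not the function in the commit):

def round_robin_split(params_grads, pserver_endpoints):
    # endpoint -> {"params": [...], "grads": [...]}, the same shape as param_grad_map above
    param_grad_map = {ep: {"params": [], "grads": []} for ep in pserver_endpoints}
    idx = 0
    for param, grad in params_grads:
        if grad is None:          # skip parameters that have no gradient
            continue
        ep = pserver_endpoints[idx]
        param_grad_map[ep]["params"].append(param)
        param_grad_map[ep]["grads"].append(grad)
        idx = (idx + 1) % len(pserver_endpoints)   # wrap around the endpoint list
    return param_grad_map


endpoints = ["127.0.0.1:6174", "127.0.0.1:6175"]
pairs = [("fc_0.w", "fc_0.w@GRAD"), ("fc_0.b", "fc_0.b@GRAD"), ("fc_1.w", "fc_1.w@GRAD")]
print(round_robin_split(pairs, endpoints))
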
python/paddle/v2/fluid/executor.py
@@ -69,7 +69,8 @@ class Executor(object):
        if kwargs.has_key("pservers"):
            return self._optimize_distributed(optimize_ops, program, **kwargs)

    def _optimize_distributed(self, optimize_ops, program, **kwargs):
    def _optimize_distributed(self, optimize_ops, program, params_and_grads,
                              **kwargs):
        # remove optimize ops and add a send op to main_program
        # FIXME(typhoonzero): delete_op only remove the first accurence,
        # need to consider about multiple same optimize op?
@@ -83,43 +84,36 @@ class Executor(object):
        assert (callable(split_method))
        pserver_endpoints = kwargs["pservers"].split(",")
        params = program.global_block().all_parameters()
        param_map = split_method(params, pserver_endpoints)
        self.param_grad_map = split_method(params, pserver_endpoints)

        for ep in pserver_endpoints:
            # FIXME(typhoonzero): send to different servers can run in parrallel.
            send_op = program.global_block().append_op(
                type="send",
                inputs={"X": param_map[ep]
                inputs={"X": self.param_grad_map[ep]["params"]
                        },  # inputs is a list of tensors to be send
                outputs={"Out": param_map[ep]},
                outputs={"Out": self.param_grad_map[ep]["params"]},
                attrs={"endpoint": ep})

        # -------------- generate pserver program --------------
        self.parameter_server_program_map = dict()

        optimize_sub_program = Program()
        optimize_ops = self.create_optimization_pass(
            params_grads, optimize_sub_program, startup_program)
        param_list = []
        for param in params:
            if param.trainable is True:
                param_list.append(param)

        param_map = split_method(params, pserver_endpoints)

        for ep in pserver_endpoints:
            pserver_program = Program()
            self.parameter_server_program_map[ep] = pserver_program
            pserver_program.global_block().append_op(
                type="recv",
                inputs={"RX": param_map[ep]},  # grads to recv
                outputs={},
                attrs={
                    "OptimizeBlock": optimize_sub_program.global_block(),
                    "endpoint": ep
                })
        # -------------- generate optimize sub program --------------
        self.optimize_sub_program = Program()
        for opt_op in optimize_ops:
            self.optimize_sub_program.global_block().ops.append(opt_op)

    def get_pserver_program(self, endpoint):
        pass
        pserver_program = Program()
        for param in self.param_grad_map[endpoint]["params"]:
            pserver_program.global_block().create_parameter(**param.__dict__)

        pserver_program.global_block().append_op(
            type="recv",
            inputs={"RX": self.param_grad_map[endpoint]["grads"]},  # grads to recv
            outputs={},
            attrs={
                "OptimizeProgram": self.optimize_sub_program.to_string(),
                "endpoint": endpoint
            })

    def get_trainer_program(self):
        return default_main_program()
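
The executor change above appends one send op per endpoint to the trainer program and keeps one recv-based program per endpoint so that get_pserver_program(endpoint) can hand it back later. A toy simulation of that wiring with plain dicts in place of Paddle programs and ops (build_programs and the op-spec dicts are illustrative only):

def build_programs(pservers, param_grad_map, optimize_program_str):
    # One send spec per endpoint for the trainer side, and one per-endpoint
    # "pserver program" (here just a list of op specs) kept in a lookup map.
    trainer_ops = []
    pserver_programs = {}
    for ep in pservers.split(","):
        trainer_ops.append({"type": "send",
                            "X": param_grad_map[ep]["params"],
                            "endpoint": ep})
        pserver_programs[ep] = [{"type": "recv",
                                 "RX": param_grad_map[ep]["grads"],
                                 "OptimizeProgram": optimize_program_str,
                                 "endpoint": ep}]
    return trainer_ops, pserver_programs


param_grad_map = {"127.0.0.1:6174": {"params": ["w"], "grads": ["w@GRAD"]}}
trainer_ops, pserver_programs = build_programs("127.0.0.1:6174", param_grad_map,
                                               "<serialized optimize program>")
print(trainer_ops)
print(pserver_programs["127.0.0.1:6174"])   # what a get_pserver_program lookup would return
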
python/paddle/v2/fluid/tests/book/test_recognize_digits_conv_dist.py
@@ -37,24 +37,33 @@ train_reader = paddle.batch(
place = fluid.CPUPlace()
exe = fluid.Executor(place)
feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
exe.run(fluid.default_startup_program())

for pass_id in range(PASS_NUM):
    accuracy.reset(exe)
    for data in train_reader():
        loss, acc = exe.run(fluid.default_main_program(),
                            feed=feeder.feed(data),
                            fetch_list=[avg_cost] + accuracy.metrics)
exe.optimize(pservers="127.0.0.1:6174", trainers=1)

pserver_endpoint = os.getenv("PSERVER")

if is_pserver:
    pserver_prog = exe.get_pserver_program(pserver_endpoint)
    exe.run(fluid.default_startup_program())
    exe.run(pserver_prog)
else:
    feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
    exe.run(fluid.default_startup_program())

    for pass_id in range(PASS_NUM):
        accuracy.reset(exe)
        for data in train_reader():
            loss, acc = exe.run(fluid.default_main_program(),
                                feed=feeder.feed(data),
                                fetch_list=[avg_cost] + accuracy.metrics)
            pass_acc = accuracy.eval(exe)
            print("pass_id=" + str(pass_id) + " acc=" + str(acc) +
                  " pass_acc=" + str(pass_acc))
            # print loss, acc
            if loss < 10.0 and pass_acc > 0.9:
                # if avg cost less than 10.0 and accuracy is larger than 0.9, we think our code is good.
                exit(0)
        pass_acc = accuracy.eval(exe)
        print("pass_id=" + str(pass_id) + " acc=" + str(acc) +
              " pass_acc=" + str(pass_acc))
        # print loss, acc
        if loss < 10.0 and pass_acc > 0.9:
            # if avg cost less than 10.0 and accuracy is larger than 0.9, we think our code is good.
            exit(0)

    pass_acc = accuracy.eval(exe)
    print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc))
    pass_acc = accuracy.eval(exe)
    print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc))

exit(1)
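
The test above branches on an is_pserver flag that is not defined in the displayed hunk. One hypothetical way to derive it from the PSERVER environment variable the test already reads (not part of the commit) would be:

import os

pserver_endpoint = os.getenv("PSERVER")      # e.g. "127.0.0.1:6174" on a pserver node
is_pserver = pserver_endpoint is not None    # assumption: act as pserver only when PSERVER is set

if is_pserver:
    print("would run the pserver program for", pserver_endpoint)
else:
    print("would run the trainer loop")
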