Repository: 机器未来 / Paddle (forked from PaddlePaddle / Paddle)

Commit ed55f1b9 (transpiler_split_tensor)
Authored Jan 05, 2018 by typhoonzero
Parent: 59116442

1 changed file with 148 additions and 93 deletions (+148, -93)

python/paddle/v2/fluid/distribute_transpiler.py @ ed55f1b9
from __future__ import print_function
import framework
from framework import Program, default_main_program, Parameter, Variable
import optimizer
from layer_helper import LayerHelper
from distributed_spliter import *
Added: a VarBlock helper that describes one block of a split variable.

class VarBlock:
    def __init__(self, varname, offset, size):
        self.varname = varname
        # NOTE: real offset is offset * size
        self.offset = offset
        self.size = size

    def __str__(self):
        return "%s:%d:%d" % (self.varname, self.offset, self.size)

Removed: the module-level placement helpers (their role is presumably taken over by the new "from distributed_spliter import *" import).

def hash_name_to_server(params_grads, pserver_endpoints):
    """
    :param param_grads:
    :return: a map of pserver endpoint ->
        params -> [param list]
        grads -> [grad list]
    """

    def _hash_param(param_name, total):
        return hash(param_name) % total

    param_grad_map = dict()
    for param, grad in params_grads:
        if param.trainable is True and grad is not None:
            server_id = _hash_param(param.name, len(pserver_endpoints))
            server_for_param = pserver_endpoints[server_id]
            if not param_grad_map.has_key(server_for_param):
                param_grad_map[server_for_param] = {"params": [], "grads": []}
            param_grad_map[server_for_param]["params"].append(param)
            param_grad_map[server_for_param]["grads"].append(grad)
    return param_grad_map


def round_robin(params_grads, pserver_endpoints):
    assert (len(params_grads) > len(pserver_endpoints))

    param_grad_map = dict()
    pserver_idx = 0
    for param, grad in params_grads:
        if param.trainable is True:
            server_for_param = pserver_endpoints[pserver_idx]
            if not param_grad_map.has_key(server_for_param):
                param_grad_map[server_for_param] = {"params": [], "grads": []}
            param_grad_map[server_for_param]["params"].append(param)
            param_grad_map[server_for_param]["grads"].append(grad)
            pserver_idx += 1
            if pserver_idx >= len(pserver_endpoints):
                pserver_idx = 0
    return param_grad_map
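For readers skimming the diff, here is a minimal, self-contained sketch (plain Python, not the Paddle API; the parameter names and endpoints are invented) of the hash-based placement that the removed hash_name_to_server helper implements: each trainable parameter lands on the endpoint indexed by hash(name) % number_of_endpoints. The removed round_robin helper differs only in that it cycles an index instead of hashing.

def hash_placement(param_names, pserver_endpoints):
    # Map each parameter name to hash(name) % len(endpoints), grouping the
    # names per endpoint, mirroring the removed helper above.
    placement = dict((ep, []) for ep in pserver_endpoints)
    for name in param_names:
        ep = pserver_endpoints[hash(name) % len(pserver_endpoints)]
        placement[ep].append(name)
    return placement


if __name__ == "__main__":
    eps = ["192.168.0.1:6174", "192.168.0.2:6174"]
    # Python 3 randomizes str hashes per process unless PYTHONHASHSEED is set,
    # so the grouping is stable within a run but not across runs.
    print(hash_placement(["fc_0.w", "fc_0.b", "fc_1.w"], eps))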
class DistributeTranspiler:
    ...

@@ -58,7 +27,6 @@ class DistributeTranspiler:
                  split_method=round_robin):
        """
        Transpile the program to a distributed data-parallelism programs.
        The main_program will be transformed to use a remote parameter server
        to do parameter optimization. And the optimization graph will be put
        into a parameter server program.

        ...

@@ -66,45 +34,84 @@ class DistributeTranspiler:

        Use different methods to split trainable variables to different
        parameter servers.

        Example to run:

        exe = fluid.Executor(place)
        t = fluid.DistributeTranspiler()
        t.transpile(optimize_ops, params_grads, pservers="127.0.0.1:6174", trainers=1)
        pserver_endpoint = os.getenv("PSERVER")
        if pserver_endpoint:
            pserver_prog = t.get_pserver_program(pserver_endpoint, optimize_ops)
            exe.run(fluid.default_startup_program())
            exe.run(pserver_prog)
        else:
            feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
            exe.run(fluid.default_startup_program())
            for pass_id in range(PASS_NUM):
                ...

        :param optimize_ops: op list of optimization, should be the
                             return value of Optimizer.minimize
        :type optimize_ops: list
        :param program: program to optimize, default default_main_program
        :param pservers: parameter server endpoints like "m1:6174,m2:6174"
        :type pservers: string
        :return: return a list of programs
        """
        assert (callable(split_method))
        if program is None:
            program = default_main_program()
        self.program = program
        self.trainers = trainers
        self.optimize_ops = optimize_ops

Removed: the old body delegated everything to _optimize_distributed.

        self._optimize_distributed(
            optimize_ops,
            program,
            params_grads,
            pservers=pservers,
            trainers=trainers,
            split_method=split_method)

Added: the new body splits variables into blocks and wires up the trainer side itself.

        # steps to transpile:
        # 1. split variable to multiple blocks, align by product(dim[1:]) (width).
        # 2. modify trainer program add split_op to each Grad.
        # 3. append send_op to trainer.
        # 4. append concat_op to trainer to update local weights.
        # 5. create new program as parameter server.
        # 5. create parameter server program by split_method generated endpoint->VarBlock
        # 6. run compile time infershape for parameter server program
        if kwargs.has_key("split_method"):
            split_method = kwargs["split_method"]
        else:
            split_method = round_robin
        pserver_endpoints = kwargs["pservers"].split(",")

        grad2param = dict()
        for param, grad in params_and_grads:
            grad2param[grad.name()] = param.name()

        # step1
        param_list = [pg[0] for pg in params_and_grads]
        grad_list = [pg[1] for pg in params_and_grads]
        # TODO: add split selected rows support
        grad_blocks = _split_dense_variable(grad_list, len(pserver_endpoints))
        param_blocks = _split_dense_variable(param_list, len(pserver_endpoints))
        ep2gradblock = split_method(grad_blocks, pserver_endpoints)
        # self.param_grad_map
        # step2
        var2splited = self._split_trainer_vars(program, grad_blocks)
        # step3
        send_inputs = []
        send_outputs = []
        for _, splited in var2splited.iteritems():
            send_inputs.extend(splited)
        send_outputs = self._create_vars_from_blocklist(program, param_blocks)
        send_op = program.global_block().append_op(
            type="send",
            inputs={"X": send_inputs},
            outputs={"Out": send_outputs},
            attrs={"endpoints": pserver_endpoints,
                   "epmap": epmap})
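To make steps 1 to 3 above concrete, here is a small standalone sketch (not Paddle code; the gradient name, element count and endpoints are invented, and it assumes split_method receives the block strings, as the new call site suggests) of how a gradient becomes "varname:block_id:block_size" strings that are then dealt out to parameter-server endpoints round-robin. Steps 2 and 3 in the method above then materialize those blocks as program variables and hand them to a single send op.

def split_to_block_strs(name, numel, block_size):
    # Step 1: encode each block of a variable as "varname:block_id:block_size".
    num_blocks = numel // block_size
    return ["%s:%d:%d" % (name, block_id, block_size)
            for block_id in range(num_blocks)]


def assign_round_robin(block_strs, endpoints):
    # Placement: deal the block strings out to endpoints in turn.
    ep2blocks = dict((ep, []) for ep in endpoints)
    for i, block in enumerate(block_strs):
        ep2blocks[endpoints[i % len(endpoints)]].append(block)
    return ep2blocks


if __name__ == "__main__":
    blocks = split_to_block_strs("fc_0.w@GRAD", 8192, 2048)
    print(assign_round_robin(blocks, ["ps0:6174", "ps1:6174"]))
    # -> blocks 0 and 2 on ps0:6174, blocks 1 and 3 on ps1:6174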
    def _create_vars_from_blocklist(self, program, block_list):
        block_map = dict()
        ret_vars = []
        for block_str in block_list:
            varname, offset, size = block_str.split(":")
            if not block_map.has_key(varname):
                block_map[varname] = []
            block_map[varname].append((long(offset), long(size)))
        for varname, splited in block_map.iteritems():
            orig_var = program.global_block().vars[varname]
            for block in splited:
                size = block[1]
                var = program.global_block().create_var(
                    name="%s.block%d" % (varname, i),
                    psersistable=False,
                    dtype=orig_var.dtype,
                    shape=[1, size])  # flattend splited var
                ret_vars.append(var)
        return ret_vars
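As an illustration of the bookkeeping above (a hypothetical sketch, not part of the diff; the variable names and sizes are made up), the block strings are grouped per variable name and each block becomes a flat program variable named "<varname>.block<i>":

def group_blocks(block_list):
    # Group "varname:offset:size" strings per variable, keeping (offset, size).
    block_map = {}
    for block_str in block_list:
        varname, offset, size = block_str.split(":")
        block_map.setdefault(varname, []).append((int(offset), int(size)))
    return block_map


if __name__ == "__main__":
    grouped = group_blocks(["fc_0.w:0:1100", "fc_0.w:1:1100", "fc_0.b:0:100"])
    for varname, blocks in grouped.items():
        for i, (offset, size) in enumerate(blocks):
            # The transpiler would create a [1, size] variable with this name.
            print("%s.block%d" % (varname, i), offset, size)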
    def _clone_param(self, block, v):
        assert isinstance(v, Parameter)
        ...

@@ -131,32 +138,80 @@ class DistributeTranspiler:

            lod_level=var.lod_level,
            persistable=var.persistable)
Removed: the old _optimize_distributed method.

    def _optimize_distributed(self, optimize_ops, program, params_and_grads,
                              **kwargs):
        if kwargs.has_key("split_method"):
            split_method = kwargs["split_method"]
        else:
            split_method = round_robin

        assert (callable(split_method))
        pserver_endpoints = kwargs["pservers"].split(",")
        self.param_grad_map = split_method(params_and_grads, pserver_endpoints)

        send_op_ordered_inputs = []
        send_op_ordered_outputs = []
        epmap = []
        for ep, v in self.param_grad_map.iteritems():
            send_op_ordered_inputs.extend(v["grads"])
            send_op_ordered_outputs.extend(v["params"])
            for i in v["grads"]:
                epmap.append(ep)
        send_op = program.global_block().append_op(
            type="send",
            inputs={"X": send_op_ordered_inputs
                    },  # inputs is a list of tensors to be send
            outputs={"Out": send_op_ordered_outputs},
            attrs={"endpoints": pserver_endpoints,
                   "epmap": epmap})

Added: a _split_dense_variable method that cuts each dense tensor into width-aligned blocks.

    def _split_dense_variable(self,
                              var_list,
                              pserver_count,
                              min_block_size=1024,
                              max_block_size=1048576):
        """
        We may need to split dense tensor to one or several blocks and put
        them equally onto parameter server. One block is a sub-tensor
        aligned by dim[0] of the tensor.

        We need to have a minimal block size so that the calculations in
        the parameter server side can gain better performance. By default
        the minimum block size is 1024. The max block size is used to prevent
        too large blocks that may cause send errors.
        """
        block_sizes = []
        blocks = []
        for grad in var_list:
            dim1 = reduce(lambda x, y: x * y, grad.shape[1:])
            grad_numel = reduce(lambda x, y: x * y, grad.shape)
            if grad_numel < min_block_size:
                block_sizes.append(grad_numel)
            block_size = grad_numel / min_block_size
            if block_size < min_block_size:
                block_size = min_block_size
            # align by dim1(width)
            remains = block_size % dim1
            if remains != 0:
                block_size += dim1 - remains
            block_sizes.append(block_size)
            num_blocks = grad_numel / block_size
            print("grad numel :%d, blocksize: %d" % grad_numel, block_size)
            for block_id in xrange(num_blocks):
                block = VarBlock(grad.name(), block_id, block_size)
                blocks.append(str(block))
        return blocks
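A quick worked example of the block-size arithmetic above, for a hypothetical gradient of shape [1000, 100] (the shape and the // spelling of Python 2 integer division are my own; everything else follows the code as written):

min_block_size = 1024
dim1 = 100                                  # product of shape[1:] for [1000, 100]
grad_numel = 100000                         # product of the full shape

block_size = grad_numel // min_block_size   # 100000 // 1024 = 97
if block_size < min_block_size:
    block_size = min_block_size             # bumped up to 1024
remains = block_size % dim1                 # 1024 % 100 = 24
if remains != 0:
    block_size += dim1 - remains            # 1024 + 76 = 1100, a multiple of dim1
num_blocks = grad_numel // block_size       # 100000 // 1100 = 90

# 90 blocks of 1100 elements cover 99000 of the 100000 elements; the
# remainder is not assigned to any block in this revision.
print(num_blocks, block_size)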
    def _split_trainer_vars(self, program, gradblocks, params_and_grads):
        var2blocks = dict()
        splited = dict()
        for block_str in gradblocks:
            varname, offset, size = block_str.split(":")
            if not var2blocks.has_key(varname):
                var2blocks[varname] = []
            var2blocks[varname].append((long(offset), long(size)))
        for varname, blocks in var2blocks.iteritems():
            orig_var = program.global_block().vars[varname]
            split_outs = []
            for i in xrange(len(blocks)):
                size = blocks[i][1]
                var = program.global_block().create_var(
                    name="%s.block%d" % (varname, i),
                    psersistable=False,
                    dtype=orig_var.dtype,
                    shape=[1, size])  # flattend splited var
                split_outs.append(var)
            splited[varname] = split_outs
            program.global_block().append_op(
                type="split",
                inputs={"X": orig_var},
                outputs={"Out": split_outs},
                attrs={"num": len(blocks)}  # assume split evenly
            )
        return splited
    def _concat_trainer_vars(self, program, splited):
        for varname, to_merge_list in splited.iteritems():
            orig_var = program.global_block().vars[varname]
            program.global_block().append_op(
                type="concat",
                inputs={"X": to_merge_list},
                outputs={"Out": orig_var},
                attrs={})
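To illustrate the round trip these two helpers assume (an illustrative sketch only; numpy stands in for the split and concat operators, and the 4x3 shape is invented), splitting a tensor into equal blocks along dim 0 and concatenating them back reproduces the original:

import numpy as np


def split_even(tensor, num):
    # Stand-in for the "split" op: equal blocks along dim 0 (assumes it divides evenly).
    return np.split(tensor, num, axis=0)


def concat_blocks(blocks):
    # Stand-in for the "concat" op: stitch the blocks back together.
    return np.concatenate(blocks, axis=0)


if __name__ == "__main__":
    w = np.arange(12.0).reshape(4, 3)       # made-up 4x3 parameter
    blocks = split_even(w, 2)               # two 2x3 blocks, like w.block0 / w.block1
    restored = concat_blocks(blocks)
    assert np.array_equal(w, restored)      # concat undoes split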
    def get_trainer_program(self):
        # remove optimize ops and add a send op to main_program
        ...