Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
2f382640
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
You need to sign in or sign up before continuing.
未验证
提交
2f382640
编写于
4月 14, 2018
作者:
X
Xin Pan
提交者:
GitHub
4月 14, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #9905 from panyx0718/mem-opt
Polish memory optimization transpiler
上级
b48cf171
d4024a6e
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
88 addition
and
47 deletion
+88
-47
python/paddle/fluid/memory_optimization_transpiler.py
python/paddle/fluid/memory_optimization_transpiler.py
+88
-47
未找到文件。
python/paddle/fluid/memory_optimization_transpiler.py
浏览文件 @
2f382640
...
@@ -29,17 +29,20 @@ dtype_to_size = {
...
@@ -29,17 +29,20 @@ dtype_to_size = {
core
.
VarDesc
.
VarType
.
BOOL
:
1
core
.
VarDesc
.
VarType
.
BOOL
:
1
}
}
sub_block_ops
=
[
SUB_BLOCK_OPS
=
[
"while"
,
"while_grad"
,
"parallel_do"
,
"parallel_do_grad"
,
"while"
,
"while_grad"
,
"parallel_do"
,
"parallel_do_grad"
,
"conditional_block"
,
"conditional_block_grad"
"conditional_block"
,
"conditional_block_grad"
]
]
SUB_BLOCK_PAIR
=
[(
"while"
,
"while_grad"
),
(
"parallel_do"
,
"parallel_do_grad"
),
(
"conditional_block"
,
"conditional_block_grad"
)]
PRINT_LOG
=
False
PRINT_LOG
=
False
class
ControlFlowGraph
(
object
):
class
ControlFlowGraph
(
object
):
def
__init__
(
self
,
P
rogram
,
ops
,
forward_num
,
skip_opt
):
def
__init__
(
self
,
p
rogram
,
ops
,
forward_num
,
skip_opt
):
self
.
_program
=
P
rogram
self
.
_program
=
p
rogram
self
.
_ops
=
ops
self
.
_ops
=
ops
self
.
_forward_num
=
forward_num
self
.
_forward_num
=
forward_num
self
.
_successors
=
defaultdict
(
set
)
self
.
_successors
=
defaultdict
(
set
)
...
@@ -51,6 +54,7 @@ class ControlFlowGraph(object):
...
@@ -51,6 +54,7 @@ class ControlFlowGraph(object):
self
.
_skip_opt
=
skip_opt
self
.
_skip_opt
=
skip_opt
def
_add_connections
(
self
,
connections
):
def
_add_connections
(
self
,
connections
):
"""Populates _successors and _presuccessors for two neighbor nodes."""
for
node1
,
node2
in
connections
:
for
node1
,
node2
in
connections
:
self
.
_add
(
node1
,
node2
)
self
.
_add
(
node1
,
node2
)
...
@@ -58,7 +62,11 @@ class ControlFlowGraph(object):
...
@@ -58,7 +62,11 @@ class ControlFlowGraph(object):
self
.
_successors
[
node1
].
add
(
node2
)
self
.
_successors
[
node1
].
add
(
node2
)
self
.
_presuccessors
[
node2
].
add
(
node1
)
self
.
_presuccessors
[
node2
].
add
(
node1
)
# TODO(panyx0718): We need to have a unified way of building intermediate
# representation.
def
_build_graph
(
self
):
def
_build_graph
(
self
):
"""Build a graph based on op sequence.
"""
self
.
op_size
=
len
(
self
.
_ops
)
self
.
op_size
=
len
(
self
.
_ops
)
op_node_connections
=
[(
i
,
i
+
1
)
for
i
in
range
(
self
.
op_size
-
1
)]
op_node_connections
=
[(
i
,
i
+
1
)
for
i
in
range
(
self
.
op_size
-
1
)]
self
.
_add_connections
(
op_node_connections
)
self
.
_add_connections
(
op_node_connections
)
...
@@ -82,15 +90,14 @@ class ControlFlowGraph(object):
...
@@ -82,15 +90,14 @@ class ControlFlowGraph(object):
self
.
_live_out
[
i
].
add
(
new_name
)
self
.
_live_out
[
i
].
add
(
new_name
)
def
_reach_fixed_point
(
self
,
live_in
,
live_out
):
def
_reach_fixed_point
(
self
,
live_in
,
live_out
):
"""Check if the liveness set has stablized."""
if
len
(
live_in
)
!=
len
(
self
.
_live_in
):
if
len
(
live_in
)
!=
len
(
self
.
_live_in
):
return
False
return
False
if
len
(
live_out
)
!=
len
(
self
.
_live_out
):
if
len
(
live_out
)
!=
len
(
self
.
_live_out
):
return
False
return
False
for
i
in
range
(
self
.
op_size
):
for
i
in
range
(
self
.
op_size
):
if
live_in
[
i
]
!=
self
.
_live_in
[
i
]:
if
(
live_in
[
i
]
!=
self
.
_live_in
[
i
]
or
return
False
live_out
[
i
]
!=
self
.
_live_out
[
i
]):
for
i
in
range
(
self
.
op_size
):
if
live_out
[
i
]
!=
self
.
_live_out
[
i
]:
return
False
return
False
return
True
return
True
...
@@ -98,6 +105,8 @@ class ControlFlowGraph(object):
...
@@ -98,6 +105,8 @@ class ControlFlowGraph(object):
self
.
_build_graph
()
self
.
_build_graph
()
live_in
=
defaultdict
(
set
)
live_in
=
defaultdict
(
set
)
live_out
=
defaultdict
(
set
)
live_out
=
defaultdict
(
set
)
# Repeatedly apply liveness updates until the algorithm stabilizes
# on a complete set of live input vars and live output vars.
while
True
:
while
True
:
for
i
in
range
(
self
.
op_size
,
0
,
-
1
):
for
i
in
range
(
self
.
op_size
,
0
,
-
1
):
live_in
[
i
]
=
set
(
self
.
_live_in
[
i
])
live_in
[
i
]
=
set
(
self
.
_live_in
[
i
])
...
@@ -141,6 +150,8 @@ class ControlFlowGraph(object):
...
@@ -141,6 +150,8 @@ class ControlFlowGraph(object):
return
False
return
False
return
True
return
True
# TODO(panyx0718): This needs to be less hacky. It seems memory optimization
# doesn't consider vars copied between cpu and gpu.
def
_update_skip_opt_set
(
self
):
def
_update_skip_opt_set
(
self
):
for
i
in
range
(
self
.
op_size
):
for
i
in
range
(
self
.
op_size
):
op
=
self
.
_ops
[
i
]
op
=
self
.
_ops
[
i
]
...
@@ -154,7 +165,7 @@ class ControlFlowGraph(object):
...
@@ -154,7 +165,7 @@ class ControlFlowGraph(object):
bwd_id
=
0
bwd_id
=
0
for
i
in
range
(
self
.
op_size
):
for
i
in
range
(
self
.
op_size
):
op
=
self
.
_ops
[
i
]
op
=
self
.
_ops
[
i
]
if
op
.
type
()
in
sub_block_ops
:
if
op
.
type
()
in
SUB_BLOCK_OPS
:
continue
continue
block_desc
=
op
.
block
()
block_desc
=
op
.
block
()
is_forward
=
i
<
self
.
_forward_num
is_forward
=
i
<
self
.
_forward_num
...
@@ -177,13 +188,15 @@ class ControlFlowGraph(object):
...
@@ -177,13 +188,15 @@ class ControlFlowGraph(object):
def
compare_shape
(
x_shape
,
cache_shape
,
opt_level
):
def
compare_shape
(
x_shape
,
cache_shape
,
opt_level
):
if
opt_level
==
0
:
if
opt_level
==
0
:
return
x_shape
==
cache_shape
return
x_shape
==
cache_shape
if
opt_level
==
1
:
el
if
opt_level
==
1
:
if
(
x_shape
[
0
]
==
-
1
)
^
(
cache_shape
[
0
]
==
-
1
):
if
(
x_shape
[
0
]
==
-
1
)
^
(
cache_shape
[
0
]
==
-
1
):
return
False
return
False
x_size
=
abs
(
reduce
(
lambda
x
,
y
:
x
*
y
,
x_shape
))
x_size
=
abs
(
reduce
(
lambda
x
,
y
:
x
*
y
,
x_shape
))
cache_size
=
abs
(
reduce
(
lambda
x
,
y
:
x
*
y
,
cache_shape
))
cache_size
=
abs
(
reduce
(
lambda
x
,
y
:
x
*
y
,
cache_shape
))
if
x_size
<=
cache_size
:
if
x_size
<=
cache_size
:
return
True
return
True
else
:
raise
ValueError
(
"only support opt_level 0 or 1."
)
return
False
return
False
self
.
_dataflow_analyze
()
self
.
_dataflow_analyze
()
...
@@ -191,10 +204,9 @@ class ControlFlowGraph(object):
...
@@ -191,10 +204,9 @@ class ControlFlowGraph(object):
self
.
pool
=
[]
self
.
pool
=
[]
for
i
in
range
(
self
.
op_size
):
for
i
in
range
(
self
.
op_size
):
op
=
self
.
_ops
[
i
]
op
=
self
.
_ops
[
i
]
if
op
.
type
()
in
sub_block_ops
:
if
op
.
type
()
in
SUB_BLOCK_OPS
:
continue
continue
block_desc
=
op
.
block
()
block_desc
=
op
.
block
()
self
.
current_block_desc
=
block_desc
is_forward
=
i
<
self
.
_forward_num
is_forward
=
i
<
self
.
_forward_num
if
self
.
pool
:
if
self
.
pool
:
defs_can_optimize
=
filter
(
defs_can_optimize
=
filter
(
...
@@ -211,37 +223,40 @@ class ControlFlowGraph(object):
...
@@ -211,37 +223,40 @@ class ControlFlowGraph(object):
for
index
,
cache_pair
in
enumerate
(
self
.
pool
):
for
index
,
cache_pair
in
enumerate
(
self
.
pool
):
cache_var
=
cache_pair
[
0
]
cache_var
=
cache_pair
[
0
]
cache_shape
=
cache_pair
[
1
]
cache_shape
=
cache_pair
[
1
]
if
compare_shape
(
x_shape
,
cache_shape
,
level
):
if
not
compare_shape
(
x_shape
,
cache_shape
,
level
):
if
self
.
_has_var
(
block_desc
,
cache_var
,
is_forward
):
continue
x_dtype
=
self
.
_find_var
(
block_desc
,
x
,
is_forward
).
dtype
()
if
not
self
.
_has_var
(
block_desc
,
cache_var
,
is_forward
):
cache_dtype
=
self
.
_find_var
(
continue
block_desc
,
cache_var
,
is_forward
).
dtype
()
# TODO(qijun): actually, we should compare dtype_to_size[x_dtype]
x_dtype
=
self
.
_find_var
(
block_desc
,
x
,
# and dtype_to_size[cache_dtype]
is_forward
).
dtype
()
if
x_dtype
==
cache_dtype
:
cache_dtype
=
self
.
_find_var
(
block_desc
,
cache_var
,
if
PRINT_LOG
:
is_forward
).
dtype
()
print
(
# TODO(qijun): actually, we should compare
(
"Hit Cache !!!! cache pool index "
# dtype_to_size[x_dtype] and dtype_to_size[cache_dtype]
"is %d, var name is %s, "
if
x_dtype
!=
cache_dtype
:
"cached var name is %s, "
continue
"var shape is %s "
)
%
(
index
,
x
,
cache_var
,
if
PRINT_LOG
:
str
(
cache_shape
)))
print
((
"Hit Cache !!!! cache pool index "
self
.
pool
.
pop
(
index
)
"is %d, var name is %s, "
if
x
==
cache_var
:
"cached var name is %s, "
break
"var shape is %s "
)
%
(
index
,
x
,
cache_var
,
_rename_arg_
(
str
(
cache_shape
)))
self
.
_ops
,
x
,
cache_var
,
begin_idx
=
i
)
self
.
pool
.
pop
(
index
)
self
.
_program
.
block
(
block_desc
.
id
).
var
(
if
x
==
cache_var
:
str
(
x
)).
desc
=
self
.
_find_var
(
break
block_desc
,
cache_var
,
is_forward
)
# Rename the var to the cache var already with
self
.
_update_graph
(
# memory allocated in order to reuse the memory.
x
,
cache_var
,
begin_idx
=
i
)
_rename_arg_
(
self
.
_ops
,
x
,
cache_var
,
begin_idx
=
i
)
break
self
.
_program
.
block
(
block_desc
.
id
).
var
(
str
(
x
)).
desc
=
self
.
_find_var
(
block_desc
,
cache_var
,
in_diff
,
out_diff
=
self
.
_get_diff
(
self
.
_live_in
[
i
],
is_forward
)
self
.
_live_out
[
i
])
self
.
_update_graph
(
x
,
cache_var
,
begin_idx
=
i
)
break
in_diff
,
_
=
self
.
_get_diff
(
self
.
_live_in
[
i
],
self
.
_live_out
[
i
])
can_optimize
=
filter
(
can_optimize
=
filter
(
lambda
x
:
self
.
_check_var_validity
(
block_desc
,
x
,
is_forward
),
lambda
x
:
self
.
_check_var_validity
(
block_desc
,
x
,
is_forward
),
in_diff
)
in_diff
)
...
@@ -252,6 +267,19 @@ class ControlFlowGraph(object):
...
@@ -252,6 +267,19 @@ class ControlFlowGraph(object):
def
_process_sub_block_pair
(
pdesc
,
sub_block_pair
):
def
_process_sub_block_pair
(
pdesc
,
sub_block_pair
):
"""Creates a list of tuple each of which tracks info of a subblock.
Note: this function doesn't handle nested subblocks yet.
TODO(panyx0718): assert if nested subblocks occur.
:param pdesc: ProgramDesc.
:param sub_block_pair: A list of op pairs. Each op pair is the forward
op and backward op. The ops in the list are special that they contain
a subblock of ops.
:return: A list of tuples, each tuple is (all ops in a subblock pair
including forward and backward, number of forward ops,
all output args names of the ops in the subblock pairs).
"""
ops_list
=
[]
ops_list
=
[]
block_desc
=
pdesc
.
block
(
0
)
block_desc
=
pdesc
.
block
(
0
)
op_size
=
block_desc
.
op_size
()
op_size
=
block_desc
.
op_size
()
...
@@ -308,6 +336,11 @@ def _process_sub_block_pair(pdesc, sub_block_pair):
...
@@ -308,6 +336,11 @@ def _process_sub_block_pair(pdesc, sub_block_pair):
def
_get_cfgs
(
input_program
):
def
_get_cfgs
(
input_program
):
"""Process each block and create ControlFlowGraph for each of them.
:param input_program: Program object.
:return: A list of ControlFlowGraph, each corresponds to a block.
"""
ops_list
=
[]
ops_list
=
[]
pdesc
=
input_program
.
get_desc
()
pdesc
=
input_program
.
get_desc
()
block_desc
=
pdesc
.
block
(
0
)
block_desc
=
pdesc
.
block
(
0
)
...
@@ -316,11 +349,8 @@ def _get_cfgs(input_program):
...
@@ -316,11 +349,8 @@ def _get_cfgs(input_program):
ops_list
.
append
(
ops_list
.
append
(
([
block_desc
.
op
(
i
)
for
i
in
range
(
op_size
)],
op_size
,
set
()))
([
block_desc
.
op
(
i
)
for
i
in
range
(
op_size
)],
op_size
,
set
()))
sub_block_pair
=
[(
"while"
,
"while_grad"
),
(
"parallel_do"
,
# Only process one level of nested subblock.
"parallel_do_grad"
),
ops_list
.
extend
(
_process_sub_block_pair
(
pdesc
,
SUB_BLOCK_PAIR
))
(
"conditional_block"
,
"conditional_block_grad"
)]
ops_list
.
extend
(
_process_sub_block_pair
(
pdesc
,
sub_block_pair
))
cfgs
=
[
cfgs
=
[
ControlFlowGraph
(
input_program
,
ops
,
forward_num
,
skip_opt
)
ControlFlowGraph
(
input_program
,
ops
,
forward_num
,
skip_opt
)
...
@@ -330,6 +360,17 @@ def _get_cfgs(input_program):
...
@@ -330,6 +360,17 @@ def _get_cfgs(input_program):
def
memory_optimize
(
input_program
,
print_log
=
False
,
level
=
0
):
def
memory_optimize
(
input_program
,
print_log
=
False
,
level
=
0
):
"""Optimize memory by reusing var memory.
Note: it doesn't support subblocks nested in subblocks.
:param input_program: Input Program
:param print_log: whether to print debug log.
:param level: If level=0, reuse only if the shapes are completely equal; if level=1, reuse when the var's size is no larger than the cached var's size.
:return:
"""
if
level
!=
0
and
level
!=
1
:
raise
ValueError
(
"only support opt_level 0 or 1."
)
global
PRINT_LOG
global
PRINT_LOG
PRINT_LOG
=
print_log
PRINT_LOG
=
print_log
cfgs
=
_get_cfgs
(
input_program
)
cfgs
=
_get_cfgs
(
input_program
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录