未验证 提交 2f382640 编写于 作者: X Xin Pan 提交者: GitHub

Merge pull request #9905 from panyx0718/mem-opt

Polish memory optimization transpiler
...@@ -29,17 +29,20 @@ dtype_to_size = { ...@@ -29,17 +29,20 @@ dtype_to_size = {
core.VarDesc.VarType.BOOL: 1 core.VarDesc.VarType.BOOL: 1
} }
sub_block_ops = [ SUB_BLOCK_OPS = [
"while", "while_grad", "parallel_do", "parallel_do_grad", "while", "while_grad", "parallel_do", "parallel_do_grad",
"conditional_block", "conditional_block_grad" "conditional_block", "conditional_block_grad"
] ]
SUB_BLOCK_PAIR = [("while", "while_grad"), ("parallel_do", "parallel_do_grad"),
("conditional_block", "conditional_block_grad")]
PRINT_LOG = False PRINT_LOG = False
class ControlFlowGraph(object): class ControlFlowGraph(object):
def __init__(self, Program, ops, forward_num, skip_opt): def __init__(self, program, ops, forward_num, skip_opt):
self._program = Program self._program = program
self._ops = ops self._ops = ops
self._forward_num = forward_num self._forward_num = forward_num
self._successors = defaultdict(set) self._successors = defaultdict(set)
...@@ -51,6 +54,7 @@ class ControlFlowGraph(object): ...@@ -51,6 +54,7 @@ class ControlFlowGraph(object):
self._skip_opt = skip_opt self._skip_opt = skip_opt
def _add_connections(self, connections): def _add_connections(self, connections):
"""Populates _successors and _presuccessors for two neighbor nodes."""
for node1, node2 in connections: for node1, node2 in connections:
self._add(node1, node2) self._add(node1, node2)
...@@ -58,7 +62,11 @@ class ControlFlowGraph(object): ...@@ -58,7 +62,11 @@ class ControlFlowGraph(object):
self._successors[node1].add(node2) self._successors[node1].add(node2)
self._presuccessors[node2].add(node1) self._presuccessors[node2].add(node1)
# TODO(panyx0718): We need to have a unified way of building intermediate
# representation.
def _build_graph(self): def _build_graph(self):
"""Build a graph based on op sequence.
"""
self.op_size = len(self._ops) self.op_size = len(self._ops)
op_node_connections = [(i, i + 1) for i in range(self.op_size - 1)] op_node_connections = [(i, i + 1) for i in range(self.op_size - 1)]
self._add_connections(op_node_connections) self._add_connections(op_node_connections)
...@@ -82,15 +90,14 @@ class ControlFlowGraph(object): ...@@ -82,15 +90,14 @@ class ControlFlowGraph(object):
self._live_out[i].add(new_name) self._live_out[i].add(new_name)
def _reach_fixed_point(self, live_in, live_out): def _reach_fixed_point(self, live_in, live_out):
"""Check if the liveness set has stabilized."""
if len(live_in) != len(self._live_in): if len(live_in) != len(self._live_in):
return False return False
if len(live_out) != len(self._live_out): if len(live_out) != len(self._live_out):
return False return False
for i in range(self.op_size): for i in range(self.op_size):
if live_in[i] != self._live_in[i]: if (live_in[i] != self._live_in[i] or
return False live_out[i] != self._live_out[i]):
for i in range(self.op_size):
if live_out[i] != self._live_out[i]:
return False return False
return True return True
...@@ -98,6 +105,8 @@ class ControlFlowGraph(object): ...@@ -98,6 +105,8 @@ class ControlFlowGraph(object):
self._build_graph() self._build_graph()
live_in = defaultdict(set) live_in = defaultdict(set)
live_out = defaultdict(set) live_out = defaultdict(set)
# Repeatedly apply liveness updates until the algorithm stabilizes
# on a complete set of live input vars and live output vars.
while True: while True:
for i in range(self.op_size, 0, -1): for i in range(self.op_size, 0, -1):
live_in[i] = set(self._live_in[i]) live_in[i] = set(self._live_in[i])
...@@ -141,6 +150,8 @@ class ControlFlowGraph(object): ...@@ -141,6 +150,8 @@ class ControlFlowGraph(object):
return False return False
return True return True
# TODO(panyx0718): This needs to be less hacky. It seems memory optimization
# doesn't consider vars copied between cpu and gpu.
def _update_skip_opt_set(self): def _update_skip_opt_set(self):
for i in range(self.op_size): for i in range(self.op_size):
op = self._ops[i] op = self._ops[i]
...@@ -154,7 +165,7 @@ class ControlFlowGraph(object): ...@@ -154,7 +165,7 @@ class ControlFlowGraph(object):
bwd_id = 0 bwd_id = 0
for i in range(self.op_size): for i in range(self.op_size):
op = self._ops[i] op = self._ops[i]
if op.type() in sub_block_ops: if op.type() in SUB_BLOCK_OPS:
continue continue
block_desc = op.block() block_desc = op.block()
is_forward = i < self._forward_num is_forward = i < self._forward_num
...@@ -177,13 +188,15 @@ class ControlFlowGraph(object): ...@@ -177,13 +188,15 @@ class ControlFlowGraph(object):
def compare_shape(x_shape, cache_shape, opt_level): def compare_shape(x_shape, cache_shape, opt_level):
if opt_level == 0: if opt_level == 0:
return x_shape == cache_shape return x_shape == cache_shape
if opt_level == 1: elif opt_level == 1:
if (x_shape[0] == -1) ^ (cache_shape[0] == -1): if (x_shape[0] == -1) ^ (cache_shape[0] == -1):
return False return False
x_size = abs(reduce(lambda x, y: x * y, x_shape)) x_size = abs(reduce(lambda x, y: x * y, x_shape))
cache_size = abs(reduce(lambda x, y: x * y, cache_shape)) cache_size = abs(reduce(lambda x, y: x * y, cache_shape))
if x_size <= cache_size: if x_size <= cache_size:
return True return True
else:
raise ValueError("only support opt_level 0 or 1.")
return False return False
self._dataflow_analyze() self._dataflow_analyze()
...@@ -191,10 +204,9 @@ class ControlFlowGraph(object): ...@@ -191,10 +204,9 @@ class ControlFlowGraph(object):
self.pool = [] self.pool = []
for i in range(self.op_size): for i in range(self.op_size):
op = self._ops[i] op = self._ops[i]
if op.type() in sub_block_ops: if op.type() in SUB_BLOCK_OPS:
continue continue
block_desc = op.block() block_desc = op.block()
self.current_block_desc = block_desc
is_forward = i < self._forward_num is_forward = i < self._forward_num
if self.pool: if self.pool:
defs_can_optimize = filter( defs_can_optimize = filter(
...@@ -211,37 +223,40 @@ class ControlFlowGraph(object): ...@@ -211,37 +223,40 @@ class ControlFlowGraph(object):
for index, cache_pair in enumerate(self.pool): for index, cache_pair in enumerate(self.pool):
cache_var = cache_pair[0] cache_var = cache_pair[0]
cache_shape = cache_pair[1] cache_shape = cache_pair[1]
if compare_shape(x_shape, cache_shape, level): if not compare_shape(x_shape, cache_shape, level):
if self._has_var(block_desc, cache_var, is_forward): continue
x_dtype = self._find_var(block_desc, x,
is_forward).dtype() if not self._has_var(block_desc, cache_var, is_forward):
cache_dtype = self._find_var( continue
block_desc, cache_var, is_forward).dtype()
# TODO(qijun): actually, we should compare dtype_to_size[x_dtype] x_dtype = self._find_var(block_desc, x,
# and dtype_to_size[cache_dtype] is_forward).dtype()
if x_dtype == cache_dtype: cache_dtype = self._find_var(block_desc, cache_var,
if PRINT_LOG: is_forward).dtype()
print( # TODO(qijun): actually, we should compare
("Hit Cache !!!! cache pool index " # dtype_to_size[x_dtype] and dtype_to_size[cache_dtype]
"is %d, var name is %s, " if x_dtype != cache_dtype:
"cached var name is %s, " continue
"var shape is %s ") %
(index, x, cache_var, if PRINT_LOG:
str(cache_shape))) print(("Hit Cache !!!! cache pool index "
self.pool.pop(index) "is %d, var name is %s, "
if x == cache_var: "cached var name is %s, "
break "var shape is %s ") % (index, x, cache_var,
_rename_arg_( str(cache_shape)))
self._ops, x, cache_var, begin_idx=i) self.pool.pop(index)
self._program.block(block_desc.id).var( if x == cache_var:
str(x)).desc = self._find_var( break
block_desc, cache_var, is_forward) # Rename the var to the cache var already with
self._update_graph( # memory allocated in order to reuse the memory.
x, cache_var, begin_idx=i) _rename_arg_(self._ops, x, cache_var, begin_idx=i)
break self._program.block(block_desc.id).var(str(
x)).desc = self._find_var(block_desc, cache_var,
in_diff, out_diff = self._get_diff(self._live_in[i], is_forward)
self._live_out[i]) self._update_graph(x, cache_var, begin_idx=i)
break
in_diff, _ = self._get_diff(self._live_in[i], self._live_out[i])
can_optimize = filter( can_optimize = filter(
lambda x: self._check_var_validity(block_desc, x, is_forward), lambda x: self._check_var_validity(block_desc, x, is_forward),
in_diff) in_diff)
...@@ -252,6 +267,19 @@ class ControlFlowGraph(object): ...@@ -252,6 +267,19 @@ class ControlFlowGraph(object):
def _process_sub_block_pair(pdesc, sub_block_pair):
"""Creates a list of tuples, each of which tracks info of a subblock.
Note: this function doesn't handle nested subblocks yet.
TODO(panyx0718): assert if nested subblocks are encountered.
:param pdesc: ProgramDesc.
:param sub_block_pair: A list of op pairs. Each op pair is the forward
op and backward op. The ops in the list are special in that they contain
a subblock of ops.
:return: A list of tuples, each tuple is (all ops in a subblock pair
including forward and backward, number of forward ops,
all output args names of the ops in the subblock pairs).
"""
ops_list = [] ops_list = []
block_desc = pdesc.block(0) block_desc = pdesc.block(0)
op_size = block_desc.op_size() op_size = block_desc.op_size()
...@@ -308,6 +336,11 @@ def _process_sub_block_pair(pdesc, sub_block_pair): ...@@ -308,6 +336,11 @@ def _process_sub_block_pair(pdesc, sub_block_pair):
def _get_cfgs(input_program): def _get_cfgs(input_program):
"""Process each block and create ControlFlowGraph for each of them.
:param input_program: Program object.
:return: A list of ControlFlowGraph, each corresponds to a block.
"""
ops_list = [] ops_list = []
pdesc = input_program.get_desc() pdesc = input_program.get_desc()
block_desc = pdesc.block(0) block_desc = pdesc.block(0)
...@@ -316,11 +349,8 @@ def _get_cfgs(input_program): ...@@ -316,11 +349,8 @@ def _get_cfgs(input_program):
ops_list.append( ops_list.append(
([block_desc.op(i) for i in range(op_size)], op_size, set())) ([block_desc.op(i) for i in range(op_size)], op_size, set()))
sub_block_pair = [("while", "while_grad"), ("parallel_do", # Only process one level of nested subblock.
"parallel_do_grad"), ops_list.extend(_process_sub_block_pair(pdesc, SUB_BLOCK_PAIR))
("conditional_block", "conditional_block_grad")]
ops_list.extend(_process_sub_block_pair(pdesc, sub_block_pair))
cfgs = [ cfgs = [
ControlFlowGraph(input_program, ops, forward_num, skip_opt) ControlFlowGraph(input_program, ops, forward_num, skip_opt)
...@@ -330,6 +360,17 @@ def _get_cfgs(input_program): ...@@ -330,6 +360,17 @@ def _get_cfgs(input_program):
def memory_optimize(input_program, print_log=False, level=0): def memory_optimize(input_program, print_log=False, level=0):
"""Optimize memory by reusing var memory.
Note: it does not support subblocks nested in subblocks.
:param input_program: Input Program
:param print_log: whether to print debug log.
:param level: If level=0, reuse if the shape is completely equal; if level=1, reuse if the var's total size is no larger than the cached var's size.
:return:
"""
if level != 0 and level != 1:
raise ValueError("only support opt_level 0 or 1.")
global PRINT_LOG global PRINT_LOG
PRINT_LOG = print_log PRINT_LOG = print_log
cfgs = _get_cfgs(input_program) cfgs = _get_cfgs(input_program)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册