#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

15 16 17 18 19
from collections import defaultdict
import framework
from framework import Program, default_main_program, Parameter, Variable
import backward
from backward import _rename_arg_
20 21 22
from . import core

# Per-dtype size table; the values look like bytes per element
# (NOTE(review): confirm the unit before relying on it).
dtype_to_size = {
    # floating point
    core.VarDesc.VarType.FP16: 2,
    core.VarDesc.VarType.FP32: 4,
    core.VarDesc.VarType.FP64: 8,
    # integer
    core.VarDesc.VarType.INT16: 2,
    core.VarDesc.VarType.INT32: 4,
    core.VarDesc.VarType.INT64: 8,
    # boolean
    core.VarDesc.VarType.BOOL: 1,
}

# Op types that ControlFlowGraph.memory_optimize skips when it walks an op
# list.
sub_block_ops = ["while", "while_grad", "parallel_do", "parallel_do_grad"]

# When true, every successful reuse of a pooled variable is printed.
# Set through memory_optimize(..., print_log=...).
PRINT_LOG = False

36 37

class ControlFlowGraph(object):
    """Liveness analysis and variable reuse over a linear list of op descs.

    The ops are treated as a straight-line sequence: op ``i`` has op
    ``i + 1`` as its only successor.  For every op the graph records the
    variable names it reads (``_uses``) and writes (``_defs``), derives the
    live-in / live-out sets, and ``memory_optimize`` then renames newly
    defined variables onto variables that are no longer live.

    Args:
        Program: program object whose blocks own the variables being
            rewritten (accessed through ``Program.block(id).var(name)``).
        ops: sequence of op descs; each must provide ``input_arg_names()``
            and ``output_arg_names()`` (and ``type()`` / ``block()`` for
            ``memory_optimize``).
        forward_num: ops with an index below this value are treated as
            forward ops; their variables are looked up in the op's own
            block only, all other ops use a recursive lookup.
        skip_opt: collection of variable names that must never be reused.
    """

    def __init__(self, Program, ops, forward_num, skip_opt):
        self._program = Program
        self._ops = ops
        self._forward_num = forward_num
        # op index -> indices of the ops that follow / precede it.
        self._successors = defaultdict(set)
        self._presuccessors = defaultdict(set)
        # op index -> variable names read / written by that op.
        self._uses = defaultdict(set)
        self._defs = defaultdict(set)
        # op index -> variable names live before / after that op.
        self._live_in = defaultdict(set)
        self._live_out = defaultdict(set)
        self._skip_opt = skip_opt

    def _add_connections(self, connections):
        """Add every ``(node1, node2)`` edge in ``connections``."""
        for node1, node2 in connections:
            self._add(node1, node2)

    def _add(self, node1, node2):
        """Record a directed edge ``node1 -> node2``."""
        self._successors[node1].add(node2)
        self._presuccessors[node2].add(node1)

    def _build_graph(self):
        """Chain the ops linearly and collect each op's uses and defs."""
        self.op_size = len(self._ops)
        op_node_connections = [(i, i + 1) for i in range(self.op_size - 1)]
        self._add_connections(op_node_connections)
        for i in range(self.op_size):
            self._uses[i].update(self._ops[i].input_arg_names())
            self._defs[i].update(self._ops[i].output_arg_names())

    def _update_graph(self, old_name, new_name, begin_idx=0):
        """Rename ``old_name`` to ``new_name`` in all sets from ``begin_idx`` on.

        Each of the uses / defs / live-in / live-out sets is updated
        independently, so a name is only added to a set it was removed from.
        """
        for i in range(begin_idx, self.op_size):
            for name_sets in (self._uses, self._defs, self._live_in,
                              self._live_out):
                names = name_sets[i]
                if old_name in names:
                    names.remove(old_name)
                    names.add(new_name)

    def _reach_fixed_point(self, live_in, live_out):
        """Return True when the given snapshots equal the current live sets."""
        if len(live_in) != len(self._live_in):
            return False
        if len(live_out) != len(self._live_out):
            return False
        for i in range(self.op_size):
            if live_in[i] != self._live_in[i]:
                return False
            if live_out[i] != self._live_out[i]:
                return False
        return True

    def _dataflow_analyze(self):
        """Compute live-in / live-out sets for every op.

        The update is applied repeatedly until a full pass leaves every set
        unchanged.  Ops are visited from last to first so information flows
        from successors to predecessors within a single pass.
        """
        self._build_graph()
        live_in = defaultdict(set)
        live_out = defaultdict(set)
        while True:
            # Visit exactly the valid indices op_size - 1 .. 0.
            for i in reversed(range(self.op_size)):
                # Snapshot the previous values for the fixed-point test.
                live_in[i] = set(self._live_in[i])
                live_out[i] = set(self._live_out[i])
                for s in self._successors[i]:
                    self._live_out[i] |= self._live_in[s]
                self._live_in[i] = self._uses[i] | (
                    self._live_out[i] - self._defs[i])
            if self._reach_fixed_point(live_in, live_out):
                break

    def _get_diff(self, a, b):
        """Return ``(a - b, b - a)`` for two sets."""
        u = a & b
        return a - u, b - u

    def _has_var(self, block_desc, var_name, is_forward):
        """Check for ``var_name``: local lookup if forward, else recursive."""
        if is_forward:
            return block_desc.has_var(str(var_name))
        else:
            return block_desc.has_var_recursive(str(var_name))

    def _find_var(self, block_desc, var_name, is_forward):
        """Fetch ``var_name``: local lookup if forward, else recursive."""
        if is_forward:
            return block_desc.find_var(str(var_name))
        else:
            return block_desc.find_var_recursive(str(var_name))

    def memory_optimize(self):
        """Rename newly defined variables onto dead variables of equal shape.

        After the liveness analysis, the ops are walked in order.  Variables
        that are live before an op but not after it are put into
        ``self.pool`` together with their shape.  A variable defined by a
        later op is renamed to a pooled variable when both have the same
        shape and dtype; the rename is applied to the remaining ops, to the
        program's variable desc and to the liveness sets.
        """

        def check_var_validity(block_desc, x, is_forward):
            # Only non-persistable LOD_TENSOR variables that exist, have a
            # non-empty shape and are not explicitly excluded may be reused.
            if str(x) == "@EMPTY@":
                return False
            if not self._has_var(block_desc, x, is_forward):
                return False
            var_desc = self._find_var(block_desc, x, is_forward)
            if var_desc.persistable():
                return False
            if var_desc.type() != core.VarDesc.VarType.LOD_TENSOR:
                return False
            if x in self._skip_opt:
                return False
            if not var_desc.shape():
                return False
            return True

        # _dataflow_analyze builds the graph itself before analysing it.
        self._dataflow_analyze()
        self.pool = []
        for i in range(self.op_size):
            op = self._ops[i]
            if op.type() in sub_block_ops:
                continue
            block_desc = op.block()
            is_forward = i < self._forward_num
            if self.pool:
                # Materialise the candidates first: the loop below mutates
                # self._defs[i] through _update_graph.
                defs_can_optimize = [
                    x for x in self._defs[i]
                    if check_var_validity(block_desc, x, is_forward)
                ]
                out_pair = [
                    (x, self._find_var(block_desc, x, is_forward).shape())
                    for x in defs_can_optimize
                ]
                for x, x_shape in out_pair:
                    # If x is both in uses and defs, it can not be optimized!
                    if x in self._uses[i]:
                        continue
                    for index, (cache_var, cache_shape) in enumerate(
                            self.pool):
                        if x_shape != cache_shape:
                            continue
                        if not self._has_var(block_desc, cache_var,
                                             is_forward):
                            continue
                        x_dtype = self._find_var(block_desc, x,
                                                 is_forward).dtype()
                        cache_dtype = self._find_var(
                            block_desc, cache_var, is_forward).dtype()
                        # TODO(qijun): actually, we should compare dtype_to_size[x_dtype]
                        # and dtype_to_size[cache_dtype]
                        if x_dtype != cache_dtype:
                            continue
                        if PRINT_LOG:
                            print(
                                ("Hit Cache !!!! cache pool index "
                                 "is %d, var name is %s, "
                                 "cached var name is %s, "
                                 "var shape is %s ") %
                                (index, x, cache_var,
                                 str(cache_shape)))
                        # Safe to pop while enumerating: the loop is left
                        # right after on both paths below.
                        self.pool.pop(index)
                        if x != cache_var:
                            _rename_arg_(
                                self._ops, x, cache_var, begin_idx=i)
                            self._program.block(block_desc.id).var(
                                str(x)).desc = self._find_var(
                                    block_desc, cache_var, is_forward)
                            self._update_graph(
                                x, cache_var, begin_idx=i)
                        break

            # Variables live before this op but not after it are dead from
            # here on and become candidates for reuse.
            in_diff, _ = self._get_diff(self._live_in[i],
                                        self._live_out[i])
            for var_name in in_diff:
                if check_var_validity(block_desc, var_name, is_forward):
                    self.pool.append((var_name, self._find_var(
                        block_desc, var_name, is_forward).shape()))


207
def _process_sub_block_pair(pdesc, sub_block_pair):
208 209 210
    ops_list = []
    block_desc = pdesc.block(0)
    op_size = block_desc.op_size()
211 212 213 214 215 216 217 218 219 220 221 222 223
    for fwd_op, bwd_op in sub_block_pair:
        sub_block_ids = []
        grad_sub_block_ids = []
        sub_block_id_pair = []
        sub_op_dict = {}
        for i in range(op_size):
            op = block_desc.op(i)
            if op.type() == fwd_op:
                sub_block_ids.append(op.attr("sub_block").id)
                sub_op_dict[op.attr("sub_block").id] = op
            elif op.type() == bwd_op:
                grad_sub_block_ids.append(op.attr("sub_block").id)
                sub_op_dict[op.attr("sub_block").id] = op
224

225 226
        # Find fwd_op/bwd_op block pair
        for grad_id in grad_sub_block_ids:
Q
qijun 已提交
227 228 229 230
            fwd_id = pdesc.block(grad_id).get_forward_block_idx()
            if fwd_id in sub_block_ids:
                sub_block_id_pair.append((fwd_id, grad_id))
                sub_block_ids.remove(fwd_id)
231

232
        # Get fwd_op/bwd_op block ops
Q
qijun 已提交
233
        for fwd_id, grad_id in sub_block_id_pair:
234
            sub_block_ops = []
Q
qijun 已提交
235
            sub_block = pdesc.block(fwd_id)
236 237 238
            block_op_size = sub_block.op_size()
            for i in range(block_op_size):
                sub_block_ops.append(sub_block.op(i))
239

240 241 242 243
            grad_sub_block = pdesc.block(grad_id)
            grad_sub_block_op_size = grad_sub_block.op_size()
            for i in range(grad_sub_block_op_size):
                sub_block_ops.append(grad_sub_block.op(i))
244

245
            sub_op_output = set()
Q
qijun 已提交
246
            sub_op_output.update(sub_op_dict[fwd_id].output_arg_names())
247 248
            sub_op_output.update(sub_op_dict[grad_id].output_arg_names())
            ops_list.append((sub_block_ops, block_op_size, sub_op_output))
249

250
        # Process rest fwd_op block ops
Q
qijun 已提交
251
        for fwd_id in sub_block_ids:
252
            sub_block_ops = []
Q
qijun 已提交
253
            sub_block = pdesc.block(fwd_id)
254 255 256 257
            sub_block_op_size = sub_block.op_size()
            for i in range(sub_block_op_size):
                sub_block_ops.append(sub_block.op(i))
            sub_op_output = set()
Q
qijun 已提交
258
            sub_op_output.update(sub_op_dict[fwd_id].output_arg_names())
259 260
            ops_list.append((sub_block_ops, sub_block_op_size, sub_op_output))
    return ops_list
261

262

263 264 265 266 267 268 269 270
def _get_cfgs(input_program):
    """Build one ControlFlowGraph for the global block and one per sub-block.

    The global block (block 0) contributes all of its ops, with every op
    counted as a forward op and an empty skip set.  Sub-blocks owned by
    ``while`` / ``parallel_do`` ops (and their ``*_grad`` counterparts) are
    gathered by ``_process_sub_block_pair``.

    Returns:
        A list of ControlFlowGraph objects, global block first.
    """
    pdesc = input_program.get_desc()

    # Get global block ops
    global_block = pdesc.block(0)
    global_op_count = global_block.op_size()
    global_ops = [global_block.op(idx) for idx in range(global_op_count)]
    graph_specs = [(global_ops, global_op_count, set())]

    control_flow_pairs = [("while", "while_grad"), ("parallel_do",
                                                    "parallel_do_grad")]
    graph_specs.extend(_process_sub_block_pair(pdesc, control_flow_pairs))

    return [
        ControlFlowGraph(input_program, ops, forward_num, skip_opt)
        for ops, forward_num, skip_opt in graph_specs
    ]
282 283


Q
qiaolongfei 已提交
284 285 286
def memory_optimize(input_program, print_log=False):
    """Run the variable-reuse pass on every control flow graph of a program.

    Args:
        input_program: program to optimize; it is rewritten in place.
        print_log: when true, each reuse of a pooled variable is printed.
            The value is stored in the module-level ``PRINT_LOG`` flag.
    """
    global PRINT_LOG
    PRINT_LOG = print_log
    for graph in _get_cfgs(input_program):
        graph.memory_optimize()