diff --git a/mace/python/tools/BUILD b/mace/python/tools/BUILD
index b1bb214cb7153324924e05ddc81868c94f09b73a..675f12acb73ee99e810c9add14087ebc63408812 100644
--- a/mace/python/tools/BUILD
+++ b/mace/python/tools/BUILD
@@ -8,6 +8,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":memory_optimizer",
         "//mace/proto:mace_py",
     ],
 )
diff --git a/mace/python/tools/memory_optimizer.py b/mace/python/tools/memory_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..8841ba577c704687efb9761a5a1e65eab7f4cbda
--- /dev/null
+++ b/mace/python/tools/memory_optimizer.py
@@ -0,0 +1,89 @@
+import sys
+import operator
+from mace.proto import mace_pb2
+
+class MemoryOptimizer(object):
+  def __init__(self, net_def):
+    self.net_def = net_def
+    self.idle_mem = set()
+    self.op_mem = {}  # op_name->mem_id
+    self.mem_block = {}  # mem_id->[x, y]
+    self.total_mem_count = 0
+    self.ref_counter = {}
+
+    consumers = {}
+    for op in net_def.op:
+      if self.is_buffer_image_op(op):
+        continue
+      for ipt in op.input:
+        if ipt not in consumers:
+          consumers[ipt] = []
+        consumers[ipt].append(op)
+    # only ref op's output tensor
+    for op in net_def.op:
+      if self.is_buffer_image_op(op):
+        continue
+      tensor_name = self._op_to_tensor(op)
+      if tensor_name in consumers:
+        self.ref_counter[tensor_name] = len(consumers[tensor_name])
+      else:
+        self.ref_counter[tensor_name] = 0
+
+  def _op_to_tensor(self, op):
+    return op.name + ':0'
+
+  def is_buffer_image_op(self, op):
+    return op.type == 'BufferToImage' or op.type == 'ImageToBuffer'
+
+  def optimize(self):
+    for op in self.net_def.op:
+      if self.is_buffer_image_op(op):
+        continue
+      if len(self.idle_mem) == 0:
+        # allocate new mem
+        mem_id = self.total_mem_count
+        self.total_mem_count += 1
+      else:
+        # reuse mem
+        mem_id = self.idle_mem.pop()
+
+      op.mem_id = mem_id
+      self.op_mem[self._op_to_tensor(op)] = mem_id
+      if mem_id not in self.mem_block:
+        self.mem_block[mem_id] = [0, 0]
+      mem_size = self.mem_block[mem_id]
+      mem_size[1] = max(mem_size[1], op.output_shape[0].dims[0] * op.output_shape[0].dims[1])
+      mem_size[0] = max(mem_size[0], op.output_shape[0].dims[2] * (op.output_shape[0].dims[3]+3)/4)
+
+      # de-ref input tensor mem
+      for ipt in op.input:
+        if ipt in self.ref_counter:
+          self.ref_counter[ipt] -= 1
+          if self.ref_counter[ipt] == 0:
+            self.idle_mem.add(self.op_mem[ipt])
+          elif self.ref_counter[ipt] < 0:
+            raise Exception('ref count is less than 0')
+
+    for mem in self.mem_block:
+      arena = self.net_def.mem_arena
+      block = arena.mem_block.add()
+      block.mem_id = mem
+      block.x = self.mem_block[mem][0]
+      block.y = self.mem_block[mem][1]
+
+    print('total op: %d', len(self.net_def.op))
+    origin_mem_size = 0
+    optimized_mem_size = 0
+    for op in self.net_def.op:
+      if self.is_buffer_image_op(op):
+        continue
+      origin_mem_size += reduce(operator.mul, op.output_shape[0].dims, 1)
+    for mem in self.mem_block:
+      optimized_mem_size += reduce(operator.mul, self.mem_block[mem], 4)
+
+    print('origin mem: %d, optimized mem: %d', origin_mem_size, optimized_mem_size)
+
+
+def optimize_memory(net_def):
+  mem_optimizer = MemoryOptimizer(net_def)
+  mem_optimizer.optimize()
\ No newline at end of file
diff --git a/mace/python/tools/tf_converter_lib.py b/mace/python/tools/tf_converter_lib.py
index 90fa901ec941484f95d720f333d1644b834c8e8b..99094b886d9a89dacf881c7fbcfc2eb8c6563e8a 100644
--- a/mace/python/tools/tf_converter_lib.py
+++ b/mace/python/tools/tf_converter_lib.py
@@ -1,8 +1,7 @@
-import operator
-import sys
 from mace.proto import mace_pb2
 import tensorflow as tf
 import numpy as np
+from mace.python.tools import memory_optimizer
 
 # TODO: support NCHW formt, now only support NHWC.
 padding_mode = {
@@ -46,12 +45,6 @@ class TFConverter(object):
     self.tf_graph = {}
     self.resolved_ops = {}
 
-    self.idle_mem = set()
-    self.op_mem = {}  # op_name->mem_id
-    self.mem_block = {}  # mem_id->[x, y]
-    self.total_mem_count = 0
-    self.ref_counter = {}
-
     for op in tf_ops:
       self.resolved_ops[op.name] = 0
       for input in op.inputs:
@@ -424,83 +417,6 @@ class TFConverter(object):
       if self.resolved_ops[key] != 1:
         print 'Unresolve Op: %s' % key
 
-  @staticmethod
-  def _op_to_tensor(op):
-    return op.name + ':0'
-
-  @staticmethod
-  def is_buffer_image_op(op):
-    return op.type == 'BufferToImage' or op.type == 'ImageToBuffer'
-
-  def optimize(self):
-    consumers = {}
-    for op in self.net_def.op:
-      if self.is_buffer_image_op(op):
-        continue
-      for ipt in op.input:
-        if ipt not in consumers:
-          consumers[ipt] = []
-        consumers[ipt].append(op)
-    # only ref op's output tensor
-    for op in self.net_def.op:
-      if self.is_buffer_image_op(op):
-        continue
-      tensor_name = self._op_to_tensor(op)
-      if tensor_name in consumers:
-        self.ref_counter[tensor_name] = len(consumers[tensor_name])
-      else:
-        self.ref_counter[tensor_name] = 0
-
-    for op in self.net_def.op:
-      if self.is_buffer_image_op(op):
-        continue
-      if not op.output_shape:
-        print "Op %s don't have output shape information, No way to optimize memory." % op.name
-        return
-      if len(self.idle_mem) == 0:
-        # allocate new mem
-        mem_id = self.total_mem_count
-        self.total_mem_count += 1
-      else:
-        # reuse mem
-        mem_id = self.idle_mem.pop()
-
-      op.mem_id = mem_id
-      self.op_mem[self._op_to_tensor(op)] = mem_id
-      if mem_id not in self.mem_block:
-        self.mem_block[mem_id] = [0, 0]
-      mem_size = self.mem_block[mem_id]
-      mem_size[1] = max(mem_size[1], op.output_shape[0].dims[0] * op.output_shape[0].dims[1])
-      mem_size[0] = max(mem_size[0], op.output_shape[0].dims[2] * (op.output_shape[0].dims[3]+3)/4)
-
-      # de-ref input tensor mem
-      for ipt in op.input:
-        if ipt in self.ref_counter:
-          self.ref_counter[ipt] -= 1
-          if self.ref_counter[ipt] == 0:
-            self.idle_mem.add(self.op_mem[ipt])
-          elif self.ref_counter[ipt] < 0:
-            raise Exception('ref count is less than 0')
-
-    for mem in self.mem_block:
-      arena = self.net_def.mem_arena
-      block = arena.mem_block.add()
-      block.mem_id = mem
-      block.x = self.mem_block[mem][0]
-      block.y = self.mem_block[mem][1]
-
-    print('total op: %d', len(self.net_def.op))
-    origin_mem_size = 0
-    optimized_mem_size = 0
-    for op in self.net_def.op:
-      if self.is_buffer_image_op(op):
-        continue
-      origin_mem_size += reduce(operator.mul, op.output_shape[0].dims, 1)
-    for mem in self.mem_block:
-      optimized_mem_size += reduce(operator.mul, self.mem_block[mem], 4)
-
-    print('origin mem: %d, optimized mem: %d', origin_mem_size, optimized_mem_size)
-
 def convert_to_mace_pb(input_graph_def, input_node, output_node, data_type, device):
   net_def = mace_pb2.NetDef()
   dt = data_type_map[data_type]
@@ -512,7 +428,8 @@
       converter = TFConverter(ops, net_def, dt, device)
       converter.convert(input_node, output_node)
       print "PB Converted, start optimize memory."
-      converter.optimize()
+      mem_optimizer = memory_optimizer.MemoryOptimizer(net_def)
+      mem_optimizer.optimize()
       print "Memory optimization done."
 
   return net_def
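For reference, the sketch below is not part of the patch: it mimics the greedy reference-counting buffer reuse that MemoryOptimizer performs, but on plain Python objects instead of the mace_pb2.NetDef proto, so it can be run standalone. The FakeOp tuple, the op names, and the shapes are made up for illustration; only the allocate/reuse/release order and the sizing rule (image height = N*H, image width = W * ceil(C/4)) follow the code added above.

import collections

# Hypothetical stand-in for a NetDef op: a name, the names of the ops it
# consumes, and an NHWC output shape. Ops are assumed to be listed in
# execution order, as in a converted NetDef.
FakeOp = collections.namedtuple('FakeOp', ['name', 'inputs', 'output_shape'])


def assign_memory(ops):
  """Greedy buffer reuse: every op output gets a mem_id, and a mem_id goes
  back to the idle pool once all consumers of that output have been visited."""
  # Count how many ops consume each output (the ref_counter in the patch).
  ref_count = {op.name: 0 for op in ops}
  for op in ops:
    for ipt in op.inputs:
      if ipt in ref_count:
        ref_count[ipt] += 1

  idle = set()         # mem_ids whose contents are no longer needed
  assignment = {}      # op name -> mem_id
  blocks = {}          # mem_id -> [width, height] of the backing image
  next_id = 0
  for op in ops:
    if idle:
      mem_id = idle.pop()        # reuse a released buffer
    else:
      mem_id = next_id           # allocate a new one
      next_id += 1
    assignment[op.name] = mem_id

    # Grow the block to fit this output, using the same sizing rule as the
    # patch: height = N * H, width = W * ceil(C / 4) (4 channels per pixel).
    n, h, w, c = op.output_shape
    block = blocks.setdefault(mem_id, [0, 0])
    block[1] = max(block[1], n * h)
    block[0] = max(block[0], w * ((c + 3) // 4))

    # De-ref this op's inputs; fully consumed buffers become reusable.
    for ipt in op.inputs:
      if ipt in ref_count:
        ref_count[ipt] -= 1
        if ref_count[ipt] == 0:
          idle.add(assignment[ipt])
  return assignment, blocks


if __name__ == '__main__':
  ops = [
      FakeOp('conv1', [], (1, 224, 224, 32)),
      FakeOp('conv2', ['conv1'], (1, 112, 112, 64)),
      FakeOp('conv3', ['conv2'], (1, 56, 56, 128)),
  ]
  assignment, blocks = assign_memory(ops)
  print(assignment)  # {'conv1': 0, 'conv2': 1, 'conv3': 0} -- conv3 reuses conv1's buffer
  print(blocks)      # per-buffer [width, height] sized to the largest tenant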