diff --git a/mace/python/tools/memory_optimizer.py b/mace/python/tools/memory_optimizer.py deleted file mode 100644 index f64df5bada8f5acf1730a1f5e15227605cadad24..0000000000000000000000000000000000000000 --- a/mace/python/tools/memory_optimizer.py +++ /dev/null @@ -1,102 +0,0 @@ -import sys -import operator -from mace.proto import mace_pb2 - -class MemoryOptimizer(object): - def __init__(self, net_def): - self.net_def = net_def - self.idle_mem = set() - self.op_mem = {} # op_name->mem_id - self.mem_block = {} # mem_id->[x, y] - self.total_mem_count = 0 - self.ref_counter = {} - - consumers = {} - for op in net_def.op: - if self.is_buffer_image_op(op): - continue - for ipt in op.input: - if ipt not in consumers: - consumers[ipt] = [] - consumers[ipt].append(op) - # only ref op's output tensor - for op in net_def.op: - if self.is_buffer_image_op(op): - continue - tensor_name = self._op_to_tensor(op) - if tensor_name in consumers: - self.ref_counter[tensor_name] = len(consumers[tensor_name]) - else: - self.ref_counter[tensor_name] = 0 - - def _op_to_tensor(self, op): - return op.name + ':0' - - def is_buffer_image_op(self, op): - return op.type == 'BufferToImage' or op.type == 'ImageToBuffer' - - def optimize(self): - for op in self.net_def.op: - if self.is_buffer_image_op(op): - continue - if len(self.idle_mem) == 0: - # allocate new mem - mem_id = self.total_mem_count - self.total_mem_count += 1 - else: - # reuse mem - mem_id = self.idle_mem.pop() - - op.mem_id = mem_id - self.op_mem[self._op_to_tensor(op)] = mem_id - if mem_id not in self.mem_block: - self.mem_block[mem_id] = [0, 0] - mem_size = self.mem_block[mem_id] - mem_size[1] = max(mem_size[1], op.output_shape[0].dims[0] * op.output_shape[0].dims[1]) - mem_size[0] = max(mem_size[0], op.output_shape[0].dims[2] * (op.output_shape[0].dims[3]+3)/4) - - # de-ref input tensor mem - for ipt in op.input: - if ipt in self.ref_counter: - self.ref_counter[ipt] -= 1 - if self.ref_counter[ipt] == 0: - self.idle_mem.add(self.op_mem[ipt]) - elif self.ref_counter[ipt] < 0: - raise Exception('ref count is less than 0') - - for mem in self.mem_block: - arena = net_def.mem_arena - block = arena.mem_block.add() - block.mem_id = mem - block.x = self.mem_block[mem][0] - block.y = self.mem_block[mem][1] - - print('total op: %d', len(self.net_def.op)) - origin_mem_size = 0 - optimized_mem_size = 0 - for op in self.net_def.op: - if self.is_buffer_image_op(op): - continue - origin_mem_size += reduce(operator.mul, op.output_shape[0].dims, 1) - for mem in self.mem_block: - optimized_mem_size += reduce(operator.mul, self.mem_block[mem], 4) - - print('origin mem: %d, optimized mem: %d', origin_mem_size, optimized_mem_size) - -if __name__ == '__main__': - model_file = sys.argv[1] - opt_model_file = sys.argv[2] - with open(model_file, "rb") as f: - net_def = mace_pb2.NetDef() - net_def.ParseFromString(f.read()) - optimizer = MemoryOptimizer(net_def) - optimizer.optimize() - - with open(opt_model_file, "wb") as f: - f.write(net_def.SerializeToString()) - with open(opt_model_file + '_txt', "wb") as f: - net_def.ClearField('tensors') - f.write(str(net_def)) - - - diff --git a/mace/python/tools/tf_converter_lib.py b/mace/python/tools/tf_converter_lib.py index 85e39673c9e254b76679b24667d8c5abb5401bc6..90fa901ec941484f95d720f333d1644b834c8e8b 100644 --- a/mace/python/tools/tf_converter_lib.py +++ b/mace/python/tools/tf_converter_lib.py @@ -1,3 +1,5 @@ +import operator +import sys from mace.proto import mace_pb2 import tensorflow as tf import numpy as np @@ -44,13 +46,18 @@ class TFConverter(object): self.tf_graph = {} self.resolved_ops = {} + self.idle_mem = set() + self.op_mem = {} # op_name->mem_id + self.mem_block = {} # mem_id->[x, y] + self.total_mem_count = 0 + self.ref_counter = {} + for op in tf_ops: self.resolved_ops[op.name] = 0 for input in op.inputs: input_name = input.name[:-2] if input_name not in self.tf_graph: self.tf_graph[input_name] = [] - print input_name self.tf_graph[input_name].append(op) def add_buffer_to_image(self, input_name, input_type): @@ -104,7 +111,7 @@ class TFConverter(object): def add_output_shape(outputs, op): output_shapes = [] for output in outputs: - if output.shape is not None and not output.shape: + if output.shape.num_elements() is not None: output_shape = mace_pb2.OutputShape() output_shape.dims.extend(output.shape.as_list()) output_shapes.append(output_shape) @@ -209,12 +216,21 @@ class TFConverter(object): def convert_batchnorm(self, op): bn_ops = [] bn_ops.append(op) - for i in range(1, 7): + for i in range(1, 3): if len(self.tf_graph[bn_ops[i-1].name]) == 1 \ and self.tf_graph[bn_ops[i-1].name][0].type == BATCH_NORM_ORDER[i]: bn_ops.append(self.tf_graph[bn_ops[i-1].name][0]) else: raise Exception('Invalid BatchNorm Op') + if len(self.tf_graph[bn_ops[2].name]) == 2 \ + and self.tf_graph[bn_ops[2].name][0].type == BATCH_NORM_ORDER[3] \ + and self.tf_graph[bn_ops[2].name][1].type == BATCH_NORM_ORDER[4]: + bn_ops.append(self.tf_graph[bn_ops[2].name][0]) + bn_ops.append(self.tf_graph[bn_ops[2].name][1]) + else: + raise Exception('Invalid BatchNorm Op') + bn_ops.append(self.tf_graph[bn_ops[4].name][0]) + bn_ops.append(self.tf_graph[bn_ops[3].name][0]) op_def = mace_pb2.OperatorDef() arg = op_def.arg.add() @@ -246,7 +262,7 @@ class TFConverter(object): data_format_arg.s = 'NHWC' self.net_def.op.extend([op_def]) - for i in range(1, 7): + for i in range(0, 7): self.resolved_ops[bn_ops[i].name] = 1 def convert_pooling(self, op): @@ -408,6 +424,83 @@ class TFConverter(object): if self.resolved_ops[key] != 1: print 'Unresolve Op: %s' % key + @staticmethod + def _op_to_tensor(op): + return op.name + ':0' + + @staticmethod + def is_buffer_image_op(op): + return op.type == 'BufferToImage' or op.type == 'ImageToBuffer' + + def optimize(self): + consumers = {} + for op in self.net_def.op: + if self.is_buffer_image_op(op): + continue + for ipt in op.input: + if ipt not in consumers: + consumers[ipt] = [] + consumers[ipt].append(op) + # only ref op's output tensor + for op in self.net_def.op: + if self.is_buffer_image_op(op): + continue + tensor_name = self._op_to_tensor(op) + if tensor_name in consumers: + self.ref_counter[tensor_name] = len(consumers[tensor_name]) + else: + self.ref_counter[tensor_name] = 0 + + for op in self.net_def.op: + if self.is_buffer_image_op(op): + continue + if not op.output_shape: + print "Op %s don't have output shape information, No way to optimize memory." % op.name + return + if len(self.idle_mem) == 0: + # allocate new mem + mem_id = self.total_mem_count + self.total_mem_count += 1 + else: + # reuse mem + mem_id = self.idle_mem.pop() + + op.mem_id = mem_id + self.op_mem[self._op_to_tensor(op)] = mem_id + if mem_id not in self.mem_block: + self.mem_block[mem_id] = [0, 0] + mem_size = self.mem_block[mem_id] + mem_size[1] = max(mem_size[1], op.output_shape[0].dims[0] * op.output_shape[0].dims[1]) + mem_size[0] = max(mem_size[0], op.output_shape[0].dims[2] * (op.output_shape[0].dims[3]+3)/4) + + # de-ref input tensor mem + for ipt in op.input: + if ipt in self.ref_counter: + self.ref_counter[ipt] -= 1 + if self.ref_counter[ipt] == 0: + self.idle_mem.add(self.op_mem[ipt]) + elif self.ref_counter[ipt] < 0: + raise Exception('ref count is less than 0') + + for mem in self.mem_block: + arena = self.net_def.mem_arena + block = arena.mem_block.add() + block.mem_id = mem + block.x = self.mem_block[mem][0] + block.y = self.mem_block[mem][1] + + print('total op: %d', len(self.net_def.op)) + origin_mem_size = 0 + optimized_mem_size = 0 + for op in self.net_def.op: + if self.is_buffer_image_op(op): + continue + origin_mem_size += reduce(operator.mul, op.output_shape[0].dims, 1) + for mem in self.mem_block: + optimized_mem_size += reduce(operator.mul, self.mem_block[mem], 4) + + print('origin mem: %d, optimized mem: %d', origin_mem_size, optimized_mem_size) + def convert_to_mace_pb(input_graph_def, input_node, output_node, data_type, device): net_def = mace_pb2.NetDef() dt = data_type_map[data_type] @@ -418,7 +511,8 @@ def convert_to_mace_pb(input_graph_def, input_node, output_node, data_type, devi ops = graph.get_operations() converter = TFConverter(ops, net_def, dt, device) converter.convert(input_node, output_node) - - print "PB Parsed." + print "PB Converted, start optimize memory." + converter.optimize() + print "Memory optimization done." return net_def diff --git a/tools/validate_gcn.sh b/tools/validate_gcn.sh index 8ac0a1989ea98ef7cc9a55c1f1e7fad0cff06103..275f1bfb827f7f713d8b93c892c5080ba2cbd6b6 100644 --- a/tools/validate_gcn.sh +++ b/tools/validate_gcn.sh @@ -13,7 +13,6 @@ fi TF_MODEL_FILE_PATH=$1 MODEL_DIR=$(dirname ${TF_MODEL_FILE_PATH}) MACE_MODEL_NAME='mace_model.pb' -MACE_OPT_MODEL_NAME='mace_opt_model.pb' INPUT_FILE_NAME='model_input' OUTPUT_FILE_NAME='gcn.out' OUTPUT_LIST_FILE='gcn.list' @@ -36,10 +35,6 @@ bazel-bin/mace/python/tools/tf_converter --input=${TF_MODEL_FILE_PATH} \ --output_node=GCN/br_result_2/fcn_br \ --data_type=DT_HALF \ --runtime=gpu -bazel build mace/python/tools:memory_optimizer -bazel-bin/mace/python/tools/memory_optimizer ${MODEL_DIR}/${MACE_MODEL_NAME} \ - ${MODEL_DIR}/${MACE_OPT_MODEL_NAME} - # Step 3: Run model on the phone echo "Step 3: Run model on the phone" @@ -51,7 +46,7 @@ bazel build -c opt --strip always mace/examples:mace_run \ adb shell "mkdir -p ${PHONE_DATA_DIR}" adb shell "mkdir -p ${KERNEL_DIR}" adb push mace/kernels/opencl/cl/* ${KERNEL_DIR} -adb push ${MODEL_DIR}/${MACE_OPT_MODEL_NAME} ${PHONE_DATA_DIR} +adb push ${MODEL_DIR}/${MACE_MODEL_NAME} ${PHONE_DATA_DIR} adb push ${MODEL_DIR}/${INPUT_FILE_NAME} ${PHONE_DATA_DIR} adb push bazel-bin/mace/examples/mace_run ${PHONE_DATA_DIR} @@ -62,7 +57,7 @@ adb