From 6da30d2272173a7f4da646c9481c3bc57b5cf1c2 Mon Sep 17 00:00:00 2001 From: Liangliang He Date: Tue, 10 Apr 2018 14:44:56 +0800 Subject: [PATCH] Enable python style check --- .gitlab-ci.yml | 8 +- docker/Dockerfile | 3 +- mace/python/tools/binary_codegen.py | 123 +- mace/python/tools/caffe_converter_lib.py | 2122 ++++++++--------- mace/python/tools/convert_util.py | 1 - mace/python/tools/converter.py | 308 ++- mace/python/tools/dsp_ops.py | 122 +- mace/python/tools/encrypt_opencl_codegen.py | 120 +- mace/python/tools/graph_util.py | 9 +- mace/python/tools/memory_optimizer.py | 235 +- mace/python/tools/opencl_codegen.py | 151 +- mace/python/tools/source_converter_lib.py | 338 +-- mace/python/tools/tf_converter_lib.py | 2287 ++++++++++--------- mace/python/tools/tf_dsp_converter_lib.py | 875 +++---- mace/python/tools/tf_ops_stats.py | 298 +-- tools/bazel_adb_run.py | 190 +- tools/falcon_cli.py | 22 +- tools/generate_data.py | 65 +- tools/mace_tools.py | 718 +++--- tools/sh_commands.py | 271 ++- tools/validate.py | 307 ++- tools/wino_conv.py | 308 ++- 22 files changed, 4594 insertions(+), 4287 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 5b1eff0e..40dc46fb 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,5 +1,6 @@ stages: - cpplint + - pycodestyle - ops_test - ops_benchmark @@ -7,7 +8,12 @@ cpplint: stage: cpplint script: - curl -o cpplint.py https://raw.githubusercontent.com/google/styleguide/gh-pages/cpplint/cpplint.py - - python cpplint.py --linelength=80 --counting=detailed $(find mace -name *.h -or -name *.cc) + - python cpplint.py --linelength=80 --counting=detailed $(find mace -name "*.h" -or -name "*.cc") + +pycodestyle: + stage: pycodestyle + script: + - pycodestyle $(find -name "*.py") ops_test: stage: ops_test diff --git a/docker/Dockerfile b/docker/Dockerfile index 94c19f12..1dfcc83a 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -113,7 +113,8 @@ RUN pip install -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com scipy \ jinja2 \ pyyaml \ - sh + sh \ + pycodestyle # Download tensorflow tools RUN wget http://cnbj1-inner-fds.api.xiaomi.net/mace/tool/transform_graph && \ diff --git a/mace/python/tools/binary_codegen.py b/mace/python/tools/binary_codegen.py index 3be2b086..1bafd5e5 100644 --- a/mace/python/tools/binary_codegen.py +++ b/mace/python/tools/binary_codegen.py @@ -16,74 +16,75 @@ FLAGS = None def generate_cpp_source(): - data_map = {} - for binary_dir in FLAGS.binary_dirs.split(","): - binary_path = os.path.join(binary_dir, FLAGS.binary_file_name) - if not os.path.exists(binary_path): - continue + data_map = {} + for binary_dir in FLAGS.binary_dirs.split(","): + binary_path = os.path.join(binary_dir, FLAGS.binary_file_name) + if not os.path.exists(binary_path): + continue - with open(binary_path, "rb") as f: - binary_array = np.fromfile(f, dtype=np.uint8) + with open(binary_path, "rb") as f: + binary_array = np.fromfile(f, dtype=np.uint8) - print "Generate binary from", binary_path - idx = 0 - size, = struct.unpack("Q", binary_array[idx:idx+8]) - idx += 8 - for _ in xrange(size): - key_size, = struct.unpack("i", binary_array[idx:idx+4]) - idx += 4 - key, = struct.unpack(str(key_size) + "s", binary_array[idx:idx+key_size]) - idx += key_size - params_size, = struct.unpack("i", binary_array[idx:idx+4]) - idx += 4 - data_map[key] = [] - count = params_size / 4 - params = struct.unpack(str(count) + "i", binary_array[idx:idx+params_size]) - for i in params: - data_map[key].append(i) - idx += params_size + print "Generate binary 
from", binary_path + idx = 0 + size, = struct.unpack("Q", binary_array[idx:idx + 8]) + idx += 8 + for _ in xrange(size): + key_size, = struct.unpack("i", binary_array[idx:idx + 4]) + idx += 4 + key, = struct.unpack( + str(key_size) + "s", binary_array[idx:idx + key_size]) + idx += key_size + params_size, = struct.unpack("i", binary_array[idx:idx + 4]) + idx += 4 + data_map[key] = [] + count = params_size / 4 + params = struct.unpack( + str(count) + "i", binary_array[idx:idx + params_size]) + for i in params: + data_map[key].append(i) + idx += params_size + + env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0])) + return env.get_template('str2vec_maps.cc.jinja2').render( + maps=data_map, + data_type='unsigned int', + variable_name=FLAGS.variable_name) - env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0])) - return env.get_template('str2vec_maps.cc.jinja2').render( - maps = data_map, - data_type = 'unsigned int', - variable_name = FLAGS.variable_name - ) def main(unused_args): - cpp_binary_source = generate_cpp_source() - if os.path.isfile(FLAGS.output_path): - os.remove(FLAGS.output_path) - w_file = open(FLAGS.output_path, "w") - w_file.write(cpp_binary_source) - w_file.close() + cpp_binary_source = generate_cpp_source() + if os.path.isfile(FLAGS.output_path): + os.remove(FLAGS.output_path) + w_file = open(FLAGS.output_path, "w") + w_file.write(cpp_binary_source) + w_file.close() + def parse_args(): - """Parses command line arguments.""" - parser = argparse.ArgumentParser() - parser.add_argument( - "--binary_dirs", - type=str, - default="", - help="The binaries file path.") - parser.add_argument( - "--binary_file_name", - type=str, - default="mace_run.config", - help="The binary file name.") - parser.add_argument( - "--output_path", - type=str, - default="", - help="The path of generated C++ source file which contains the binary.") - parser.add_argument( - "--variable_name", - type=str, - default="kTuningParamsData", - help="global variable name.") - return parser.parse_known_args() + """Parses command line arguments.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--binary_dirs", type=str, default="", help="The binaries file path.") + parser.add_argument( + "--binary_file_name", + type=str, + default="mace_run.config", + help="The binary file name.") + parser.add_argument( + "--output_path", + type=str, + default="", + help="The path of generated C++ source file which contains the binary." 
+ ) + parser.add_argument( + "--variable_name", + type=str, + default="kTuningParamsData", + help="global variable name.") + return parser.parse_known_args() if __name__ == '__main__': - FLAGS, unparsed = parse_args() - main(unused_args=[sys.argv[0]] + unparsed) + FLAGS, unparsed = parse_args() + main(unused_args=[sys.argv[0]] + unparsed) diff --git a/mace/python/tools/caffe_converter_lib.py b/mace/python/tools/caffe_converter_lib.py index 7a94d280..7a9c0333 100644 --- a/mace/python/tools/caffe_converter_lib.py +++ b/mace/python/tools/caffe_converter_lib.py @@ -5,32 +5,26 @@ import google.protobuf.text_format import numpy as np import math -pooling_type_mode = { - 'AvgPool': 1, - 'MaxPool': 2 -} +pooling_type_mode = {'AvgPool': 1, 'MaxPool': 2} buffer_type_map = { - 'CONV2D_FILTER' : 0, - 'IN_OUT_CHANNEL' : 1, - 'ARGUMENT' : 2, - 'IN_OUT_HEIGHT' : 3, - 'IN_OUT_WIDTH' : 4, - 'WINOGRAD_FILTER' : 5, - 'DW_CONV2D_FILTER' : 6, - 'WEIGHT_HEIGHT' : 7, - 'WEIGHT_WIDTH' : 8, + 'CONV2D_FILTER': 0, + 'IN_OUT_CHANNEL': 1, + 'ARGUMENT': 2, + 'IN_OUT_HEIGHT': 3, + 'IN_OUT_WIDTH': 4, + 'WINOGRAD_FILTER': 5, + 'DW_CONV2D_FILTER': 6, + 'WEIGHT_HEIGHT': 7, + 'WEIGHT_WIDTH': 8, } -data_type_map = { - 'DT_HALF' : mace_pb2.DT_HALF, - 'DT_FLOAT': mace_pb2.DT_FLOAT -} +data_type_map = {'DT_HALF': mace_pb2.DT_HALF, 'DT_FLOAT': mace_pb2.DT_FLOAT} activation_name_map = { - 'ReLU' : 'RELU', - 'Sigmoid' : 'SIGMOID', - 'TanH' : 'TANH', + 'ReLU': 'RELU', + 'Sigmoid': 'SIGMOID', + 'TanH': 'TANH', } MACE_INPUT_NODE_NAME = "mace_input_node" @@ -38,1022 +32,1102 @@ MACE_OUTPUT_NODE_NAME = "mace_output_node" OPENCL_IMAGE_MAX_SIZE = 16384 + class Operator(object): - def __init__(self, name, type, layer): - self.name = name - self.type = type - self.layer = layer - self.parents = [] - self.children = [] - self.data = [] - self.output_shape_map = {} - - def add_parent(self, parent_op): - self.parents.append(parent_op) - parent_op.children.append(self) - - def get_single_parent(self): - if len(self.parents) != 1: - raise Exception('Operation %s expected single parent, but got %s' - % (self.name, len(self.parents))) - return self.parents[0] + def __init__(self, name, type, layer): + self.name = name + self.type = type + self.layer = layer + self.parents = [] + self.children = [] + self.data = [] + self.output_shape_map = {} + + def add_parent(self, parent_op): + self.parents.append(parent_op) + parent_op.children.append(self) + + def get_single_parent(self): + if len(self.parents) != 1: + raise Exception('Operation %s expected single parent, but got %s' % + (self.name, len(self.parents))) + return self.parents[0] + def BlobToNPArray(blob): - if blob.num != 0: - return (np.asarray(blob.data, dtype=np.float32). 
- reshape((blob.num, blob.channels, blob.height, blob.width))) - else: - return np.asarray(blob.data, dtype=np.float32).reshape(blob.shape.dim) + if blob.num != 0: + return (np.asarray(blob.data, dtype=np.float32).reshape( + (blob.num, blob.channels, blob.height, blob.width))) + else: + return np.asarray(blob.data, dtype=np.float32).reshape(blob.shape.dim) class Shapes(object): - @staticmethod - def conv_pool_shape(input_shape, filter_shape, paddings, strides, dilations, round_func, input_format='NHWC'): - output_shape = np.zeros_like(input_shape) - output_shape[0] = input_shape[0] - if input_format == 'NHWC': - # input format: NHWC, filter format: HWOI - output_shape[1] = int(round_func((input_shape[1] + paddings[0] - filter_shape[0] - - (filter_shape[0] - 1) * (dilations[0] - 1)) / float(strides[0]))) + 1 - output_shape[2] = int(round_func((input_shape[2] + paddings[1] - filter_shape[1] - - (filter_shape[1] - 1) * (dilations[1] - 1)) / float(strides[1]))) + 1 - output_shape[3] = filter_shape[2] - elif input_format == 'NCHW': - # input format: NCHW, filter format: OIHW - output_shape[1] = filter_shape[0] - output_shape[2] = int(round_func((input_shape[2] + paddings[0] - filter_shape[2] - - (filter_shape[2] - 1) * (dilations[0] - 1)) / float(strides[0]))) + 1 - output_shape[3] = int(round_func((input_shape[3] + paddings[1] - filter_shape[3] - - (filter_shape[3] - 1) * (dilations[1] - 1)) / float(strides[1]))) + 1 - else: - raise Exception("format %s is not supported" % input_format) - - return output_shape - - @staticmethod - def fully_connected_shape(input_shape, weight_shape): - return [input_shape[0], 1, 1, weight_shape[0]] - - @staticmethod - def concat_shape(input_shapes, axis): - output_shape = None - for input_shape in input_shapes: - if output_shape is None: - output_shape = list(input_shape) - else: - output_shape[axis] += input_shape[axis] - return output_shape - - @staticmethod - def slice_shape(input_shape, num_output, input_format='NHWC'): - if input_format == 'NHWC': - return [input_shape[0], input_shape[1], input_shape[2], input_shape[3]/num_output] - elif input_format == 'NCHW': - return [input_shape[0], input_shape[1]/num_output, input_shape[2], input_shape[3]] - else: - raise Exception("format %s is not supported" % input_format) + @staticmethod + def conv_pool_shape(input_shape, + filter_shape, + paddings, + strides, + dilations, + round_func, + input_format='NHWC'): + output_shape = np.zeros_like(input_shape) + output_shape[0] = input_shape[0] + if input_format == 'NHWC': + # input format: NHWC, filter format: HWOI + output_shape[1] = int( + round_func((input_shape[1] + paddings[0] - filter_shape[0] - + (filter_shape[0] - 1) * + (dilations[0] - 1)) / float(strides[0]))) + 1 + output_shape[2] = int( + round_func((input_shape[2] + paddings[1] - filter_shape[1] - + (filter_shape[1] - 1) * + (dilations[1] - 1)) / float(strides[1]))) + 1 + output_shape[3] = filter_shape[2] + elif input_format == 'NCHW': + # input format: NCHW, filter format: OIHW + output_shape[1] = filter_shape[0] + output_shape[2] = int( + round_func((input_shape[2] + paddings[0] - filter_shape[2] - + (filter_shape[2] - 1) * + (dilations[0] - 1)) / float(strides[0]))) + 1 + output_shape[3] = int( + round_func((input_shape[3] + paddings[1] - filter_shape[3] - + (filter_shape[3] - 1) * + (dilations[1] - 1)) / float(strides[1]))) + 1 + else: + raise Exception("format %s is not supported" % input_format) + + return output_shape + + @staticmethod + def fully_connected_shape(input_shape, weight_shape): + return 
[input_shape[0], 1, 1, weight_shape[0]] + + @staticmethod + def concat_shape(input_shapes, axis): + output_shape = None + for input_shape in input_shapes: + if output_shape is None: + output_shape = list(input_shape) + else: + output_shape[axis] += input_shape[axis] + return output_shape + + @staticmethod + def slice_shape(input_shape, num_output, input_format='NHWC'): + if input_format == 'NHWC': + return [ + input_shape[0], input_shape[1], input_shape[2], + input_shape[3] / num_output + ] + elif input_format == 'NCHW': + return [ + input_shape[0], input_shape[1] / num_output, input_shape[2], + input_shape[3] + ] + else: + raise Exception("format %s is not supported" % input_format) + # outputs' name is [op.name + '_' + #] class CaffeConverter(object): - def __init__(self, caffe_net, weights, net_def, dt, device, winograd): - self.net_def = net_def - self.caffe_net = caffe_net - self.weights = weights - self.dt = dt - self.device = device - self.winograd = winograd - self.resolved_ops = set() - self.ops = [] - self.inputs_map = {} # caffe op name -> mace inputs' name - - # Add Input operations - top_name_map = {} - inputs = caffe_net.input - for input in inputs: - self.ops.extend([Operator(input, 'Input', None)]) - top_name_map[input] = input - - layers = caffe_net.layer - # remove train layers and dropout - layers = self.remove_unused_layers(layers) - - # Construct graph - # Only support single-output layer - # layer with single output often use the same top name. - self.ops.extend([Operator(layer.name, layer.type, layer) for layer in layers]) - - self.ops_map = {op.name : op for op in self.ops} - output_op_map = {} - for layer in layers: - op = self.ops_map[layer.name] - for input_name in layer.bottom: - assert input_name != layer.name - parent_op = output_op_map.get(input_name) - if parent_op is None: - parent_op = self.ops_map[input_name] - op.add_parent(parent_op) - if op.name not in self.inputs_map: - self.inputs_map[op.name] = [] - self.inputs_map[op.name].extend([top_name_map[input_name]]) - for i in range(len(layer.top)): - output_name = layer.top[i] - if len(layer.top) == 1: - top_name_map[output_name] = op.name + def __init__(self, caffe_net, weights, net_def, dt, device, winograd): + self.net_def = net_def + self.caffe_net = caffe_net + self.weights = weights + self.dt = dt + self.device = device + self.winograd = winograd + self.resolved_ops = set() + self.ops = [] + self.inputs_map = {} # caffe op name -> mace inputs' name + + # Add Input operations + top_name_map = {} + inputs = caffe_net.input + for input in inputs: + self.ops.extend([Operator(input, 'Input', None)]) + top_name_map[input] = input + + layers = caffe_net.layer + # remove train layers and dropout + layers = self.remove_unused_layers(layers) + + # Construct graph + # Only support single-output layer + # layer with single output often use the same top name. 
+ self.ops.extend( + [Operator(layer.name, layer.type, layer) for layer in layers]) + + self.ops_map = {op.name: op for op in self.ops} + output_op_map = {} + for layer in layers: + op = self.ops_map[layer.name] + for input_name in layer.bottom: + assert input_name != layer.name + parent_op = output_op_map.get(input_name) + if parent_op is None: + parent_op = self.ops_map[input_name] + op.add_parent(parent_op) + if op.name not in self.inputs_map: + self.inputs_map[op.name] = [] + self.inputs_map[op.name].extend([top_name_map[input_name]]) + for i in range(len(layer.top)): + output_name = layer.top[i] + if len(layer.top) == 1: + top_name_map[output_name] = op.name + else: + top_name_map[output_name] = op.name + '_' + str(i) + if output_name == layer.name: + continue + output_op_map[output_name] = op + + # Load weights + weights_layers = weights.layer + for layer in weights_layers: + if not layer.blobs: + continue + if layer.name in self.ops_map: + op = self.ops_map[layer.name] + op.data = [BlobToNPArray(blob) for blob in layer.blobs] + + # toposort ops + self.ops = self.toposort_ops() + + def CommonConvert(self, op, mace_type): + op_def = mace_pb2.OperatorDef() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + data_format_arg = op_def.arg.add() + data_format_arg.name = 'data_format' + if self.device == 'neon': + data_format_arg.s = 'NCHW' else: - top_name_map[output_name] = op.name + '_' + str(i) - if output_name == layer.name: - continue - output_op_map[output_name] = op - - - # Load weights - weights_layers = weights.layer - for layer in weights_layers: - if not layer.blobs: - continue - if layer.name in self.ops_map: - op = self.ops_map[layer.name] - op.data = [BlobToNPArray(blob) for blob in layer.blobs] - - # toposort ops - self.ops = self.toposort_ops() - - def CommonConvert(self, op, mace_type): - op_def = mace_pb2.OperatorDef() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - data_format_arg = op_def.arg.add() - data_format_arg.name = 'data_format' - if self.device == 'neon': - data_format_arg.s = 'NCHW' - else: - data_format_arg.s = 'NHWC' - op_def.name = op.name - op_def.type = mace_type - op_def.input.extend([name+':0' for name in self.inputs_map[op.name]]) - return op_def - - def remove_unused_layers(self, layers): - phase_map = {0: 'train', 1: 'test'} - test_layers_names = set() - test_layers = [] - for layer in layers: - phase = 'test' - if len(layer.include): - phase = phase_map[layer.include[0].phase] - if len(layer.exclude): - phase = phase_map[layer.exclude[0].phase] - if phase == 'test' and layer.type != 'Dropout': - test_layers.append(layer) - assert layer.name not in test_layers_names - test_layers_names.add(layer.name) - return test_layers - - def toposort_ops(self): - sorted_ops = [] - temp_visited = set() - visited = set() - - def search(op): - if op.name in temp_visited: - raise Exception("The model is not DAG") - if op.name in visited: - return - temp_visited.add(op.name) - for parent_op in op.parents: - search(parent_op) - temp_visited.remove(op.name) - sorted_ops.append(op) - visited.add(op.name) - - for op in self.ops: - search(op) - - return sorted_ops - - def add_buffer_to_image(self, input_name, input_type): - output_name = input_name[:-2] + "_b2i" + input_name[-2:] - op_def = self.net_def.op.add() - op_def.name = output_name[:-2] - op_def.type = 'BufferToImage' - op_def.input.extend([input_name]) - op_def.output.extend([output_name]) - - arg = op_def.arg.add() - arg.name = 'buffer_type' - arg.i = buffer_type_map[input_type] - arg = 
op_def.arg.add() - arg.name = 'mode' - arg.i = 0 - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - return output_name - - def add_image_to_buffer(self, input_name, input_type): - output_name = input_name[:-2] + "_i2b" + input_name[-2:] - op_def = self.net_def.op.add() - op_def.name = output_name[:-2] - op_def.type = 'ImageToBuffer' - op_def.input.extend([input_name]) - op_def.output.extend([output_name]) - - arg = op_def.arg.add() - arg.name = 'buffer_type' - arg.i = buffer_type_map[input_type] - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - return output_name - - def add_input_transform(self, names): - for name in names: - new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0" - op_def = self.net_def.op.add() - op_def.name = name - op_def.type = 'BufferToImage' - op_def.input.extend([new_input_name]) - op_def.output.extend([name+':0']) - - epsilon_arg = op_def.arg.add() - epsilon_arg.name = 'buffer_type' - epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL'] - - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - - def add_output_transform(self, names): - for name in names: - output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0" - op_def = self.net_def.op.add() - op_def.name = output_name[:-2] - op_def.type = 'ImageToBuffer' - op_def.input.extend([name+':0']) - op_def.output.extend([output_name]) - - epsilon_arg = op_def.arg.add() - epsilon_arg.name = 'buffer_type' - epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL'] - - def add_tensor(self, name, value): - tensor = self.net_def.tensors.add() - tensor.name = name - - shape = list(value.shape) - tensor.dims.extend(shape) - - tensor.data_type = mace_pb2.DT_FLOAT - tensor.float_data.extend(value.flat) - - @staticmethod - def add_output_shape(op_def, output_shape): - mace_output_shape = mace_pb2.OutputShape() - mace_output_shape.dims.extend(output_shape) - op_def.output_shape.extend([mace_output_shape]) - - def add_stride_pad_kernel_arg(self, param, op_def): - try: - if len(param.stride) > 1 or len(param.kernel_size) > 1 or len(param.pad) > 1: - raise Exception('Mace does not support multiple stride/kernel_size/pad') - stride = [param.stride[0], param.stride[0]] if len(param.stride) else [1, 1] - pad = [param.pad[0] * 2, param.pad[0] * 2] if len(param.pad) else [0, 0] - kernel = [param.kernel_size[0], param.kernel_size[0]] if len(param.kernel_size) else [0, 0] - except TypeError: - stride = [param.stride, param.stride] - pad = [param.pad * 2, param.pad * 2] - kernel = [param.kernel_size, param.kernel_size] - - if param.HasField("stride_h") or param.HasField("stride_w"): - stride = [param.stride_h, param.stride_w] - # Pad - if param.HasField("pad_h") or param.HasField("pad_w"): - pad = [param.pad_h * 2, param.pad_w * 2] - - if op_def is not None: - strides_arg = op_def.arg.add() - strides_arg.name = 'strides' - strides_arg.ints.extend(stride) - - padding_arg = op_def.arg.add() - padding_arg.name = 'padding_values' - padding_arg.ints.extend(pad) - - if op_def.type == 'Pooling': - if param.HasField("kernel_h") or param.HasField("kernel_w"): - kernel = [param.kernel_h, param.kernel_w] - - return pad, stride, kernel - - def convert_conv2d(self, op): - param = op.layer.convolution_param - is_depthwise = False - if param.HasField('group'): - if param.group == op.data[0].shape[0] and op.data[0].shape[1] == 1: - is_depthwise = True - else: - raise Exception("Mace do not support group convolution yet") - - if is_depthwise: - op_def = self.CommonConvert(op, 'DepthwiseConv2d') - else: - op_def = self.CommonConvert(op, 
'Conv2D') + data_format_arg.s = 'NHWC' + op_def.name = op.name + op_def.type = mace_type + op_def.input.extend([name + ':0' for name in self.inputs_map[op.name]]) + return op_def + + def remove_unused_layers(self, layers): + phase_map = {0: 'train', 1: 'test'} + test_layers_names = set() + test_layers = [] + for layer in layers: + phase = 'test' + if len(layer.include): + phase = phase_map[layer.include[0].phase] + if len(layer.exclude): + phase = phase_map[layer.exclude[0].phase] + if phase == 'test' and layer.type != 'Dropout': + test_layers.append(layer) + assert layer.name not in test_layers_names + test_layers_names.add(layer.name) + return test_layers + + def toposort_ops(self): + sorted_ops = [] + temp_visited = set() + visited = set() + + def search(op): + if op.name in temp_visited: + raise Exception("The model is not DAG") + if op.name in visited: + return + temp_visited.add(op.name) + for parent_op in op.parents: + search(parent_op) + temp_visited.remove(op.name) + sorted_ops.append(op) + visited.add(op.name) + + for op in self.ops: + search(op) + + return sorted_ops + + def add_buffer_to_image(self, input_name, input_type): + output_name = input_name[:-2] + "_b2i" + input_name[-2:] + op_def = self.net_def.op.add() + op_def.name = output_name[:-2] + op_def.type = 'BufferToImage' + op_def.input.extend([input_name]) + op_def.output.extend([output_name]) + + arg = op_def.arg.add() + arg.name = 'buffer_type' + arg.i = buffer_type_map[input_type] + arg = op_def.arg.add() + arg.name = 'mode' + arg.i = 0 + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + return output_name + + def add_image_to_buffer(self, input_name, input_type): + output_name = input_name[:-2] + "_i2b" + input_name[-2:] + op_def = self.net_def.op.add() + op_def.name = output_name[:-2] + op_def.type = 'ImageToBuffer' + op_def.input.extend([input_name]) + op_def.output.extend([output_name]) + + arg = op_def.arg.add() + arg.name = 'buffer_type' + arg.i = buffer_type_map[input_type] + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + return output_name + + def add_input_transform(self, names): + for name in names: + new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0" + op_def = self.net_def.op.add() + op_def.name = name + op_def.type = 'BufferToImage' + op_def.input.extend([new_input_name]) + op_def.output.extend([name + ':0']) + + epsilon_arg = op_def.arg.add() + epsilon_arg.name = 'buffer_type' + epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL'] + + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + + def add_output_transform(self, names): + for name in names: + output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0" + op_def = self.net_def.op.add() + op_def.name = output_name[:-2] + op_def.type = 'ImageToBuffer' + op_def.input.extend([name + ':0']) + op_def.output.extend([output_name]) + + epsilon_arg = op_def.arg.add() + epsilon_arg.name = 'buffer_type' + epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL'] + + def add_tensor(self, name, value): + tensor = self.net_def.tensors.add() + tensor.name = name + + shape = list(value.shape) + tensor.dims.extend(shape) + + tensor.data_type = mace_pb2.DT_FLOAT + tensor.float_data.extend(value.flat) + + @staticmethod + def add_output_shape(op_def, output_shape): + mace_output_shape = mace_pb2.OutputShape() + mace_output_shape.dims.extend(output_shape) + op_def.output_shape.extend([mace_output_shape]) + + def add_stride_pad_kernel_arg(self, param, op_def): + try: + if len(param.stride) > 1 or len(param.kernel_size) > 1 or len( + param.pad) > 1: + 
raise Exception( + 'Mace does not support multiple stride/kernel_size/pad') + stride = [param.stride[0], + param.stride[0]] if len(param.stride) else [1, 1] + pad = [param.pad[0] * 2, + param.pad[0] * 2] if len(param.pad) else [0, 0] + kernel = [param.kernel_size[0], param.kernel_size[0]] if len( + param.kernel_size) else [0, 0] + except TypeError: + stride = [param.stride, param.stride] + pad = [param.pad * 2, param.pad * 2] + kernel = [param.kernel_size, param.kernel_size] + + if param.HasField("stride_h") or param.HasField("stride_w"): + stride = [param.stride_h, param.stride_w] + # Pad + if param.HasField("pad_h") or param.HasField("pad_w"): + pad = [param.pad_h * 2, param.pad_w * 2] + + if op_def is not None: + strides_arg = op_def.arg.add() + strides_arg.name = 'strides' + strides_arg.ints.extend(stride) + + padding_arg = op_def.arg.add() + padding_arg.name = 'padding_values' + padding_arg.ints.extend(pad) + + if op_def.type == 'Pooling': + if param.HasField("kernel_h") or param.HasField("kernel_w"): + kernel = [param.kernel_h, param.kernel_w] + + return pad, stride, kernel + + def convert_conv2d(self, op): + param = op.layer.convolution_param + is_depthwise = False + if param.HasField('group'): + if param.group == op.data[0].shape[0] and op.data[0].shape[1] == 1: + is_depthwise = True + else: + raise Exception("Mace do not support group convolution yet") + + if is_depthwise: + op_def = self.CommonConvert(op, 'DepthwiseConv2d') + else: + op_def = self.CommonConvert(op, 'Conv2D') - # Add filter - weight_tensor_name = op.name + '_weight:0' - if self.device == 'neon': - weight_data = op.data[0] - else: - # OIHW -> HWOI - weight_data = op.data[0].transpose((2, 3, 0, 1)) - self.add_tensor(weight_tensor_name, weight_data) - - if self.device == 'gpu': - buffer_type = "DW_CONV2D_FILTER" if is_depthwise else "CONV2D_FILTER" - output_name = self.add_buffer_to_image(weight_tensor_name, buffer_type) - op_def.input.extend([output_name]) - else: - op_def.input.extend([weight_tensor_name]) - - # Add Bias - if len(op.data) == 2: - bias_tensor_name = op.name + '_bias:0' - bias_data = op.data[1].reshape(-1) - self.add_tensor(bias_tensor_name, bias_data) - if self.device == 'gpu': - output_name = self.add_buffer_to_image(bias_tensor_name, "ARGUMENT") - op_def.input.extend([output_name]) - else: - op_def.input.extend([bias_tensor_name]) - - paddings, strides, _ = self.add_stride_pad_kernel_arg(param, op_def) - dilations = [1, 1] - if len(param.dilation) > 0: - dilation_arg = op_def.arg.add() - dilation_arg.name = 'dilations' - if len(param.dilation) == 1: - dilations = [param.dilation[0], param.dilation[0]] - elif len(param.dilation) == 2: - dilations = [param.dilation[0], param.dilation[1]] - dilation_arg.ints.extend(dilations) - final_op = op - self.resolved_ops.add(op.name) - - input_format = 'NCHW' if self.device == 'neon' else 'NHWC' - output_shape = Shapes.conv_pool_shape(op.get_single_parent().output_shape_map[op.layer.bottom[0]], - weight_data.shape, - paddings, strides, dilations, - math.floor, input_format) - op.output_shape_map[op.layer.top[0]] = output_shape - - if len(self.ops_map[final_op.name].children) == 1 \ - and self.ops_map[final_op.name].children[0].type in activation_name_map: - activation_op = self.ops_map[final_op.name].children[0] - if not is_depthwise: - op_def.type = "FusedConv2D" - fused_act_arg = op_def.arg.add() - fused_act_arg.name = 'activation' - fused_act_arg.s = activation_name_map[activation_op.type] - final_op = activation_op - 
final_op.output_shape_map[final_op.layer.top[0]] = output_shape - self.resolved_ops.add(activation_op.name) - - op_def.output.extend([final_op.name+':0']) - self.add_output_shape(op_def, output_shape) - self.net_def.op.extend([op_def]) - - def check_winograd_conv(self, op): - # TODO: support winograd conv on neon - if self.device == 'neon': - return False - param = op.layer.convolution_param - filter_shape = np.asarray(op.data[0].shape) - if self.device != 'neon': - filter_shape = filter_shape[[2, 3, 0, 1]] # OIHW -> HWOI - paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None) - - dilations = [1, 1] - if len(param.dilation) > 0: - if len(param.dilation) == 1: - dilations = [param.dilation[0], param.dilation[0]] - elif len(param.dilation) == 2: - dilations = [param.dilation[0], param.dilation[1]] - - input_format = 'NCHW' if self.device == 'neon' else 'NHWC' - output_shape = Shapes.conv_pool_shape( - op.get_single_parent().output_shape_map[op.layer.bottom[0]], - filter_shape, paddings, strides, dilations, math.floor, input_format) - width = output_shape[0] * ((output_shape[1] + 1)/2) * ((output_shape[2]+1)/2) - if self.winograd and dilations[0] == 1 and (dilations[0] == dilations[1]) and \ - (strides[0] == 1) and (strides[0] == strides[1]): - if self.device == 'gpu': - return filter_shape[0] == 3 and (filter_shape[0] == filter_shape[1]) and \ - (16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \ - (16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \ - (width < OPENCL_IMAGE_MAX_SIZE) - elif self.device == 'neon': - return filter_shape[2] == 3 and (filter_shape[2] == filter_shape[3]) - return False - - def convert_winograd_conv(self, op): - # Add filter - weight_tensor_name = op.name + '_weight:0' - self.add_tensor(weight_tensor_name, op.data[0]) - - buffer_type = "WINOGRAD_FILTER" - filter_name = self.add_buffer_to_image(weight_tensor_name, buffer_type) - - param = op.layer.convolution_param - paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None) - - filter_shape = np.asarray(op.data[0].shape) - if self.device != 'neon': - filter_shape = filter_shape[[2, 3, 0, 1]] # OIHW -> HWOI - - input_format = 'NCHW' if self.device == 'neon' else 'NHWC' - output_shape = Shapes.conv_pool_shape( - op.get_single_parent().output_shape_map[op.layer.bottom[0]], - filter_shape, paddings, strides, [1, 1], math.floor, input_format) - - # Input transform - wt_op = mace_pb2.OperatorDef() - arg = wt_op.arg.add() - arg.name = 'T' - arg.i = self.dt - padding_arg = wt_op.arg.add() - padding_arg.name = 'padding_values' - padding_arg.ints.extend(paddings) - wt_op.name = op.name + '_input_transform' - wt_op.type = 'WinogradTransform' - wt_op.input.extend([name+':0' for name in self.inputs_map[op.name]]) - wt_output_name = wt_op.name + ":0" - wt_op.output.extend([wt_output_name]) - wt_output_shape = mace_pb2.OutputShape() - if self.device != 'neon': - wt_output_width = output_shape[0] * ((output_shape[1] + 1)/2) * ((output_shape[2]+1)/2) - wt_output_shape.dims.extend([16, filter_shape[3], wt_output_width, 1]) - else: - wt_output_width = output_shape[0] * ((output_shape[2] + 1)/2) * ((output_shape[3]+1)/2) - wt_output_shape.dims.extend([16, filter_shape[1], wt_output_width, 1]) - wt_op.output_shape.extend([wt_output_shape]) - - # MatMul - matmul_op = mace_pb2.OperatorDef() - arg = matmul_op.arg.add() - arg.name = 'T' - arg.i = self.dt - matmul_op.name = op.name + '_matmul' - matmul_op.type = 'MatMul' - matmul_op.input.extend([filter_name, wt_output_name]) - matmul_output_name = matmul_op.name + 
":0" - matmul_op.output.extend([matmul_output_name]) - matmul_output_shape = mace_pb2.OutputShape() - if self.device != 'neon': - matmul_output_shape.dims.extend([16, filter_shape[2], wt_output_width, 1]) - else: - matmul_output_shape.dims.extend([16, filter_shape[0], wt_output_width, 1]) - matmul_op.output_shape.extend([matmul_output_shape]) - - # Inverse transform - iwt_op = mace_pb2.OperatorDef() - arg = iwt_op.arg.add() - arg.name = 'T' - arg.i = self.dt - batch_arg = iwt_op.arg.add() - batch_arg.name = 'batch' - batch_arg.i = output_shape[0] - height_arg = iwt_op.arg.add() - height_arg.name = 'height' - height_arg.i = output_shape[1] if self.device != 'neon' else output_shape[2] - width_arg = iwt_op.arg.add() - width_arg.name = 'width' - width_arg.i = output_shape[2] if self.device != 'neon' else output_shape[3] - iwt_op.name = op.name + '_inverse_transform' - iwt_op.type = 'WinogradInverseTransform' - iwt_op.input.extend([matmul_output_name]) - - # Add Bias - if len(op.data) == 2: - bias_tensor_name = op.name + '_bias:0' - bias_data = op.data[1].reshape(-1) - self.add_tensor(bias_tensor_name, bias_data) - output_name = self.add_buffer_to_image(bias_tensor_name, "ARGUMENT") - iwt_op.input.extend([output_name]) - - final_op = op - final_op.output_shape_map[final_op.layer.top[0]] = output_shape - self.resolved_ops.add(op.name) - - if len(self.ops_map[final_op.name].children) == 1 \ - and self.ops_map[final_op.name].children[0].type in activation_name_map: - activation_op = self.ops_map[final_op.name].children[0] - fused_act_arg = iwt_op.arg.add() - fused_act_arg.name = 'activation' - fused_act_arg.s = activation_name_map[activation_op.type] - final_op = activation_op - final_op.output_shape_map[final_op.layer.top[0]] = output_shape - self.resolved_ops.add(activation_op.name) - - iwt_op.output.extend([final_op.name+':0']) - self.add_output_shape(iwt_op, output_shape) - self.net_def.op.extend([wt_op, matmul_op, iwt_op]) - - def convert_batchnorm(self, op): - if len(op.children) != 1 or op.children[0].type != 'Scale': - raise Exception('Now only support BatchNorm+Scale') - op_def = self.CommonConvert(op, 'FoldedBatchNorm') - scale_op = op.children[0] - - epsilon_value = op.layer.batch_norm_param.eps - if op.data[2][0] != 0: - mean_value = (1. / op.data[2][0]) * op.data[0] - var_value = (1. 
/ op.data[2][0]) * op.data[1] - else: - raise RuntimeError('scalar is zero.') - - gamma_value = scale_op.data[0] - beta_value = np.zeros_like(mean_value) - if len(scale_op.data) == 2: - beta_value = scale_op.data[1] - - scale_value = ( - (1.0 / np.vectorize(math.sqrt)(var_value + epsilon_value)) * - gamma_value).reshape(-1) - offset_value = ((-mean_value * scale_value) + beta_value).reshape(-1) - input_names = [op.name+'_scale:0', op.name+'_offset:0'] - self.add_tensor(input_names[0], scale_value) - self.add_tensor(input_names[1], offset_value) - - if self.device == 'gpu': - for name in input_names: - output_name = self.add_buffer_to_image(name, "ARGUMENT") - op_def.input.extend([output_name]) - else: - op_def.input.extend([name for name in input_names]) - - self.resolved_ops.add(op.name) - self.resolved_ops.add(scale_op.name) - final_op = scale_op - - output_shape = op.get_single_parent().output_shape_map[op.layer.bottom[0]] - - if len(self.ops_map[final_op.name].children) == 1 \ - and self.ops_map[final_op.name].children[0].type in activation_name_map: - activation_op = self.ops_map[final_op.name].children[0] - fused_act_arg = op_def.arg.add() - fused_act_arg.name = 'activation' - fused_act_arg.s = activation_name_map[activation_op.type] - final_op = activation_op - final_op.output_shape_map[final_op.layer.top[0]] = output_shape - self.resolved_ops.add(activation_op.name) - - op_def.output.extend([final_op.name + ':0']) - self.add_output_shape(op_def, output_shape) - self.net_def.op.extend([op_def]) - - def convert_inner_product(self, op): - param = op.layer.inner_product_param - try: - if param.axis != 1 or param.transpose: - raise ValueError('Do not support non-default axis and transpose ' - 'case for innner product') - except AttributeError: - pass - - op_def = self.CommonConvert(op, 'FC') - weight_tensor_name = op.name + '_weight:0' - if op.data[0].ndim not in [2, 4]: - raise ValueError('Unexpected weigth ndim.') - if op.data[0].ndim == 4 and list(op.data[0].shape[:2]) != [1, 1]: - raise ValueError('Do not support 4D weight with shape [1, 1, *, *]') - input_shape = op.get_single_parent().output_shape_map[op.layer.bottom[0]] - - weight_data = op.data[0].reshape(-1, op.data[0].shape[-1]) - assert weight_data.shape[1] == (input_shape[1] * input_shape[2] * input_shape[3]) - if self.device != 'neon': - weight_data = weight_data.reshape(-1, input_shape[3], input_shape[1], input_shape[2]) - weight_data = weight_data.transpose((0, 2, 3, 1)).reshape(weight_data.shape[0], -1) - self.add_tensor(weight_tensor_name, weight_data) - if self.device == 'gpu': - if (weight_data.shape[0] + 3) / 4 > OPENCL_IMAGE_MAX_SIZE \ - and (weight_data.shape[1] + 3) / 4 > OPENCL_IMAGE_MAX_SIZE: - raise Exception('Mace gpu do not support FC with weight shape: ' - +str(weight_data.shape)) - if input_shape[3] % 4 == 0: - buffer_type = "WEIGHT_WIDTH" - else: - buffer_type = "WEIGHT_HEIGHT" - weight_type_arg = op_def.arg.add() - weight_type_arg.name = 'weight_type' - weight_type_arg.i = buffer_type_map['WEIGHT_HEIGHT'] - - if buffer_type == "WEIGHT_HEIGHT" and \ - (weight_data.shape[0] + 3) / 4 > OPENCL_IMAGE_MAX_SIZE: - raise Exception('Mace gpu do not support FC with weight shape: ' - +str(weight_data.shape)) - output_name = self.add_buffer_to_image(weight_tensor_name, buffer_type) - op_def.input.extend([output_name]) - else: - op_def.input.extend([weight_tensor_name]) - - # Add Bias - if len(op.data) == 2: - bias_tensor_name = op.name + '_bias:0' - bias_data = op.data[1].reshape(-1) - 
self.add_tensor(bias_tensor_name, bias_data) - if self.device == 'gpu': - output_name = self.add_buffer_to_image(bias_tensor_name, "ARGUMENT") - op_def.input.extend([output_name]) - else: - op_def.input.extend([bias_tensor_name]) - - self.resolved_ops.add(op.name) - output_shape = Shapes.fully_connected_shape(input_shape, weight_data.shape) - op.output_shape_map[op.layer.top[0]] = output_shape - final_op = op - - if len(self.ops_map[final_op.name].children) == 1 \ - and self.ops_map[final_op.name].children[0].type in activation_name_map: - activation_op = self.ops_map[final_op.name].children[0] - fused_act_arg = op_def.arg.add() - fused_act_arg.name = 'activation' - fused_act_arg.s = activation_name_map[activation_op.type] - final_op = activation_op - final_op.output_shape_map[final_op.layer.top[0]] = output_shape - self.resolved_ops.add(activation_op.name) - - op_def.output.extend([final_op.name + ':0']) - self.add_output_shape(op_def, output_shape) - self.net_def.op.extend([op_def]) - - def convert_pooling(self, op): - op_def = self.CommonConvert(op, 'Pooling') - - param = op.layer.pooling_param - paddings, strides, kernels = self.add_stride_pad_kernel_arg(param, op_def) - if param.pool == caffe_pb2.PoolingParameter.MAX: - pooling_type = "MaxPool" - elif param.pool == caffe_pb2.PoolingParameter.AVE: - pooling_type = "AvgPool" - pooling_type_arg = op_def.arg.add() - pooling_type_arg.name = 'pooling_type' - pooling_type_arg.i = pooling_type_mode[pooling_type] - - input_shape = op.get_single_parent().output_shape_map[op.layer.bottom[0]] - if param.HasField('global_pooling') and param.global_pooling: - kernels = [input_shape[1], input_shape[2]] - - kernel_arg = op_def.arg.add() - kernel_arg.name = 'kernels' - kernel_arg.ints.extend(kernels) - - filter_shape = [kernels[0], kernels[1], input_shape[3], input_shape[3]] \ - if self.device != 'neon' else \ - [input_shape[1], input_shape[1], kernels[0], kernels[1]] - input_format = 'NCHW' if self.device == 'neon' else 'NHWC' - output_shape = Shapes.conv_pool_shape(input_shape, filter_shape, - paddings, strides, [1, 1], math.ceil, input_format) - op.output_shape_map[op.layer.top[0]] = output_shape - - op_def.output.extend([op.name + ':0']) - self.add_output_shape(op_def, output_shape) - self.net_def.op.extend([op_def]) - self.resolved_ops.add(op.name) - - def convert_activation(self, op): - op_def = self.CommonConvert(op, 'Activation') - activation_arg = op_def.arg.add() - activation_arg.name = 'activation' - activation_arg.s = activation_name_map[op.type] - op_def.output.extend([op.name + ':0']) - output_shape = op.get_single_parent().output_shape_map[op.layer.bottom[0]] - op.output_shape_map[op.layer.top[0]] = output_shape - self.add_output_shape(op_def, output_shape) - self.net_def.op.extend([op_def]) - self.resolved_ops.add(op.name) - - def convert_prelu(self, op): - op_def = self.CommonConvert(op, 'Activation') - activation_arg = op_def.arg.add() - activation_arg.name = 'activation' - activation_arg.s = 'PRELU' - alpha_tensor_name = op.name + '_alpha:0' - alpha_data = op.data[0].reshape(-1) - self.add_tensor(alpha_tensor_name, alpha_data) - if self.device == 'gpu': - output_name = self.add_buffer_to_image(alpha_tensor_name, "ARGUMENT") - op_def.input.extend([output_name]) - else: - op_def.input.extend([alpha_tensor_name]) - op_def.output.extend([op.name + ':0']) - output_shape = op.get_single_parent().output_shape_map[op.layer.bottom[0]] - op.output_shape_map[op.layer.top[0]] = output_shape - self.add_output_shape(op_def, output_shape) - 
self.net_def.op.extend([op_def]) - self.resolved_ops.add(op.name) - - def convert_add(self, op): - op_def = self.CommonConvert(op, 'AddN') - op_def.output.extend([op.name + ':0']) - output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]] - op.output_shape_map[op.layer.top[0]] = output_shape - self.add_output_shape(op_def, output_shape) - self.net_def.op.extend([op_def]) - self.resolved_ops.add(op.name) - - def convert_concat(self, op): - op_def = self.CommonConvert(op, 'Concat') - axis_arg = op_def.arg.add() - axis_arg.name = 'axis' - axis_arg.i = 3 if self.device != 'neon' else 1 - try: - if op.layer.concat_param.HasFeild('axis'): - axis_arg.i = op.concat_param.axis - elif op.layer.concat_param.HasFeild('concat_dim'): - axis_arg.i = op.concat_param.concat_dim - except AttributeError: - pass + # Add filter + weight_tensor_name = op.name + '_weight:0' + if self.device == 'neon': + weight_data = op.data[0] + else: + # OIHW -> HWOI + weight_data = op.data[0].transpose((2, 3, 0, 1)) + self.add_tensor(weight_tensor_name, weight_data) + + if self.device == 'gpu': + buffer_type = "DW_CONV2D_FILTER" \ + if is_depthwise else "CONV2D_FILTER" + output_name = self.add_buffer_to_image(weight_tensor_name, + buffer_type) + op_def.input.extend([output_name]) + else: + op_def.input.extend([weight_tensor_name]) + + # Add Bias + if len(op.data) == 2: + bias_tensor_name = op.name + '_bias:0' + bias_data = op.data[1].reshape(-1) + self.add_tensor(bias_tensor_name, bias_data) + if self.device == 'gpu': + output_name = self.add_buffer_to_image(bias_tensor_name, + "ARGUMENT") + op_def.input.extend([output_name]) + else: + op_def.input.extend([bias_tensor_name]) + + paddings, strides, _ = self.add_stride_pad_kernel_arg(param, op_def) + dilations = [1, 1] + if len(param.dilation) > 0: + dilation_arg = op_def.arg.add() + dilation_arg.name = 'dilations' + if len(param.dilation) == 1: + dilations = [param.dilation[0], param.dilation[0]] + elif len(param.dilation) == 2: + dilations = [param.dilation[0], param.dilation[1]] + dilation_arg.ints.extend(dilations) + final_op = op + self.resolved_ops.add(op.name) - input_shapes = [] - for i in range(len(op.parents)): - input_shapes.append(op.parents[i].output_shape_map[op.layer.bottom[i]]) - output_shape = Shapes.concat_shape(input_shapes, axis_arg.i) - op.output_shape_map[op.layer.top[0]] = output_shape - self.add_output_shape(op_def, output_shape) - op_def.output.extend([op.name + ':0']) - self.net_def.op.extend([op_def]) - self.resolved_ops.add(op.name) - - def convert_eltwise(self, op): - op_def = self.CommonConvert(op, 'Eltwise') - param = op.layer.eltwise_param - type_arg = op_def.arg.add() - type_arg.name = 'type' - type_arg.i = param.operation - if len(param.coeff) > 0: - coeff_arg = op_def.arg.add() - coeff_arg.name = 'coeff' - coeff_arg.ints.extend(list(param.coeff)) - - output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]] - op.output_shape_map[op.layer.top[0]] = output_shape - self.add_output_shape(op_def, output_shape) - op_def.output.extend([op.name + ':0']) - self.net_def.op.extend([op_def]) - self.resolved_ops.add(op.name) - - def convert_slice(self, op): - op_def = self.CommonConvert(op, 'Slice') - if op.layer.HasField('slice_param'): - param = op.layer.slice_param - if param.HasField('axis') and param.axis != 1: - raise Exception('Mace do not support slice with axis ' + str(param.axis)) - if len(param.slice_point) > 0: - raise Exception('Mace do not support slice with slice_point') - - axis_arg = op_def.arg.add() - axis_arg.name = 
'axis' - axis_arg.i = 3 if self.device != 'neon' else 1 - - input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]] - num_outputs = len(op.layer.top) - input_channels = input_shape[axis_arg.i] - if (input_channels % num_outputs) != 0 or \ - (self.device == 'gpu' and ((input_channels / num_outputs) % 4 != 0)): - raise Exception('Mace do not support slice with input shape ' - + str(input_shape) + ' and number of output ' + str(num_outputs)) - input_format = 'NCHW' if self.device == 'neon' else 'NHWC' - output_shape = Shapes.slice_shape(input_shape, num_outputs, input_format) - for i in range(len(op.layer.top)): - op.output_shape_map[op.layer.top[i]] = output_shape - self.add_output_shape(op_def, output_shape) - op_def.output.extend([op.name + '_' + str(i) + ':0']) - self.net_def.op.extend([op_def]) - self.resolved_ops.add(op.name) - - def convert_normal_op(self, op): - op_def = self.CommonConvert(op, op.type) - output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]] - op.output_shape_map[op.layer.top[0]] = output_shape - self.add_output_shape(op_def, output_shape) - op_def.output.extend([op.name + ':0']) - self.net_def.op.extend([op_def]) - self.resolved_ops.add(op.name) - - def convert_reshape(self, op): - if self.device == 'neon': - op_def = self.CommonConvert(op, 'Reshape') - else: - op_def = self.CommonConvert(op, 'ReOrganize') - input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]] - output_shape = input_shape - shape_param = np.asarray(op.layer.reshape_param.shape.dim) - if self.device != 'neon': - shape_param = shape_param[[0, 3, 1, 2]] - for i in range(len(shape_param)): - if shape_param[i] != 0: - output_shape[i] = shape_param[i] - shape_arg = op_def.arg.add() - shape_arg.name = 'shape' - shape_arg.ints.extend(output_shape) - op.output_shape_map[op.layer.top[0]] = output_shape - self.add_output_shape(op_def, output_shape) - op_def.output.extend([op.name + ':0']) - self.net_def.op.extend([op_def]) - self.resolved_ops.add(op.name) - - def convert_proposal_op(self, op): - assert self.device == 'cpu' - op_def = self.CommonConvert(op, op.type) - if op.layer.HasField('proposal_param'): - proposal_param = op.layer.proposal_param - feat_stride_arg = op_def.arg.add() - feat_stride_arg.name = 'feat_stride' - feat_stride_arg.i = proposal_param.feat_stride - scales_arg = op_def.arg.add() - scales_arg.name = 'scales' - scales_arg.ints.extend(list(proposal_param.scales)) - ratios_arg = op_def.arg.add() - ratios_arg.name = 'ratios' - ratios_arg.floats.extend(list(proposal_param.ratios)) - output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]] - op.output_shape_map[op.layer.top[0]] = output_shape - self.add_output_shape(op_def, output_shape) - op_def.output.extend([op.name + ':0']) - self.net_def.op.extend([op_def]) - self.resolved_ops.add(op.name) - - def convert_psroi_align(self, op): - assert self.device == 'cpu' - op_def = self.CommonConvert(op, op.type) - if op.layer.HasField('psroi_align_param'): - psroi_align_param = op.layer.psroi_align_param - spatial_scale_arg = op_def.arg.add() - spatial_scale_arg.name = 'spatial_scale' - spatial_scale_arg.f = psroi_align_param.spatial_scale - output_dim_arg = op_def.arg.add() - output_dim_arg.name = 'output_dim' - output_dim_arg.i = psroi_align_param.output_dim - group_size_arg = op_def.arg.add() - group_size_arg.name = 'group_size' - group_size_arg.i = psroi_align_param.group_size - output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]] - op.output_shape_map[op.layer.top[0]] = output_shape - 
self.add_output_shape(op_def, output_shape) - op_def.output.extend([op.name + ':0']) - self.net_def.op.extend([op_def]) - self.resolved_ops.add(op.name) - - def replace_in_out_name(self, input_names, output_names): - in_names = set([input_name + ":0" for input_name in input_names]) - out_names = set([output_name + ":0" for output_name in output_names]) - for op in self.net_def.op: - for i in range(len(op.input)): - if op.input[i] in in_names: - op.input[i] = MACE_INPUT_NODE_NAME + '_' + op.input[i] - if op.input[i] in out_names: - op.input[i] = MACE_OUTPUT_NODE_NAME + '_' + op.input[i] - for i in range(len(op.output)): - if op.output[i] in in_names: - op.output[i] = MACE_INPUT_NODE_NAME + '_' + op.output[i] - if op.output[i] in out_names: - op.output[i] = MACE_OUTPUT_NODE_NAME + '_' + op.output[i] - - def add_input_op_shape(self, input_nodes, input_shapes): - assert len(input_nodes) == len(input_shapes) - for i in range(len(input_nodes)): - input_op = self.ops_map[input_nodes[i]] - input_shape = input_shapes[i] if self.device != 'neon' else \ - [input_shapes[i][0], input_shapes[i][3], input_shapes[i][1], input_shapes[i][2]] - if input_op.layer is not None: - input_op.output_shape_map[input_op.layer.top[0]] = input_shape - else: - input_op.output_shape_map[input_op.name] = input_shape - - def add_neon_input_transform(self, names): - for name in names: - new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0" - op_def = self.net_def.op.add() - op_def.name = name - op_def.type = 'Transpose' - op_def.input.extend([new_input_name]) - op_def.output.extend([name+':0']) - - dims_arg = op_def.arg.add() - dims_arg.name = 'dims' - dims_arg.ints.extend([0, 3, 1, 2]) # NHWC -> NCHW - - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - - def add_neon_output_transform(self, names): - for name in names: - output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0" - op_def = self.net_def.op.add() - op_def.name = output_name[:-2] - op_def.type = 'Transpose' - op_def.input.extend([name+':0']) - op_def.output.extend([output_name]) - - dims_arg = op_def.arg.add() - dims_arg.name = 'dims' - dims_arg.ints.extend([0, 2, 3, 1]) # NCHW -> NHWC - - def convert(self, input_nodes, input_shapes, output_nodes): - if self.device == 'gpu': - self.add_input_transform(input_nodes) - - if self.device == 'neon': - self.add_neon_input_transform(input_nodes) - - assert self.ops[0].type == 'Input' - self.add_input_op_shape(input_nodes, input_shapes) - - for op in self.ops: - if op.name in self.resolved_ops: - continue - if op.type == 'Input': + input_format = 'NCHW' if self.device == 'neon' else 'NHWC' + output_shape = Shapes.conv_pool_shape( + op.get_single_parent().output_shape_map[op.layer.bottom[0]], + weight_data.shape, paddings, strides, dilations, math.floor, + input_format) + op.output_shape_map[op.layer.top[0]] = output_shape + + if len(self.ops_map[final_op.name].children) == 1 and \ + self.ops_map[final_op.name].children[0].type \ + in activation_name_map: + activation_op = self.ops_map[final_op.name].children[0] + if not is_depthwise: + op_def.type = "FusedConv2D" + fused_act_arg = op_def.arg.add() + fused_act_arg.name = 'activation' + fused_act_arg.s = activation_name_map[activation_op.type] + final_op = activation_op + final_op.output_shape_map[final_op.layer.top[0]] = output_shape + self.resolved_ops.add(activation_op.name) + + op_def.output.extend([final_op.name + ':0']) + self.add_output_shape(op_def, output_shape) + self.net_def.op.extend([op_def]) + + def check_winograd_conv(self, op): + # TODO: 
support winograd conv on neon + if self.device == 'neon': + return False + param = op.layer.convolution_param + filter_shape = np.asarray(op.data[0].shape) + if self.device != 'neon': + filter_shape = filter_shape[[2, 3, 0, 1]] # OIHW -> HWOI + paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None) + + dilations = [1, 1] + if len(param.dilation) > 0: + if len(param.dilation) == 1: + dilations = [param.dilation[0], param.dilation[0]] + elif len(param.dilation) == 2: + dilations = [param.dilation[0], param.dilation[1]] + + input_format = 'NCHW' if self.device == 'neon' else 'NHWC' + output_shape = Shapes.conv_pool_shape( + op.get_single_parent().output_shape_map[op.layer.bottom[0]], + filter_shape, paddings, strides, dilations, math.floor, + input_format) + width = output_shape[0] * ((output_shape[1] + 1) / 2) * (( + output_shape[2] + 1) / 2) + if self.winograd and dilations[0] == 1 and \ + (dilations[0] == dilations[1]) and \ + (strides[0] == 1) and (strides[0] == strides[1]): + if self.device == 'gpu': + return filter_shape[0] == 3 and \ + (filter_shape[0] == filter_shape[1]) and \ + (16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \ + (16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \ + (width < OPENCL_IMAGE_MAX_SIZE) + elif self.device == 'neon': + return filter_shape[2] == 3 and ( + filter_shape[2] == filter_shape[3]) + return False + + def convert_winograd_conv(self, op): + # Add filter + weight_tensor_name = op.name + '_weight:0' + self.add_tensor(weight_tensor_name, op.data[0]) + + buffer_type = "WINOGRAD_FILTER" + filter_name = self.add_buffer_to_image(weight_tensor_name, buffer_type) + + param = op.layer.convolution_param + paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None) + + filter_shape = np.asarray(op.data[0].shape) + if self.device != 'neon': + filter_shape = filter_shape[[2, 3, 0, 1]] # OIHW -> HWOI + + input_format = 'NCHW' if self.device == 'neon' else 'NHWC' + output_shape = Shapes.conv_pool_shape( + op.get_single_parent().output_shape_map[op.layer.bottom[0]], + filter_shape, paddings, strides, [1, 1], math.floor, input_format) + + # Input transform + wt_op = mace_pb2.OperatorDef() + arg = wt_op.arg.add() + arg.name = 'T' + arg.i = self.dt + padding_arg = wt_op.arg.add() + padding_arg.name = 'padding_values' + padding_arg.ints.extend(paddings) + wt_op.name = op.name + '_input_transform' + wt_op.type = 'WinogradTransform' + wt_op.input.extend([name + ':0' for name in self.inputs_map[op.name]]) + wt_output_name = wt_op.name + ":0" + wt_op.output.extend([wt_output_name]) + wt_output_shape = mace_pb2.OutputShape() + if self.device != 'neon': + wt_output_width = output_shape[0] * (( + output_shape[1] + 1) / 2) * ((output_shape[2] + 1) / 2) + wt_output_shape.dims.extend( + [16, filter_shape[3], wt_output_width, 1]) + else: + wt_output_width = output_shape[0] * (( + output_shape[2] + 1) / 2) * ((output_shape[3] + 1) / 2) + wt_output_shape.dims.extend( + [16, filter_shape[1], wt_output_width, 1]) + wt_op.output_shape.extend([wt_output_shape]) + + # MatMul + matmul_op = mace_pb2.OperatorDef() + arg = matmul_op.arg.add() + arg.name = 'T' + arg.i = self.dt + matmul_op.name = op.name + '_matmul' + matmul_op.type = 'MatMul' + matmul_op.input.extend([filter_name, wt_output_name]) + matmul_output_name = matmul_op.name + ":0" + matmul_op.output.extend([matmul_output_name]) + matmul_output_shape = mace_pb2.OutputShape() + if self.device != 'neon': + matmul_output_shape.dims.extend( + [16, filter_shape[2], wt_output_width, 1]) + else: + 
matmul_output_shape.dims.extend( + [16, filter_shape[0], wt_output_width, 1]) + matmul_op.output_shape.extend([matmul_output_shape]) + + # Inverse transform + iwt_op = mace_pb2.OperatorDef() + arg = iwt_op.arg.add() + arg.name = 'T' + arg.i = self.dt + batch_arg = iwt_op.arg.add() + batch_arg.name = 'batch' + batch_arg.i = output_shape[0] + height_arg = iwt_op.arg.add() + height_arg.name = 'height' + height_arg.i = output_shape[ + 1] if self.device != 'neon' else output_shape[2] + width_arg = iwt_op.arg.add() + width_arg.name = 'width' + width_arg.i = output_shape[ + 2] if self.device != 'neon' else output_shape[3] + iwt_op.name = op.name + '_inverse_transform' + iwt_op.type = 'WinogradInverseTransform' + iwt_op.input.extend([matmul_output_name]) + + # Add Bias + if len(op.data) == 2: + bias_tensor_name = op.name + '_bias:0' + bias_data = op.data[1].reshape(-1) + self.add_tensor(bias_tensor_name, bias_data) + output_name = self.add_buffer_to_image(bias_tensor_name, + "ARGUMENT") + iwt_op.input.extend([output_name]) + + final_op = op + final_op.output_shape_map[final_op.layer.top[0]] = output_shape self.resolved_ops.add(op.name) - elif op.type == 'Convolution': - if self.check_winograd_conv(op): - self.convert_winograd_conv(op) + + if len(self.ops_map[final_op.name].children) == 1 and \ + self.ops_map[final_op.name].children[0].type \ + in activation_name_map: + activation_op = self.ops_map[final_op.name].children[0] + fused_act_arg = iwt_op.arg.add() + fused_act_arg.name = 'activation' + fused_act_arg.s = activation_name_map[activation_op.type] + final_op = activation_op + final_op.output_shape_map[final_op.layer.top[0]] = output_shape + self.resolved_ops.add(activation_op.name) + + iwt_op.output.extend([final_op.name + ':0']) + self.add_output_shape(iwt_op, output_shape) + self.net_def.op.extend([wt_op, matmul_op, iwt_op]) + + def convert_batchnorm(self, op): + if len(op.children) != 1 or op.children[0].type != 'Scale': + raise Exception('Now only support BatchNorm+Scale') + op_def = self.CommonConvert(op, 'FoldedBatchNorm') + scale_op = op.children[0] + + epsilon_value = op.layer.batch_norm_param.eps + if op.data[2][0] != 0: + mean_value = (1. / op.data[2][0]) * op.data[0] + var_value = (1. 
/ op.data[2][0]) * op.data[1] else: - self.convert_conv2d(op) - elif op.type == 'BatchNorm': - self.convert_batchnorm(op) - elif op.type == 'InnerProduct': - self.convert_inner_product(op) - elif op.type == 'Pooling': - self.convert_pooling(op) - elif op.type == 'PReLU': - self.convert_prelu(op) - elif op.type in ['ReLU', 'Sigmoid', 'TanH']: - self.convert_activation(op) - elif op.type == 'Add': - self.convert_add(op) - elif op.type == 'Concat': - self.convert_concat(op) - elif op.type == 'Eltwise': - self.convert_eltwise(op) - elif op.type == 'Slice': - self.convert_slice(op) - elif op.type == 'Reshape': - self.convert_reshape(op) - elif op.type == 'Proposal': - self.convert_proposal_op(op) - elif op.type == 'PSROIAlign': - self.convert_psroi_align(op) - elif op.type in ['Softmax']: - self.convert_normal_op(op) - else: - raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type)) - - if self.device == 'gpu': - self.add_output_transform(output_nodes) - - if self.device == 'cpu': - self.replace_in_out_name(input_nodes, output_nodes) - - if self.device == 'neon': - self.add_neon_output_transform(output_nodes) - - for op in self.ops: - if op.name not in self.resolved_ops: - print 'Unresolve Op: %s with type %s' % (op.name, op.type) - - -def convert_to_mace_pb(model_file, weight_file, input_node_str, input_shape_str, - output_node_str, data_type, device, winograd): - net_def = mace_pb2.NetDef() - dt = data_type_map[data_type] - - caffe_net = caffe_pb2.NetParameter() - with open(model_file, "r") as f: - google.protobuf.text_format.Merge(str(f.read()), caffe_net) - - weights = caffe_pb2.NetParameter() - with open(weight_file, "rb") as f: - weights.MergeFromString(f.read()) - - input_nodes = [x for x in input_node_str.split(',')] - input_shapes = [] - if input_shape_str != "": - input_shape_strs = [x for x in input_shape_str.split(':')] - for shape_str in input_shape_strs: - input_shapes.extend([[int(x) for x in shape_str.split(',')]]) - output_nodes = [x for x in output_node_str.split(',')] - assert len(input_nodes) == len(input_shapes) - - converter = CaffeConverter(caffe_net, weights, net_def, dt, device, winograd) - converter.convert(input_nodes, input_shapes, output_nodes) - print "PB Converted." - if device == 'gpu': - print "start optimize memory." - mem_optimizer = memory_optimizer.MemoryOptimizer(net_def) - mem_optimizer.optimize() - print "Memory optimization done." 
-
-  return net_def
+            raise RuntimeError('scalar is zero.')
+
+        gamma_value = scale_op.data[0]
+        beta_value = np.zeros_like(mean_value)
+        if len(scale_op.data) == 2:
+            beta_value = scale_op.data[1]
+
+        scale_value = ((
+            1.0 / np.vectorize(math.sqrt)(var_value + epsilon_value)) *
+                       gamma_value).reshape(-1)
+        offset_value = ((-mean_value * scale_value) + beta_value).reshape(-1)
+        input_names = [op.name + '_scale:0', op.name + '_offset:0']
+        self.add_tensor(input_names[0], scale_value)
+        self.add_tensor(input_names[1], offset_value)
+
+        if self.device == 'gpu':
+            for name in input_names:
+                output_name = self.add_buffer_to_image(name, "ARGUMENT")
+                op_def.input.extend([output_name])
+        else:
+            op_def.input.extend([name for name in input_names])
+
+        self.resolved_ops.add(op.name)
+        self.resolved_ops.add(scale_op.name)
+        final_op = scale_op
+
+        output_shape = op.get_single_parent().output_shape_map[op.layer.bottom[
+            0]]
+
+        if len(self.ops_map[final_op.name].children) == 1 \
+                and self.ops_map[final_op.name].children[0].type \
+                in activation_name_map:
+            activation_op = self.ops_map[final_op.name].children[0]
+            fused_act_arg = op_def.arg.add()
+            fused_act_arg.name = 'activation'
+            fused_act_arg.s = activation_name_map[activation_op.type]
+            final_op = activation_op
+            final_op.output_shape_map[final_op.layer.top[0]] = output_shape
+            self.resolved_ops.add(activation_op.name)
+
+        op_def.output.extend([final_op.name + ':0'])
+        self.add_output_shape(op_def, output_shape)
+        self.net_def.op.extend([op_def])
+
+    def convert_inner_product(self, op):
+        param = op.layer.inner_product_param
+        try:
+            if param.axis != 1 or param.transpose:
+                raise ValueError(
+                    'Do not support non-default axis and transpose '
+                    'case for inner product')
+        except AttributeError:
+            pass
+
+        op_def = self.CommonConvert(op, 'FC')
+        weight_tensor_name = op.name + '_weight:0'
+        if op.data[0].ndim not in [2, 4]:
+            raise ValueError('Unexpected weight ndim.')
+        if op.data[0].ndim == 4 and list(op.data[0].shape[:2]) != [1, 1]:
+            raise ValueError(
+                'Only support 4D weight with shape [1, 1, *, *]')
+        input_shape = op.get_single_parent().output_shape_map[op.layer.bottom[
+            0]]
+
+        weight_data = op.data[0].reshape(-1, op.data[0].shape[-1])
+        assert weight_data.shape[1] == (
+            input_shape[1] * input_shape[2] * input_shape[3])
+        if self.device != 'neon':
+            weight_data = weight_data.reshape(-1, input_shape[3],
+                                              input_shape[1], input_shape[2])
+            weight_data = weight_data.transpose((0, 2, 3, 1)).reshape(
+                weight_data.shape[0], -1)
+        self.add_tensor(weight_tensor_name, weight_data)
+        if self.device == 'gpu':
+            if (weight_data.shape[0] + 3) / 4 > OPENCL_IMAGE_MAX_SIZE and \
+                    (weight_data.shape[1] + 3) / 4 > OPENCL_IMAGE_MAX_SIZE:
+                raise Exception(
+                    'Mace gpu does not support FC with weight shape: ' +
+                    str(weight_data.shape))
+            if input_shape[3] % 4 == 0:
+                buffer_type = "WEIGHT_WIDTH"
+            else:
+                buffer_type = "WEIGHT_HEIGHT"
+                weight_type_arg = op_def.arg.add()
+                weight_type_arg.name = 'weight_type'
+                weight_type_arg.i = buffer_type_map['WEIGHT_HEIGHT']
+
+            if buffer_type == "WEIGHT_HEIGHT" and \
+                    (weight_data.shape[0] + 3) / 4 > OPENCL_IMAGE_MAX_SIZE:
+                raise Exception(
+                    'Mace gpu does not support FC with weight shape: ' +
+                    str(weight_data.shape))
+            output_name = self.add_buffer_to_image(weight_tensor_name,
+                                                   buffer_type)
+            op_def.input.extend([output_name])
+        else:
+            op_def.input.extend([weight_tensor_name])
+
+        # Add Bias
+        if len(op.data) == 2:
+            bias_tensor_name = op.name + '_bias:0'
+            bias_data = op.data[1].reshape(-1)
+
self.add_tensor(bias_tensor_name, bias_data) + if self.device == 'gpu': + output_name = self.add_buffer_to_image(bias_tensor_name, + "ARGUMENT") + op_def.input.extend([output_name]) + else: + op_def.input.extend([bias_tensor_name]) + + self.resolved_ops.add(op.name) + output_shape = Shapes.fully_connected_shape(input_shape, + weight_data.shape) + op.output_shape_map[op.layer.top[0]] = output_shape + final_op = op + + if len(self.ops_map[final_op.name].children) == 1 \ + and self.ops_map[final_op.name].children[0].type \ + in activation_name_map: + activation_op = self.ops_map[final_op.name].children[0] + fused_act_arg = op_def.arg.add() + fused_act_arg.name = 'activation' + fused_act_arg.s = activation_name_map[activation_op.type] + final_op = activation_op + final_op.output_shape_map[final_op.layer.top[0]] = output_shape + self.resolved_ops.add(activation_op.name) + + op_def.output.extend([final_op.name + ':0']) + self.add_output_shape(op_def, output_shape) + self.net_def.op.extend([op_def]) + + def convert_pooling(self, op): + op_def = self.CommonConvert(op, 'Pooling') + + param = op.layer.pooling_param + paddings, strides, kernels = self.add_stride_pad_kernel_arg( + param, op_def) + if param.pool == caffe_pb2.PoolingParameter.MAX: + pooling_type = "MaxPool" + elif param.pool == caffe_pb2.PoolingParameter.AVE: + pooling_type = "AvgPool" + pooling_type_arg = op_def.arg.add() + pooling_type_arg.name = 'pooling_type' + pooling_type_arg.i = pooling_type_mode[pooling_type] + + input_shape = op.get_single_parent().output_shape_map[op.layer.bottom[ + 0]] + if param.HasField('global_pooling') and param.global_pooling: + kernels = [input_shape[1], input_shape[2]] + + kernel_arg = op_def.arg.add() + kernel_arg.name = 'kernels' + kernel_arg.ints.extend(kernels) + + if self.device != 'neon': + filter_shape = [ + kernels[0], kernels[1], input_shape[3], input_shape[3] + ] + else: + filter_shape = [ + input_shape[1], input_shape[1], kernels[0], kernels[1] + ] + input_format = 'NCHW' if self.device == 'neon' else 'NHWC' + output_shape = Shapes.conv_pool_shape(input_shape, filter_shape, + paddings, strides, [1, 1], + math.ceil, input_format) + op.output_shape_map[op.layer.top[0]] = output_shape + + op_def.output.extend([op.name + ':0']) + self.add_output_shape(op_def, output_shape) + self.net_def.op.extend([op_def]) + self.resolved_ops.add(op.name) + + def convert_activation(self, op): + op_def = self.CommonConvert(op, 'Activation') + activation_arg = op_def.arg.add() + activation_arg.name = 'activation' + activation_arg.s = activation_name_map[op.type] + op_def.output.extend([op.name + ':0']) + output_shape = op.get_single_parent().output_shape_map[op.layer.bottom[ + 0]] + op.output_shape_map[op.layer.top[0]] = output_shape + self.add_output_shape(op_def, output_shape) + self.net_def.op.extend([op_def]) + self.resolved_ops.add(op.name) + + def convert_prelu(self, op): + op_def = self.CommonConvert(op, 'Activation') + activation_arg = op_def.arg.add() + activation_arg.name = 'activation' + activation_arg.s = 'PRELU' + alpha_tensor_name = op.name + '_alpha:0' + alpha_data = op.data[0].reshape(-1) + self.add_tensor(alpha_tensor_name, alpha_data) + if self.device == 'gpu': + output_name = self.add_buffer_to_image(alpha_tensor_name, + "ARGUMENT") + op_def.input.extend([output_name]) + else: + op_def.input.extend([alpha_tensor_name]) + op_def.output.extend([op.name + ':0']) + output_shape = op.get_single_parent().output_shape_map[op.layer.bottom[ + 0]] + op.output_shape_map[op.layer.top[0]] = output_shape + 
self.add_output_shape(op_def, output_shape)
+        self.net_def.op.extend([op_def])
+        self.resolved_ops.add(op.name)
+
+    def convert_add(self, op):
+        op_def = self.CommonConvert(op, 'AddN')
+        op_def.output.extend([op.name + ':0'])
+        output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
+        op.output_shape_map[op.layer.top[0]] = output_shape
+        self.add_output_shape(op_def, output_shape)
+        self.net_def.op.extend([op_def])
+        self.resolved_ops.add(op.name)
+
+    def convert_concat(self, op):
+        op_def = self.CommonConvert(op, 'Concat')
+        axis_arg = op_def.arg.add()
+        axis_arg.name = 'axis'
+        axis_arg.i = 3 if self.device != 'neon' else 1
+        try:
+            if op.layer.concat_param.HasField('axis'):
+                axis_arg.i = op.layer.concat_param.axis
+            elif op.layer.concat_param.HasField('concat_dim'):
+                axis_arg.i = op.layer.concat_param.concat_dim
+        except AttributeError:
+            pass
+
+        input_shapes = []
+        for i in range(len(op.parents)):
+            input_shapes.append(
+                op.parents[i].output_shape_map[op.layer.bottom[i]])
+        output_shape = Shapes.concat_shape(input_shapes, axis_arg.i)
+        op.output_shape_map[op.layer.top[0]] = output_shape
+        self.add_output_shape(op_def, output_shape)
+        op_def.output.extend([op.name + ':0'])
+        self.net_def.op.extend([op_def])
+        self.resolved_ops.add(op.name)
+
+    def convert_eltwise(self, op):
+        op_def = self.CommonConvert(op, 'Eltwise')
+        param = op.layer.eltwise_param
+        type_arg = op_def.arg.add()
+        type_arg.name = 'type'
+        type_arg.i = param.operation
+        if len(param.coeff) > 0:
+            coeff_arg = op_def.arg.add()
+            coeff_arg.name = 'coeff'
+            coeff_arg.ints.extend(list(param.coeff))
+
+        output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
+        op.output_shape_map[op.layer.top[0]] = output_shape
+        self.add_output_shape(op_def, output_shape)
+        op_def.output.extend([op.name + ':0'])
+        self.net_def.op.extend([op_def])
+        self.resolved_ops.add(op.name)
+
+    def convert_slice(self, op):
+        op_def = self.CommonConvert(op, 'Slice')
+        if op.layer.HasField('slice_param'):
+            param = op.layer.slice_param
+            if param.HasField('axis') and param.axis != 1:
+                raise Exception(
+                    'Mace does not support slice with axis ' +
+                    str(param.axis))
+            if len(param.slice_point) > 0:
+                raise Exception(
+                    'Mace does not support slice with slice_point')
+
+        axis_arg = op_def.arg.add()
+        axis_arg.name = 'axis'
+        axis_arg.i = 3 if self.device != 'neon' else 1
+
+        input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
+        num_outputs = len(op.layer.top)
+        input_channels = input_shape[axis_arg.i]
+        if (input_channels % num_outputs) != 0 or \
+                (self.device == 'gpu' and
+                 ((input_channels / num_outputs) % 4 != 0)):
+            raise Exception(
+                'Mace does not support slice with input shape ' +
+                str(input_shape) + ' and number of output ' +
+                str(num_outputs))
+        input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
+        output_shape = Shapes.slice_shape(input_shape, num_outputs,
+                                          input_format)
+        for i in range(len(op.layer.top)):
+            op.output_shape_map[op.layer.top[i]] = output_shape
+            self.add_output_shape(op_def, output_shape)
+            op_def.output.extend([op.name + '_' + str(i) + ':0'])
+        self.net_def.op.extend([op_def])
+        self.resolved_ops.add(op.name)
+
+    def convert_normal_op(self, op):
+        op_def = self.CommonConvert(op, op.type)
+        output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
+        op.output_shape_map[op.layer.top[0]] = output_shape
+        self.add_output_shape(op_def, output_shape)
+        op_def.output.extend([op.name + ':0'])
+        self.net_def.op.extend([op_def])
+        self.resolved_ops.add(op.name)
+
+    def convert_reshape(self, op):
+        if
self.device == 'neon': + op_def = self.CommonConvert(op, 'Reshape') + else: + op_def = self.CommonConvert(op, 'ReOrganize') + input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]] + output_shape = input_shape + shape_param = np.asarray(op.layer.reshape_param.shape.dim) + if self.device != 'neon': + shape_param = shape_param[[0, 3, 1, 2]] + for i in range(len(shape_param)): + if shape_param[i] != 0: + output_shape[i] = shape_param[i] + shape_arg = op_def.arg.add() + shape_arg.name = 'shape' + shape_arg.ints.extend(output_shape) + op.output_shape_map[op.layer.top[0]] = output_shape + self.add_output_shape(op_def, output_shape) + op_def.output.extend([op.name + ':0']) + self.net_def.op.extend([op_def]) + self.resolved_ops.add(op.name) + + def convert_proposal_op(self, op): + assert self.device == 'cpu' + op_def = self.CommonConvert(op, op.type) + if op.layer.HasField('proposal_param'): + proposal_param = op.layer.proposal_param + feat_stride_arg = op_def.arg.add() + feat_stride_arg.name = 'feat_stride' + feat_stride_arg.i = proposal_param.feat_stride + scales_arg = op_def.arg.add() + scales_arg.name = 'scales' + scales_arg.ints.extend(list(proposal_param.scales)) + ratios_arg = op_def.arg.add() + ratios_arg.name = 'ratios' + ratios_arg.floats.extend(list(proposal_param.ratios)) + output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]] + op.output_shape_map[op.layer.top[0]] = output_shape + self.add_output_shape(op_def, output_shape) + op_def.output.extend([op.name + ':0']) + self.net_def.op.extend([op_def]) + self.resolved_ops.add(op.name) + + def convert_psroi_align(self, op): + assert self.device == 'cpu' + op_def = self.CommonConvert(op, op.type) + if op.layer.HasField('psroi_align_param'): + psroi_align_param = op.layer.psroi_align_param + spatial_scale_arg = op_def.arg.add() + spatial_scale_arg.name = 'spatial_scale' + spatial_scale_arg.f = psroi_align_param.spatial_scale + output_dim_arg = op_def.arg.add() + output_dim_arg.name = 'output_dim' + output_dim_arg.i = psroi_align_param.output_dim + group_size_arg = op_def.arg.add() + group_size_arg.name = 'group_size' + group_size_arg.i = psroi_align_param.group_size + output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]] + op.output_shape_map[op.layer.top[0]] = output_shape + self.add_output_shape(op_def, output_shape) + op_def.output.extend([op.name + ':0']) + self.net_def.op.extend([op_def]) + self.resolved_ops.add(op.name) + + def replace_in_out_name(self, input_names, output_names): + in_names = set([input_name + ":0" for input_name in input_names]) + out_names = set([output_name + ":0" for output_name in output_names]) + for op in self.net_def.op: + for i in range(len(op.input)): + if op.input[i] in in_names: + op.input[i] = MACE_INPUT_NODE_NAME + '_' + op.input[i] + if op.input[i] in out_names: + op.input[i] = MACE_OUTPUT_NODE_NAME + '_' + op.input[i] + for i in range(len(op.output)): + if op.output[i] in in_names: + op.output[i] = MACE_INPUT_NODE_NAME + '_' + op.output[i] + if op.output[i] in out_names: + op.output[i] = MACE_OUTPUT_NODE_NAME + '_' + op.output[i] + + def add_input_op_shape(self, input_nodes, input_shapes): + assert len(input_nodes) == len(input_shapes) + for i in range(len(input_nodes)): + input_op = self.ops_map[input_nodes[i]] + input_shape = input_shapes[i] if self.device != 'neon' else \ + [input_shapes[i][0], input_shapes[i][3], + input_shapes[i][1], input_shapes[i][2]] + if input_op.layer is not None: + input_op.output_shape_map[input_op.layer.top[0]] = input_shape + else: + 
input_op.output_shape_map[input_op.name] = input_shape + + def add_neon_input_transform(self, names): + for name in names: + new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0" + op_def = self.net_def.op.add() + op_def.name = name + op_def.type = 'Transpose' + op_def.input.extend([new_input_name]) + op_def.output.extend([name + ':0']) + + dims_arg = op_def.arg.add() + dims_arg.name = 'dims' + dims_arg.ints.extend([0, 3, 1, 2]) # NHWC -> NCHW + + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + + def add_neon_output_transform(self, names): + for name in names: + output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0" + op_def = self.net_def.op.add() + op_def.name = output_name[:-2] + op_def.type = 'Transpose' + op_def.input.extend([name + ':0']) + op_def.output.extend([output_name]) + + dims_arg = op_def.arg.add() + dims_arg.name = 'dims' + dims_arg.ints.extend([0, 2, 3, 1]) # NCHW -> NHWC + + def convert(self, input_nodes, input_shapes, output_nodes): + if self.device == 'gpu': + self.add_input_transform(input_nodes) + + if self.device == 'neon': + self.add_neon_input_transform(input_nodes) + + assert self.ops[0].type == 'Input' + self.add_input_op_shape(input_nodes, input_shapes) + + for op in self.ops: + if op.name in self.resolved_ops: + continue + if op.type == 'Input': + self.resolved_ops.add(op.name) + elif op.type == 'Convolution': + if self.check_winograd_conv(op): + self.convert_winograd_conv(op) + else: + self.convert_conv2d(op) + elif op.type == 'BatchNorm': + self.convert_batchnorm(op) + elif op.type == 'InnerProduct': + self.convert_inner_product(op) + elif op.type == 'Pooling': + self.convert_pooling(op) + elif op.type == 'PReLU': + self.convert_prelu(op) + elif op.type in ['ReLU', 'Sigmoid', 'TanH']: + self.convert_activation(op) + elif op.type == 'Add': + self.convert_add(op) + elif op.type == 'Concat': + self.convert_concat(op) + elif op.type == 'Eltwise': + self.convert_eltwise(op) + elif op.type == 'Slice': + self.convert_slice(op) + elif op.type == 'Reshape': + self.convert_reshape(op) + elif op.type == 'Proposal': + self.convert_proposal_op(op) + elif op.type == 'PSROIAlign': + self.convert_psroi_align(op) + elif op.type in ['Softmax']: + self.convert_normal_op(op) + else: + raise Exception('Unknown Op: %s, type: %s' % (op.name, + op.type)) + + if self.device == 'gpu': + self.add_output_transform(output_nodes) + + if self.device == 'cpu': + self.replace_in_out_name(input_nodes, output_nodes) + + if self.device == 'neon': + self.add_neon_output_transform(output_nodes) + + for op in self.ops: + if op.name not in self.resolved_ops: + print 'Unresolve Op: %s with type %s' % (op.name, op.type) + + +def convert_to_mace_pb(model_file, weight_file, input_node_str, + input_shape_str, output_node_str, data_type, device, + winograd): + net_def = mace_pb2.NetDef() + dt = data_type_map[data_type] + + caffe_net = caffe_pb2.NetParameter() + with open(model_file, "r") as f: + google.protobuf.text_format.Merge(str(f.read()), caffe_net) + + weights = caffe_pb2.NetParameter() + with open(weight_file, "rb") as f: + weights.MergeFromString(f.read()) + + input_nodes = [x for x in input_node_str.split(',')] + input_shapes = [] + if input_shape_str != "": + input_shape_strs = [x for x in input_shape_str.split(':')] + for shape_str in input_shape_strs: + input_shapes.extend([[int(x) for x in shape_str.split(',')]]) + output_nodes = [x for x in output_node_str.split(',')] + assert len(input_nodes) == len(input_shapes) + converter = CaffeConverter(caffe_net, weights, net_def, 
dt, device, + winograd) + converter.convert(input_nodes, input_shapes, output_nodes) + print "PB Converted." + if device == 'gpu': + print "start optimize memory." + mem_optimizer = memory_optimizer.MemoryOptimizer(net_def) + mem_optimizer.optimize() + print "Memory optimization done." + + return net_def diff --git a/mace/python/tools/convert_util.py b/mace/python/tools/convert_util.py index 53b31969..574e1e58 100644 --- a/mace/python/tools/convert_util.py +++ b/mace/python/tools/convert_util.py @@ -26,4 +26,3 @@ def tf_dtype_2_mace_dtype(tf_dtype): if not mace_dtype: raise Exception("Not supported tensorflow dtype: " + tf_dtype) return mace_dtype - diff --git a/mace/python/tools/converter.py b/mace/python/tools/converter.py index ee4b942e..c4592d5a 100644 --- a/mace/python/tools/converter.py +++ b/mace/python/tools/converter.py @@ -4,176 +4,166 @@ import hashlib import os.path from mace.python.tools import source_converter_lib -# ./bazel-bin/mace/python/tools/tf_converter --model_file quantized_test.pb --output quantized_test_dsp.pb --runtime dsp --input_dim input_node,1,28,28,3 +# ./bazel-bin/mace/python/tools/tf_converter --model_file quantized_test.pb \ +# --output quantized_test_dsp.pb \ +# --runtime dsp \ +# --input_dim input_node,1,28,28,3 FLAGS = None + def file_checksum(fname): - hash_func = hashlib.sha256() - with open(fname, "rb") as f: - for chunk in iter(lambda: f.read(4096), b""): - hash_func.update(chunk) - return hash_func.hexdigest() + hash_func = hashlib.sha256() + with open(fname, "rb") as f: + for chunk in iter(lambda: f.read(4096), b""): + hash_func.update(chunk) + return hash_func.hexdigest() + def main(unused_args): - if not os.path.isfile(FLAGS.model_file): - print("Input graph file '" + FLAGS.model_file + "' does not exist!") - sys.exit(-1) - - model_checksum = file_checksum(FLAGS.model_file) - if FLAGS.model_checksum != "" and FLAGS.model_checksum != model_checksum: - print("Model checksum mismatch: %s != %s" % (model_checksum, FLAGS.model_checksum)) - sys.exit(-1) - - if FLAGS.platform == 'caffe': - if not os.path.isfile(FLAGS.weight_file): - print("Input weight file '" + FLAGS.weight_file + "' does not exist!") - sys.exit(-1) - - weight_checksum = file_checksum(FLAGS.weight_file) - if FLAGS.weight_checksum != "" and FLAGS.weight_checksum != weight_checksum: - print("Weight checksum mismatch: %s != %s" % (weight_checksum, FLAGS.weight_checksum)) - sys.exit(-1) - - if FLAGS.runtime == 'dsp': - print("DSP not support caffe model yet.") - sys.exit(-1) - - from mace.python.tools import caffe_converter_lib - output_graph_def = caffe_converter_lib.convert_to_mace_pb( - FLAGS.model_file, FLAGS.weight_file, FLAGS.input_node, FLAGS.input_shape, FLAGS.output_node, - FLAGS.data_type, FLAGS.runtime, FLAGS.winograd) - elif FLAGS.platform == 'tensorflow': - if FLAGS.runtime == 'dsp': - from mace.python.tools import tf_dsp_converter_lib - output_graph_def = tf_dsp_converter_lib.convert_to_mace_pb( - FLAGS.model_file, FLAGS.input_node, FLAGS.output_node, FLAGS.dsp_mode) + if not os.path.isfile(FLAGS.model_file): + print("Input graph file '" + FLAGS.model_file + "' does not exist!") + sys.exit(-1) + + model_checksum = file_checksum(FLAGS.model_file) + if FLAGS.model_checksum != "" and FLAGS.model_checksum != model_checksum: + print("Model checksum mismatch: %s != %s" % (model_checksum, + FLAGS.model_checksum)) + sys.exit(-1) + + if FLAGS.platform == 'caffe': + if not os.path.isfile(FLAGS.weight_file): + print("Input weight file '" + FLAGS.weight_file + + "' does not exist!") + 
sys.exit(-1) + + weight_checksum = file_checksum(FLAGS.weight_file) + if FLAGS.weight_checksum != "" and \ + FLAGS.weight_checksum != weight_checksum: + print("Weight checksum mismatch: %s != %s" % + (weight_checksum, FLAGS.weight_checksum)) + sys.exit(-1) + + if FLAGS.runtime == 'dsp': + print("DSP not support caffe model yet.") + sys.exit(-1) + + from mace.python.tools import caffe_converter_lib + output_graph_def = caffe_converter_lib.convert_to_mace_pb( + FLAGS.model_file, FLAGS.weight_file, FLAGS.input_node, + FLAGS.input_shape, FLAGS.output_node, FLAGS.data_type, + FLAGS.runtime, FLAGS.winograd) + elif FLAGS.platform == 'tensorflow': + if FLAGS.runtime == 'dsp': + from mace.python.tools import tf_dsp_converter_lib + output_graph_def = tf_dsp_converter_lib.convert_to_mace_pb( + FLAGS.model_file, FLAGS.input_node, FLAGS.output_node, + FLAGS.dsp_mode) + else: + from mace.python.tools import tf_converter_lib + output_graph_def = tf_converter_lib.convert_to_mace_pb( + FLAGS.model_file, FLAGS.input_node, FLAGS.input_shape, + FLAGS.output_node, FLAGS.data_type, FLAGS.runtime, + FLAGS.winograd) + + if FLAGS.output_type == 'source': + source_converter_lib.convert_to_source( + output_graph_def, model_checksum, FLAGS.template, FLAGS.obfuscate, + FLAGS.model_tag, FLAGS.output, FLAGS.runtime, + FLAGS.embed_model_data) else: - from mace.python.tools import tf_converter_lib - output_graph_def = tf_converter_lib.convert_to_mace_pb( - FLAGS.model_file, FLAGS.input_node, FLAGS.input_shape, FLAGS.output_node, - FLAGS.data_type, FLAGS.runtime, FLAGS.winograd) - - if FLAGS.output_type == 'source': - source_converter_lib.convert_to_source(output_graph_def, model_checksum, FLAGS.template, FLAGS.obfuscate, - FLAGS.model_tag, FLAGS.output, FLAGS.runtime, FLAGS.embed_model_data) - else: - with open(FLAGS.output, "wb") as f: - f.write(output_graph_def.SerializeToString()) - with open(FLAGS.output + '_txt', "wb") as f: - # output_graph_def.ClearField('tensors') - f.write(str(output_graph_def)) - print("Model conversion is completed.") + with open(FLAGS.output, "wb") as f: + f.write(output_graph_def.SerializeToString()) + with open(FLAGS.output + '_txt', "wb") as f: + # output_graph_def.ClearField('tensors') + f.write(str(output_graph_def)) + print("Model conversion is completed.") + def str2bool(v): - if v.lower() in ('yes', 'true', 't', 'y', '1'): - return True - elif v.lower() in ('no', 'false', 'f', 'n', '0'): - return False - else: - raise argparse.ArgumentTypeError('Boolean value expected.') + if v.lower() in ('yes', 'true', 't', 'y', '1'): + return True + elif v.lower() in ('no', 'false', 'f', 'n', '0'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + def parse_args(): - """Parses command line arguments.""" - parser = argparse.ArgumentParser() - parser.register("type", "bool", lambda v: v.lower() == "true") - parser.add_argument( - "--model_file", - type=str, - default="", - help="TensorFlow \'GraphDef\' file to load, Caffe prototxt file to load.") - parser.add_argument( - "--weight_file", - type=str, - default="", - help="Caffe data file to load.") - parser.add_argument( - "--model_checksum", - type=str, - default="", - help="Model file sha256 checksum") - parser.add_argument( - "--weight_checksum", - type=str, - default="", - help="Weight file sha256 checksum") - parser.add_argument( - "--output", - type=str, - default="", - help="File to save the output graph to.") - parser.add_argument( - "--runtime", - type=str, - default="cpu", - help="Runtime: cpu/gpu/dsp") - 
parser.add_argument(
-      "--input_node",
-      type=str,
-      default="input_node",
-      help="e.g., input_node")
-  parser.add_argument(
-      "--output_node",
-      type=str,
-      default="softmax",
-      help="e.g., softmax")
-  parser.add_argument(
-      "--data_type",
-      type=str,
-      default='DT_FLOAT',
-      help="e.g., DT_HALF/DT_FLOAT")
-  parser.add_argument(
-      "--output_type",
-      type=str,
-      default="pb",
-      help="output type: source/pb")
-  parser.add_argument(
-      "--template",
-      type=str,
-      default="",
-      help="template path")
-  parser.add_argument(
-      "--obfuscate",
-      type=str2bool,
-      nargs='?',
-      const=False,
-      default=False,
-      help="obfuscate model names")
-  parser.add_argument(
-      "--model_tag",
-      type=str,
-      default="",
-      help="model tag for generated function and namespace")
-  parser.add_argument(
-      "--winograd",
-      type=str2bool,
-      nargs='?',
-      const=False,
-      default=False,
-      help="open winograd convolution or not")
-  parser.add_argument(
-      "--dsp_mode",
-      type=int,
-      default=0,
-      help="dsp run mode, defalut=0")
-  parser.add_argument(
-      "--input_shape",
-      type=str,
-      default="",
-      help="input shape.")
-  parser.add_argument(
-      "--platform",
-      type=str,
-      default="tensorflow",
-      help="tensorflow/caffe")
-  parser.add_argument(
-      "--embed_model_data",
-      type=str2bool,
-      default=True,
-      help="input shape.")
-  return parser.parse_known_args()
+    """Parses command line arguments."""
+    parser = argparse.ArgumentParser()
+    parser.register("type", "bool", lambda v: v.lower() == "true")
+    parser.add_argument(
+        "--model_file",
+        type=str,
+        default="",
+        help="TensorFlow \'GraphDef\' file to load, "
+        "Caffe prototxt file to load.")
+    parser.add_argument(
+        "--weight_file", type=str, default="", help="Caffe data file to load.")
+    parser.add_argument(
+        "--model_checksum",
+        type=str,
+        default="",
+        help="Model file sha256 checksum")
+    parser.add_argument(
+        "--weight_checksum",
+        type=str,
+        default="",
+        help="Weight file sha256 checksum")
+    parser.add_argument(
+        "--output",
+        type=str,
+        default="",
+        help="File to save the output graph to.")
+    parser.add_argument(
+        "--runtime", type=str, default="cpu", help="Runtime: cpu/gpu/dsp")
+    parser.add_argument(
+        "--input_node",
+        type=str,
+        default="input_node",
+        help="e.g., input_node")
+    parser.add_argument(
+        "--output_node", type=str, default="softmax", help="e.g., softmax")
+    parser.add_argument(
+        "--data_type",
+        type=str,
+        default='DT_FLOAT',
+        help="e.g., DT_HALF/DT_FLOAT")
+    parser.add_argument(
+        "--output_type", type=str, default="pb", help="output type: source/pb")
+    parser.add_argument(
+        "--template", type=str, default="", help="template path")
+    parser.add_argument(
+        "--obfuscate",
+        type=str2bool,
+        nargs='?',
+        const=False,
+        default=False,
+        help="obfuscate model names")
+    parser.add_argument(
+        "--model_tag",
+        type=str,
+        default="",
+        help="model tag for generated function and namespace")
+    parser.add_argument(
+        "--winograd",
+        type=str2bool,
+        nargs='?',
+        const=False,
+        default=False,
+        help="open winograd convolution or not")
+    parser.add_argument(
+        "--dsp_mode", type=int, default=0, help="dsp run mode, default=0")
+    parser.add_argument(
+        "--input_shape", type=str, default="", help="input shape.")
+    parser.add_argument(
+        "--platform", type=str, default="tensorflow", help="tensorflow/caffe")
+    parser.add_argument(
+        "--embed_model_data",
+        type=str2bool,
+        default=True,
+        help="whether to embed model data in the generated source.")
+    return parser.parse_known_args()
 if __name__ == '__main__':
-  FLAGS, unparsed = parse_args()
-  main(unused_args=[sys.argv[0]] + unparsed)
+    FLAGS, unparsed =
parse_args() + main(unused_args=[sys.argv[0]] + unparsed) diff --git a/mace/python/tools/dsp_ops.py b/mace/python/tools/dsp_ops.py index 8589879b..72a2f497 100644 --- a/mace/python/tools/dsp_ops.py +++ b/mace/python/tools/dsp_ops.py @@ -1,65 +1,63 @@ - class DspOps(object): - def __init__(self): - self.dsp_ops = { - 'INPUT': 'INPUT"', - 'OUTPUT': 'OUTPUT', - 'NoOp': 'Nop', - 'FLATTEN': 'Flatten', - 'Identity': 'Nop', - 'Placeholder': 'INPUT', - 'Const': 'Const', - 'QuantizedConv2D': 'QuantizedConv2d_8x8to32', - 'QuantizedMatMul': 'QuantizedMatMul_8x8to32', - 'QuantizeDownAndShrinkRange': 'QuantizeDownAndShrinkRange_32to8', - 'QuantizedRelu': 'QuantizedRelu_8', - 'QuantizedReluX': 'QuantizedReluX_8', - 'QuantizedMaxPool': 'QuantizedMaxPool_8', - 'QuantizedAvgPool': 'QuantizedAvgPool_8', - 'QuantizedConcat': 'QuantizedConcat_8', - 'QuantizedBiasAdd': 'QuantizedBiasAdd_8p8to32', - 'QuantizedResizeBilinear' : 'QuantizedResizeBilinear_8', - 'QuantizedSpaceToBatchND': 'QuantizedSpaceToBatchND_8', - 'QuantizedBatchToSpaceND': 'QuantizedBatchToSpaceND_8', - 'QuantizedSoftmax': 'QuantizedSoftmax_8', - 'QuantizedTanh': 'QuantizedTanh_8', - 'Min': 'Min_f', - 'Max': 'Max_f', - 'QuantizeV2': 'Quantize', - 'Dequantize': 'Dequantize', - 'Softmax': 'Softmax_f', - 'Reshape': 'Reshape', - 'QuantizedReshape': 'QuantizedReshape', - 'Sigmoid': 'Sigmoid_f', - 'Slice': 'Slice_f', - 'Add': 'Add_f', - 'Mul': 'Mul_f', - 'Requantize': 'Requantize_32to8', - 'RequantizationRange': 'RequantizationRange_32', - 'Sub': 'Sub_f', - 'Pack': 'Pack_int32', - 'StridedSlice': 'StridedSlice_f', - 'ExpandDims': 'ExpandDims_f', - 'QuantizedMul': 'QuantizedMul_8x8to32', - 'QuantizedAdd': 'QuantizedAdd_8p8to32', - 'Pad': 'Pad_f', - 'SpaceToBatchND': 'SpaceToBatchND_f', - 'BatchToSpaceND': 'BatchToSpaceND_f', - 'ResizeBilinear': 'ResizeBilinear_f', - 'ConcatV2': 'ConcatV2_f', - 'Conv2DBackpropInput': 'Deconv_f', - 'Tanh': 'Tanh_f', - 'Split': 'Split_f', - 'Transpose': 'Transpose_f', - 'Concat': 'Concat_f', - 'AddN': 'AddN_f', - } - def has_op(self, tf_op): - return tf_op in self.dsp_ops - - def map_nn_op(self, tf_op): - if tf_op not in self.dsp_ops: - raise Exception('Could not map nn op for: ', tf_op) - return self.dsp_ops[tf_op] + def __init__(self): + self.dsp_ops = { + 'INPUT': 'INPUT"', + 'OUTPUT': 'OUTPUT', + 'NoOp': 'Nop', + 'FLATTEN': 'Flatten', + 'Identity': 'Nop', + 'Placeholder': 'INPUT', + 'Const': 'Const', + 'QuantizedConv2D': 'QuantizedConv2d_8x8to32', + 'QuantizedMatMul': 'QuantizedMatMul_8x8to32', + 'QuantizeDownAndShrinkRange': 'QuantizeDownAndShrinkRange_32to8', + 'QuantizedRelu': 'QuantizedRelu_8', + 'QuantizedReluX': 'QuantizedReluX_8', + 'QuantizedMaxPool': 'QuantizedMaxPool_8', + 'QuantizedAvgPool': 'QuantizedAvgPool_8', + 'QuantizedConcat': 'QuantizedConcat_8', + 'QuantizedBiasAdd': 'QuantizedBiasAdd_8p8to32', + 'QuantizedResizeBilinear': 'QuantizedResizeBilinear_8', + 'QuantizedSpaceToBatchND': 'QuantizedSpaceToBatchND_8', + 'QuantizedBatchToSpaceND': 'QuantizedBatchToSpaceND_8', + 'QuantizedSoftmax': 'QuantizedSoftmax_8', + 'QuantizedTanh': 'QuantizedTanh_8', + 'Min': 'Min_f', + 'Max': 'Max_f', + 'QuantizeV2': 'Quantize', + 'Dequantize': 'Dequantize', + 'Softmax': 'Softmax_f', + 'Reshape': 'Reshape', + 'QuantizedReshape': 'QuantizedReshape', + 'Sigmoid': 'Sigmoid_f', + 'Slice': 'Slice_f', + 'Add': 'Add_f', + 'Mul': 'Mul_f', + 'Requantize': 'Requantize_32to8', + 'RequantizationRange': 'RequantizationRange_32', + 'Sub': 'Sub_f', + 'Pack': 'Pack_int32', + 'StridedSlice': 'StridedSlice_f', + 'ExpandDims': 
'ExpandDims_f', + 'QuantizedMul': 'QuantizedMul_8x8to32', + 'QuantizedAdd': 'QuantizedAdd_8p8to32', + 'Pad': 'Pad_f', + 'SpaceToBatchND': 'SpaceToBatchND_f', + 'BatchToSpaceND': 'BatchToSpaceND_f', + 'ResizeBilinear': 'ResizeBilinear_f', + 'ConcatV2': 'ConcatV2_f', + 'Conv2DBackpropInput': 'Deconv_f', + 'Tanh': 'Tanh_f', + 'Split': 'Split_f', + 'Transpose': 'Transpose_f', + 'Concat': 'Concat_f', + 'AddN': 'AddN_f', + } + def has_op(self, tf_op): + return tf_op in self.dsp_ops + def map_nn_op(self, tf_op): + if tf_op not in self.dsp_ops: + raise Exception('Could not map nn op for: ', tf_op) + return self.dsp_ops[tf_op] diff --git a/mace/python/tools/encrypt_opencl_codegen.py b/mace/python/tools/encrypt_opencl_codegen.py index ad3ab8d4..b541aef5 100644 --- a/mace/python/tools/encrypt_opencl_codegen.py +++ b/mace/python/tools/encrypt_opencl_codegen.py @@ -4,77 +4,81 @@ import sys import jinja2 -# python encrypt_opencl_codegen.py --cl_kernel_dir=./mace/kernels/opencl/cl/ \ +# python encrypt_opencl_codegen.py --cl_kernel_dir=./mace/kernels/opencl/cl/ \ # --output_path=./mace/codegen/opencl_encrypt/opencl_encrypted_program.cc FLAGS = None encrypt_lookup_table = "Xiaomi-AI-Platform-Mace" + def encrypt_code(code_str): - encrypted_arr = [] - for i in range(len(code_str)): - encrypted_char = hex(ord(code_str[i]) ^ ord(encrypt_lookup_table[i % len(encrypt_lookup_table)])) - encrypted_arr.append(encrypted_char) - return encrypted_arr + encrypted_arr = [] + for i in range(len(code_str)): + encrypted_char = hex( + ord(code_str[i]) ^ ord( + encrypt_lookup_table[i % len(encrypt_lookup_table)])) + encrypted_arr.append(encrypted_char) + return encrypted_arr def main(unused_args): - if not os.path.exists(FLAGS.cl_kernel_dir): - print("Input cl_kernel_dir " + FLAGS.cl_kernel_dir + " doesn't exist!") - - header_code = "" - for file_name in os.listdir(FLAGS.cl_kernel_dir): - file_path = os.path.join(FLAGS.cl_kernel_dir, file_name) - if file_path[-2:] == ".h": - f = open(file_path, "r") - header_code += f.read() - - encrypted_code_maps = {} - for file_name in os.listdir(FLAGS.cl_kernel_dir): - file_path = os.path.join(FLAGS.cl_kernel_dir, file_name) - if file_path[-3:] == ".cl": - f = open(file_path, "r") - code_str = "" - for line in f.readlines(): - if "#include " in line: - code_str += header_code - else: - code_str += line - encrypted_code_arr = encrypt_code(code_str) - encrypted_code_maps[file_name[:-3]] = encrypted_code_arr - - env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0])) - cpp_cl_encrypted_kernel = env.get_template('str2vec_maps.cc.jinja2').render( - maps=encrypted_code_maps, - data_type='unsigned char', - variable_name='kEncryptedProgramMap') - - if os.path.isfile(FLAGS.output_path): - os.remove(FLAGS.output_path) - w_file = open(FLAGS.output_path, "w") - w_file.write(cpp_cl_encrypted_kernel) - w_file.close() - - print("Generate encrypted opencl source done!") + if not os.path.exists(FLAGS.cl_kernel_dir): + print("Input cl_kernel_dir " + FLAGS.cl_kernel_dir + " doesn't exist!") + + header_code = "" + for file_name in os.listdir(FLAGS.cl_kernel_dir): + file_path = os.path.join(FLAGS.cl_kernel_dir, file_name) + if file_path[-2:] == ".h": + f = open(file_path, "r") + header_code += f.read() + + encrypted_code_maps = {} + for file_name in os.listdir(FLAGS.cl_kernel_dir): + file_path = os.path.join(FLAGS.cl_kernel_dir, file_name) + if file_path[-3:] == ".cl": + f = open(file_path, "r") + code_str = "" + for line in f.readlines(): + if "#include " in line: + code_str += header_code + 
else: + code_str += line + encrypted_code_arr = encrypt_code(code_str) + encrypted_code_maps[file_name[:-3]] = encrypted_code_arr + + env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0])) + cpp_cl_encrypted_kernel = env.get_template( + 'str2vec_maps.cc.jinja2').render( + maps=encrypted_code_maps, + data_type='unsigned char', + variable_name='kEncryptedProgramMap') + + if os.path.isfile(FLAGS.output_path): + os.remove(FLAGS.output_path) + w_file = open(FLAGS.output_path, "w") + w_file.write(cpp_cl_encrypted_kernel) + w_file.close() + + print("Generate encrypted opencl source done!") def parse_args(): - """Parses command line arguments.""" - parser = argparse.ArgumentParser() - parser.add_argument( - "--cl_kernel_dir", - type=str, - default="./mace/kernels/opencl/cl/", - help="The cl kernels directory.") - parser.add_argument( - "--output_path", - type=str, - default="./mace/examples/codegen/opencl/opencl_encrypted_program.cc", - help="The path of encrypted opencl kernels.") - return parser.parse_known_args() + """Parses command line arguments.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--cl_kernel_dir", + type=str, + default="./mace/kernels/opencl/cl/", + help="The cl kernels directory.") + parser.add_argument( + "--output_path", + type=str, + default="./mace/examples/codegen/opencl/opencl_encrypted_program.cc", + help="The path of encrypted opencl kernels.") + return parser.parse_known_args() if __name__ == '__main__': - FLAGS, unparsed = parse_args() - main(unused_args=[sys.argv[0]] + unparsed) + FLAGS, unparsed = parse_args() + main(unused_args=[sys.argv[0]] + unparsed) diff --git a/mace/python/tools/graph_util.py b/mace/python/tools/graph_util.py index 61f7e8bc..29072ae2 100644 --- a/mace/python/tools/graph_util.py +++ b/mace/python/tools/graph_util.py @@ -2,18 +2,21 @@ import tensorflow as tf from mace.proto import mace_pb2 from collections import OrderedDict + def sort_tf_node(node, nodes_map, ordered_nodes_map): if node.name not in ordered_nodes_map: for input_tensor_name in node.input: input_node_name = input_tensor_name.split(':')[ 0] if ':' in input_tensor_name else input_tensor_name - if input_node_name not in nodes_map or input_node_name in ordered_nodes_map: + if input_node_name not in nodes_map or \ + input_node_name in ordered_nodes_map: continue input_node = nodes_map[input_node_name] sort_tf_node(input_node, nodes_map, ordered_nodes_map) ordered_nodes_map[node.name] = node + def sort_tf_graph(graph_def): nodes_map = {} ordered_nodes_map = OrderedDict() @@ -31,13 +34,15 @@ def sort_mace_node(node, nodes_map, ordered_nodes_map): for input_tensor_name in node.input: input_node_name = input_tensor_name.split(':')[ 0] if ':' in input_tensor_name else input_tensor_name - if input_node_name not in nodes_map or input_node_name in ordered_nodes_map: + if input_node_name not in nodes_map or \ + input_node_name in ordered_nodes_map: continue input_node = nodes_map[input_node_name] sort_mace_node(input_node, nodes_map, ordered_nodes_map) ordered_nodes_map[node.name] = node + def sort_mace_graph(graph_def, output_name): nodes_map = {} ordered_nodes_map = OrderedDict() diff --git a/mace/python/tools/memory_optimizer.py b/mace/python/tools/memory_optimizer.py index e632a22a..da92448d 100644 --- a/mace/python/tools/memory_optimizer.py +++ b/mace/python/tools/memory_optimizer.py @@ -2,120 +2,131 @@ import sys import operator from mace.proto import mace_pb2 + class MemoryOptimizer(object): - def __init__(self, net_def): - self.net_def = net_def - 
self.idle_mem = set() - self.op_mem = {} # op_name->mem_id - self.mem_block = {} # mem_id->[x, y] - self.total_mem_count = 0 - self.ref_counter = {} - - consumers = {} - for op in net_def.op: - if self.is_buffer_image_op(op): - continue - for ipt in op.input: - if ipt not in consumers: - consumers[ipt] = [] - consumers[ipt].append(op) - # only ref op's output tensor - for op in net_def.op: - if self.is_buffer_image_op(op): - continue - for output in op.output: - tensor_name = output - if tensor_name in consumers: - self.ref_counter[tensor_name] = len(consumers[tensor_name]) + def __init__(self, net_def): + self.net_def = net_def + self.idle_mem = set() + self.op_mem = {} # op_name->mem_id + self.mem_block = {} # mem_id->[x, y] + self.total_mem_count = 0 + self.ref_counter = {} + + consumers = {} + for op in net_def.op: + if self.is_buffer_image_op(op): + continue + for ipt in op.input: + if ipt not in consumers: + consumers[ipt] = [] + consumers[ipt].append(op) + # only ref op's output tensor + for op in net_def.op: + if self.is_buffer_image_op(op): + continue + for output in op.output: + tensor_name = output + if tensor_name in consumers: + self.ref_counter[tensor_name] = len(consumers[tensor_name]) + else: + self.ref_counter[tensor_name] = 0 + + def is_buffer_image_op(self, op): + return op.type == 'BufferToImage' or op.type == 'ImageToBuffer' + + def get_mem_size(self, op_type, output_shape): + mem_size = [0, 0] + if op_type == 'WinogradTransform' or op_type == 'MatMul': + mem_size[0] = output_shape[2] * output_shape[3] + mem_size[1] = output_shape[0] * int((output_shape[1] + 3) / 4) else: - self.ref_counter[tensor_name] = 0 - - def is_buffer_image_op(self, op): - return op.type == 'BufferToImage' or op.type == 'ImageToBuffer' - - def get_mem_size(self, op_type, output_shape): - mem_size = [0, 0] - if op_type == 'WinogradTransform' or op_type == 'MatMul': - mem_size[0] = output_shape[2] * output_shape[3] - mem_size[1] = output_shape[0] * int((output_shape[1]+3)/4) - else: - mem_size[0] = output_shape[2] * int((output_shape[3]+3)/4) - mem_size[1] = output_shape[0] * output_shape[1] - return mem_size - - def mem_area(self, memory_size): - return memory_size[0] * memory_size[1] - - def optimize(self): - for op in self.net_def.op: - if self.is_buffer_image_op(op): - continue - if not op.output_shape: - print('WARNING: There is no output shape information to do memory optimization.') - return - if len(op.output_shape) != len(op.output): - print('WARNING: the number of output shape is not equal to the number of output.') - return - for i in range(len(op.output)): - op_mem_size = self.get_mem_size(op.type, op.output_shape[i].dims) - mem_id = -1 - if len(self.idle_mem) > 0: - best_mem_candidate_id = -1 - best_mem_candidate_delta_area = sys.maxint - best_mem_candidate_shape = [] - for mid in self.idle_mem: - reuse_mem_size = self.mem_block[mid] - resize_mem_size = [max(reuse_mem_size[0], op_mem_size[0]), max(reuse_mem_size[1], op_mem_size[1])] - delta_mem_area = self.mem_area(resize_mem_size) - self.mem_area(reuse_mem_size) - if delta_mem_area < best_mem_candidate_delta_area: - best_mem_candidate_id = mid - best_mem_candidate_delta_area = delta_mem_area - best_mem_candidate_shape = resize_mem_size - - if best_mem_candidate_delta_area <= self.mem_area(op_mem_size): - # reuse - self.mem_block[best_mem_candidate_id] = best_mem_candidate_shape - mem_id = best_mem_candidate_id - self.idle_mem.remove(mem_id) - - if mem_id == -1: - mem_id = self.total_mem_count - self.total_mem_count += 1 - 
self.mem_block[mem_id] = op_mem_size - - op.mem_id.extend([mem_id]) - self.op_mem[op.output[i]] = mem_id - - # de-ref input tensor mem - for ipt in op.input: - if ipt in self.ref_counter: - self.ref_counter[ipt] -= 1 - if self.ref_counter[ipt] == 0: - self.idle_mem.add(self.op_mem[ipt]) - elif self.ref_counter[ipt] < 0: - raise Exception('ref count is less than 0') - - for mem in self.mem_block: - arena = self.net_def.mem_arena - block = arena.mem_block.add() - block.mem_id = mem - block.x = self.mem_block[mem][0] - block.y = self.mem_block[mem][1] - - print('total op: %d', len(self.net_def.op)) - origin_mem_size = 0 - optimized_mem_size = 0 - for op in self.net_def.op: - if self.is_buffer_image_op(op): - continue - origin_mem_size += reduce(operator.mul, op.output_shape[0].dims, 1) - for mem in self.mem_block: - print mem, self.mem_block[mem] - optimized_mem_size += reduce(operator.mul, self.mem_block[mem], 4) - - print('origin mem: %d, optimized mem: %d', origin_mem_size, optimized_mem_size) + mem_size[0] = output_shape[2] * int((output_shape[3] + 3) / 4) + mem_size[1] = output_shape[0] * output_shape[1] + return mem_size + + def mem_area(self, memory_size): + return memory_size[0] * memory_size[1] + + def optimize(self): + for op in self.net_def.op: + if self.is_buffer_image_op(op): + continue + if not op.output_shape: + print('WARNING: There is no output shape information to ' + 'do memory optimization.') + return + if len(op.output_shape) != len(op.output): + print('WARNING: the number of output shape is not equal to ' + 'the number of output.') + return + for i in range(len(op.output)): + op_mem_size = self.get_mem_size(op.type, + op.output_shape[i].dims) + mem_id = -1 + if len(self.idle_mem) > 0: + best_mem_candidate_id = -1 + best_mem_candidate_delta_area = sys.maxint + best_mem_candidate_shape = [] + for mid in self.idle_mem: + reuse_mem_size = self.mem_block[mid] + resize_mem_size = [ + max(reuse_mem_size[0], op_mem_size[0]), + max(reuse_mem_size[1], op_mem_size[1]) + ] + delta_mem_area = self.mem_area( + resize_mem_size) - self.mem_area(reuse_mem_size) + if delta_mem_area < best_mem_candidate_delta_area: + best_mem_candidate_id = mid + best_mem_candidate_delta_area = delta_mem_area + best_mem_candidate_shape = resize_mem_size + + if best_mem_candidate_delta_area <= self.mem_area( + op_mem_size): + # reuse + self.mem_block[ + best_mem_candidate_id] = best_mem_candidate_shape + mem_id = best_mem_candidate_id + self.idle_mem.remove(mem_id) + + if mem_id == -1: + mem_id = self.total_mem_count + self.total_mem_count += 1 + self.mem_block[mem_id] = op_mem_size + + op.mem_id.extend([mem_id]) + self.op_mem[op.output[i]] = mem_id + + # de-ref input tensor mem + for ipt in op.input: + if ipt in self.ref_counter: + self.ref_counter[ipt] -= 1 + if self.ref_counter[ipt] == 0: + self.idle_mem.add(self.op_mem[ipt]) + elif self.ref_counter[ipt] < 0: + raise Exception('ref count is less than 0') + + for mem in self.mem_block: + arena = self.net_def.mem_arena + block = arena.mem_block.add() + block.mem_id = mem + block.x = self.mem_block[mem][0] + block.y = self.mem_block[mem][1] + + print('total op: %d', len(self.net_def.op)) + origin_mem_size = 0 + optimized_mem_size = 0 + for op in self.net_def.op: + if self.is_buffer_image_op(op): + continue + origin_mem_size += reduce(operator.mul, op.output_shape[0].dims, 1) + for mem in self.mem_block: + print mem, self.mem_block[mem] + optimized_mem_size += reduce(operator.mul, self.mem_block[mem], 4) + + print('origin mem: %d, optimized mem: %d', 
origin_mem_size, + optimized_mem_size) def optimize_memory(net_def): - mem_optimizer = MemoryOptimizer(net_def) - mem_optimizer.optimize() + mem_optimizer = MemoryOptimizer(net_def) + mem_optimizer.optimize() diff --git a/mace/python/tools/opencl_codegen.py b/mace/python/tools/opencl_codegen.py index 96f4c6a6..061dd25b 100644 --- a/mace/python/tools/opencl_codegen.py +++ b/mace/python/tools/opencl_codegen.py @@ -14,86 +14,89 @@ FLAGS = None def generate_cpp_source(): - maps = {} - platform_info = '' - binary_dirs = FLAGS.cl_binary_dirs.strip().split(",") - for binary_dir in binary_dirs: - binary_path = os.path.join(binary_dir, FLAGS.built_kernel_file_name) - if not os.path.exists(binary_path): - continue - - print 'generate opencl code from', binary_path - with open(binary_path, "rb") as f: - binary_array = np.fromfile(f, dtype=np.uint8) - - idx = 0 - size, = struct.unpack("Q", binary_array[idx:idx+8]) - idx += 8 - for _ in xrange(size): - key_size, = struct.unpack("i", binary_array[idx:idx+4]) - idx += 4 - key, = struct.unpack(str(key_size) + "s", binary_array[idx:idx+key_size]) - idx += key_size - value_size, = struct.unpack("i", binary_array[idx:idx+4]) - idx += 4 - maps[key] = [] - value = struct.unpack(str(value_size) + "B", - binary_array[idx:idx+value_size]) - idx += value_size - for ele in value: - maps[key].append(hex(ele)) - - cl_platform_info_path = os.path.join(binary_dir, FLAGS.platform_info_file_name) - with open(cl_platform_info_path, 'r') as f: - curr_platform_info = f.read() - if platform_info != "": - assert(curr_platform_info == platform_info) - platform_info = curr_platform_info - - env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0])) - return env.get_template('opencl_compiled_kernel.cc.jinja2').render( - maps = maps, - data_type = 'unsigned char', - variable_name = 'kCompiledProgramMap', - platform_info = platform_info, - ) + maps = {} + platform_info = '' + binary_dirs = FLAGS.cl_binary_dirs.strip().split(",") + for binary_dir in binary_dirs: + binary_path = os.path.join(binary_dir, FLAGS.built_kernel_file_name) + if not os.path.exists(binary_path): + continue + + print 'generate opencl code from', binary_path + with open(binary_path, "rb") as f: + binary_array = np.fromfile(f, dtype=np.uint8) + + idx = 0 + size, = struct.unpack("Q", binary_array[idx:idx + 8]) + idx += 8 + for _ in xrange(size): + key_size, = struct.unpack("i", binary_array[idx:idx + 4]) + idx += 4 + key, = struct.unpack( + str(key_size) + "s", binary_array[idx:idx + key_size]) + idx += key_size + value_size, = struct.unpack("i", binary_array[idx:idx + 4]) + idx += 4 + maps[key] = [] + value = struct.unpack( + str(value_size) + "B", binary_array[idx:idx + value_size]) + idx += value_size + for ele in value: + maps[key].append(hex(ele)) + + cl_platform_info_path = os.path.join(binary_dir, + FLAGS.platform_info_file_name) + with open(cl_platform_info_path, 'r') as f: + curr_platform_info = f.read() + if platform_info != "": + assert (curr_platform_info == platform_info) + platform_info = curr_platform_info + + env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0])) + return env.get_template('opencl_compiled_kernel.cc.jinja2').render( + maps=maps, + data_type='unsigned char', + variable_name='kCompiledProgramMap', + platform_info=platform_info, + ) + def main(unused_args): - cpp_cl_binary_source = generate_cpp_source() - if os.path.isfile(FLAGS.output_path): - os.remove(FLAGS.output_path) - w_file = open(FLAGS.output_path, "w") - w_file.write(cpp_cl_binary_source) - 
w_file.close() + cpp_cl_binary_source = generate_cpp_source() + if os.path.isfile(FLAGS.output_path): + os.remove(FLAGS.output_path) + w_file = open(FLAGS.output_path, "w") + w_file.write(cpp_cl_binary_source) + w_file.close() def parse_args(): - """Parses command line arguments.""" - parser = argparse.ArgumentParser() - parser.add_argument( - "--cl_binary_dirs", - type=str, - default="", - help="The cl binaries directories.") - parser.add_argument( - "--built_kernel_file_name", - type=str, - default="", - help="The cl binaries directories.") - parser.add_argument( - "--platform_info_file_name", - type=str, - default="", - help="The cl binaries directories.") - parser.add_argument( - "--output_path", - type=str, - default="./mace/examples/codegen/opencl/opencl_compiled_program.cc", - help="The path of generated C++ header file which contains cl binaries.") - return parser.parse_known_args() + """Parses command line arguments.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--cl_binary_dirs", + type=str, + default="", + help="The cl binaries directories.") + parser.add_argument( + "--built_kernel_file_name", + type=str, + default="", + help="The cl binaries directories.") + parser.add_argument( + "--platform_info_file_name", + type=str, + default="", + help="The cl binaries directories.") + parser.add_argument( + "--output_path", + type=str, + default="./mace/examples/codegen/opencl/opencl_compiled_program.cc", + help="The path of generated C++ header file for cl binaries.") + return parser.parse_known_args() if __name__ == '__main__': - FLAGS, unparsed = parse_args() - main(unused_args=[sys.argv[0]] + unparsed) + FLAGS, unparsed = parse_args() + main(unused_args=[sys.argv[0]] + unparsed) diff --git a/mace/python/tools/source_converter_lib.py b/mace/python/tools/source_converter_lib.py index 16ad18d4..3bae0310 100644 --- a/mace/python/tools/source_converter_lib.py +++ b/mace/python/tools/source_converter_lib.py @@ -6,182 +6,196 @@ import hashlib from mace.proto import mace_pb2 from jinja2 import Environment, FileSystemLoader - GENERATED_NAME = set() + def generate_obfuscated_name(namespace, name): - md5 = hashlib.md5() - md5.update(namespace) - md5.update(name) - md5_digest = md5.hexdigest() - - name = md5_digest[:8] - while name in GENERATED_NAME: - name = md5_digest - assert name not in GENERATED_NAME - GENERATED_NAME.add(name) - return name + md5 = hashlib.md5() + md5.update(namespace) + md5.update(name) + md5_digest = md5.hexdigest() + + name = md5_digest[:8] + while name in GENERATED_NAME: + name = md5_digest + assert name not in GENERATED_NAME + GENERATED_NAME.add(name) + return name + def generate_tensor_map(tensors): - tensor_map = {} - for t in tensors: - if not tensor_map.has_key(t.name): - tensor_map[t.name] = generate_obfuscated_name("tensor", t.name) - return tensor_map + tensor_map = {} + for t in tensors: + if t.name not in tensor_map: + tensor_map[t.name] = generate_obfuscated_name("tensor", t.name) + return tensor_map + def generate_in_out_map(ops, tensor_map): - in_out_map = {} - for op in ops: - op.name = generate_obfuscated_name("op", op.name) - for input_name in op.input: - if not in_out_map.has_key(input_name): - if tensor_map.has_key(input_name): - in_out_map[input_name] = tensor_map[input_name] - else: - in_out_map[input_name] = generate_obfuscated_name("in", input_name) - for output_name in op.output: - if not in_out_map.has_key(output_name): - if tensor_map.has_key(output_name): - in_out_map[output_name] = tensor_map[output_name] - else: - 
in_out_map[output_name] = generate_obfuscated_name("out", output_name) - return in_out_map + in_out_map = {} + for op in ops: + op.name = generate_obfuscated_name("op", op.name) + for input_name in op.input: + if input_name not in in_out_map: + if input_name in tensor_map: + in_out_map[input_name] = tensor_map[input_name] + else: + in_out_map[input_name] = generate_obfuscated_name( + "in", input_name) + for output_name in op.output: + if output_name not in in_out_map: + if output_name in tensor_map: + in_out_map[output_name] = tensor_map[output_name] + else: + in_out_map[output_name] = generate_obfuscated_name( + "out", output_name) + return in_out_map + def obfuscate_name(net_def): - input_node = "mace_input_node" - output_node = "mace_output_node" - tensor_map = generate_tensor_map(net_def.tensors) - in_out_map = generate_in_out_map(net_def.op, tensor_map) - for t in net_def.tensors: - if input_node not in t.name and output_node not in t.name: - t.name = tensor_map[t.name] - for op in net_def.op: - for i in range(len(op.input)): - if input_node not in op.input[i]: - op.input[i] = in_out_map[op.input[i]] - for i in range(len(op.output)): - if output_node not in op.output[i]: - op.output[i] = in_out_map[op.output[i]] + input_node = "mace_input_node" + output_node = "mace_output_node" + tensor_map = generate_tensor_map(net_def.tensors) + in_out_map = generate_in_out_map(net_def.op, tensor_map) + for t in net_def.tensors: + if input_node not in t.name and output_node not in t.name: + t.name = tensor_map[t.name] + for op in net_def.op: + for i in range(len(op.input)): + if input_node not in op.input[i]: + op.input[i] = in_out_map[op.input[i]] + for i in range(len(op.output)): + if output_node not in op.output[i]: + op.output[i] = in_out_map[op.output[i]] + def rename_tensor(net_def): - tensor_map = {} - for t in net_def.tensors: - if not tensor_map.has_key(t.name): - tensor_map[t.name] = "_" + t.name[:-2].replace("/", "_") - t.name = tensor_map[t.name] - for op in net_def.op: - for i in range(len(op.input)): - if tensor_map.has_key(op.input[i]): - op.input[i] = tensor_map[op.input[i]] - for i in range(len(op.output)): - if tensor_map.has_key(op.output[i]): - op.output[i] = tensor_map[op.output[i]] + tensor_map = {} + for t in net_def.tensors: + if t.name not in tensor_map: + tensor_map[t.name] = "_" + t.name[:-2].replace("/", "_") + t.name = tensor_map[t.name] + for op in net_def.op: + for i in range(len(op.input)): + if op.input[i] in tensor_map: + op.input[i] = tensor_map[op.input[i]] + for i in range(len(op.output)): + if op.output[i] in tensor_map: + op.output[i] = tensor_map[op.output[i]] + class TensorInfo: - def __init__(self, id, t, runtime): - self.id = id - self.data_type = mace_pb2.DataType.Name(t.data_type) - if t.data_type == mace_pb2.DT_FLOAT: - if runtime == 'gpu': - self.data_type = mace_pb2.DT_HALF - self.data = bytearray(np.array(t.float_data).astype(np.float16).tobytes()) - else: - self.data_type = mace_pb2.DT_FLOAT - self.data = bytearray(np.array(t.float_data).astype(np.float32).tobytes()) - elif t.data_type == mace_pb2.DT_INT32: - self.data = bytearray(np.array(t.int32_data).astype(np.int32).tobytes()) - elif t.data_type == mace_pb2.DT_UINT8: - self.data = bytearray(np.array(t.int32_data).astype(np.uint8).tolist()) + def __init__(self, id, t, runtime): + self.id = id + self.data_type = mace_pb2.DataType.Name(t.data_type) + if t.data_type == mace_pb2.DT_FLOAT: + if runtime == 'gpu': + self.data_type = mace_pb2.DT_HALF + self.data = bytearray( + 
np.array(t.float_data).astype(np.float16).tobytes()) + else: + self.data_type = mace_pb2.DT_FLOAT + self.data = bytearray( + np.array(t.float_data).astype(np.float32).tobytes()) + elif t.data_type == mace_pb2.DT_INT32: + self.data = bytearray( + np.array(t.int32_data).astype(np.int32).tobytes()) + elif t.data_type == mace_pb2.DT_UINT8: + self.data = bytearray( + np.array(t.int32_data).astype(np.uint8).tolist()) + def stringfy(value): - return ', '.join('"{0}"'.format(w) for w in value) - -def convert_to_source(net_def, mode_pb_checksum, template_dir, obfuscate, model_tag, output, runtime, embed_model_data): - if obfuscate: - obfuscate_name(net_def) - else: - rename_tensor(net_def) - - # Capture our current directory - print template_dir - - # Create the jinja2 environment. - j2_env = Environment(loader=FileSystemLoader(template_dir), trim_blocks=True) - j2_env.filters['stringfy'] = stringfy - output_dir = os.path.dirname(output) + '/' - # generate tensor source files - template_name = 'tensor_source.jinja2' - model_data = [] - offset = 0 - counter = 0 - for t in net_def.tensors: - tensor_info = TensorInfo(counter, t, runtime) - # align - if tensor_info.data_type != 'DT_UINT8' and offset % 4 != 0: - padding = 4 - offset % 4 - model_data.extend(bytearray([0] * padding)) - offset += padding + return ', '.join('"{0}"'.format(w) for w in value) + + +def convert_to_source(net_def, mode_pb_checksum, template_dir, obfuscate, + model_tag, output, runtime, embed_model_data): + if obfuscate: + obfuscate_name(net_def) + else: + rename_tensor(net_def) + + # Capture our current directory + print template_dir + + # Create the jinja2 environment. + j2_env = Environment( + loader=FileSystemLoader(template_dir), trim_blocks=True) + j2_env.filters['stringfy'] = stringfy + output_dir = os.path.dirname(output) + '/' + # generate tensor source files + template_name = 'tensor_source.jinja2' + model_data = [] + offset = 0 + counter = 0 + for t in net_def.tensors: + tensor_info = TensorInfo(counter, t, runtime) + # align + if tensor_info.data_type != 'DT_UINT8' and offset % 4 != 0: + padding = 4 - offset % 4 + model_data.extend(bytearray([0] * padding)) + offset += padding + source = j2_env.get_template(template_name).render( + tensor_info=tensor_info, + tensor=t, + tag=model_tag, + runtime=runtime, + offset=offset, + ) + model_data.extend(tensor_info.data) + offset += len(tensor_info.data) + with open(output_dir + 'tensor' + str(counter) + '.cc', "wb") as f: + f.write(source) + counter += 1 + + # generate tensor data + template_name = 'tensor_data.jinja2' source = j2_env.get_template(template_name).render( - tensor_info = tensor_info, - tensor = t, - tag = model_tag, - runtime = runtime, - offset = offset, - ) - model_data.extend(tensor_info.data) - offset += len(tensor_info.data) - with open(output_dir + 'tensor' + str(counter) + '.cc', "wb") as f: - f.write(source) - counter += 1 - - # generate tensor data - template_name = 'tensor_data.jinja2' - source = j2_env.get_template(template_name).render( - tag = model_tag, - embed_model_data = embed_model_data, - model_data_size = offset, - model_data = model_data - ) - with open(output_dir + 'tensor_data' + '.cc', "wb") as f: - f.write(source) - if not embed_model_data: - f = open(output_dir + model_tag + '.data', "wb") - f.write(bytearray(model_data)) - f.close() - - # generate op source files - template_name = 'operator.jinja2' - counter = 0 - op_size = len(net_def.op) - for start in range(0, op_size, 10): + tag=model_tag, + embed_model_data=embed_model_data, + 
model_data_size=offset, + model_data=model_data) + with open(output_dir + 'tensor_data' + '.cc', "wb") as f: + f.write(source) + if not embed_model_data: + f = open(output_dir + model_tag + '.data', "wb") + f.write(bytearray(model_data)) + f.close() + + # generate op source files + template_name = 'operator.jinja2' + counter = 0 + op_size = len(net_def.op) + for start in range(0, op_size, 10): + source = j2_env.get_template(template_name).render( + start=start, + end=min(start + 10, op_size), + net=net_def, + tag=model_tag, + runtime=runtime, + ) + with open(output_dir + 'op' + str(counter) + '.cc', "wb") as f: + f.write(source) + counter += 1 + + # generate model source files + template_name = 'model.jinja2' + tensors = [ + TensorInfo(i, net_def.tensors[i], runtime) + for i in range(len(net_def.tensors)) + ] source = j2_env.get_template(template_name).render( - start = start, - end = min(start+10, op_size), - net = net_def, - tag = model_tag, - runtime = runtime, - ) - with open(output_dir + 'op' + str(counter) + '.cc', "wb") as f: - f.write(source) - counter += 1 - - # generate model source files - template_name = 'model.jinja2' - tensors = [TensorInfo(i, net_def.tensors[i], runtime) for i in range(len(net_def.tensors))] - source = j2_env.get_template(template_name).render( - tensors = tensors, - net = net_def, - tag = model_tag, - runtime = runtime, - model_pb_checksum = mode_pb_checksum - ) - with open(output, "wb") as f: - f.write(source) - - # generate model header file - template_name = 'model_header.jinja2' - source = j2_env.get_template(template_name).render( - tag = model_tag, - ) - with open(output_dir + model_tag + '.h', "wb") as f: - f.write(source) + tensors=tensors, + net=net_def, + tag=model_tag, + runtime=runtime, + model_pb_checksum=mode_pb_checksum) + with open(output, "wb") as f: + f.write(source) + + # generate model header file + template_name = 'model_header.jinja2' + source = j2_env.get_template(template_name).render(tag=model_tag, ) + with open(output_dir + model_tag + '.h', "wb") as f: + f.write(source) diff --git a/mace/python/tools/tf_converter_lib.py b/mace/python/tools/tf_converter_lib.py index 22732a18..e89e31fd 100644 --- a/mace/python/tools/tf_converter_lib.py +++ b/mace/python/tools/tf_converter_lib.py @@ -8,51 +8,41 @@ from mace.python.tools import memory_optimizer from tensorflow.core.framework import graph_pb2 from tensorflow.core.framework import tensor_shape_pb2 -padding_mode = { - 'VALID': 0, - 'SAME': 1, - 'FULL': 2 -} -pooling_type_mode = { - 'AvgPool': 1, - 'MaxPool': 2 -} +padding_mode = {'VALID': 0, 'SAME': 1, 'FULL': 2} +pooling_type_mode = {'AvgPool': 1, 'MaxPool': 2} # the order should be the same as # eltwise type's in mace/kernels/eltwise.h # and also cwise type's in mace/kernels/cwise.h # cuz these math ops should have compatible with "EltWise" and "CWise" math_type_mode = { - 'MUL': 0, - 'ADD': 1, - 'MAX': 2, - 'MIN': 3, - 'SUB': 4, - 'DIV': 5, - 'NEG': 6, - 'ABS': 7 + 'MUL': 0, + 'ADD': 1, + 'MAX': 2, + 'MIN': 3, + 'SUB': 4, + 'DIV': 5, + 'NEG': 6, + 'ABS': 7 } buffer_type_map = { - 'CONV2D_FILTER' : 0, - 'IN_OUT_CHANNEL' : 1, - 'ARGUMENT' : 2, - 'IN_OUT_HEIGHT' : 3, - 'IN_OUT_WIDTH' : 4, - 'WINOGRAD_FILTER' : 5, - 'DW_CONV2D_FILTER' : 6, + 'CONV2D_FILTER': 0, + 'IN_OUT_CHANNEL': 1, + 'ARGUMENT': 2, + 'IN_OUT_HEIGHT': 3, + 'IN_OUT_WIDTH': 4, + 'WINOGRAD_FILTER': 5, + 'DW_CONV2D_FILTER': 6, } -data_type_map = { - 'DT_HALF' : mace_pb2.DT_HALF, - 'DT_FLOAT': mace_pb2.DT_FLOAT -} +data_type_map = {'DT_HALF': mace_pb2.DT_HALF, 'DT_FLOAT': 
mace_pb2.DT_FLOAT} activation_name_map = { - 'Relu' : 'RELU', - 'Sigmoid' : 'SIGMOID', - 'Tanh' : 'TANH', - 'Relu6' : 'RELUX' + 'Relu': 'RELU', + 'Sigmoid': 'SIGMOID', + 'Tanh': 'TANH', + 'Relu6': 'RELUX' } BATCH_NORM_ORDER = ["Add", "Rsqrt", "Mul", "Mul", "Mul", "Sub", "Add"] @@ -62,1123 +52,1170 @@ MACE_OUTPUT_NODE_NAME = "mace_output_node" OPENCL_IMAGE_MAX_SIZE = 16384 + def get_input_tensor(op, index): - input_tensor = op.inputs[index] - if input_tensor.op.type == 'Reshape': - input_tensor = get_input_tensor(input_tensor.op, 0) - return input_tensor + input_tensor = op.inputs[index] + if input_tensor.op.type == 'Reshape': + input_tensor = get_input_tensor(input_tensor.op, 0) + return input_tensor + class TFConverter(object): - def __init__(self, tf_ops, net_def, dt, device, winograd): - self.net_def = net_def - self.tf_ops = tf_ops - self.dt = dt - self.device = device - self.winograd = winograd - self.tf_graph = {} - self.tf_parents = {} - self.resolved_ops = {} - self.unused_tensor = set() - self.transpose_filter_tensor = {} - self.reshape_tensor = {} - self.ops = {} - - for op in tf_ops: - self.ops[op.name] = op - - for op in tf_ops: - self.resolved_ops[op.name] = 0 - for input in op.inputs: - input_name = input.name[:-2] - if input_name not in self.tf_graph: - self.tf_graph[input_name] = [] - self.tf_graph[input_name].append(op) - if op.name not in self.tf_parents: - self.tf_parents[op.name] = [] - self.tf_parents[op.name].append(self.ops[input_name]) - - def add_buffer_to_image(self, input_name, input_type): - output_name = input_name[:-2] + "_b2i" + input_name[-2:] - op_def = self.net_def.op.add() - op_def.name = output_name[:-2] - op_def.type = 'BufferToImage' - op_def.input.extend([input_name]) - op_def.output.extend([output_name]) - - arg = op_def.arg.add() - arg.name = 'buffer_type' - arg.i = buffer_type_map[input_type] - arg = op_def.arg.add() - arg.name = 'mode' - arg.i = 0 - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - return output_name - - def add_image_to_buffer(self, input_name, input_type): - output_name = input_name[:-2] + "_i2b" + input_name[-2:] - op_def = self.net_def.op.add() - op_def.name = output_name[:-2] - op_def.type = 'ImageToBuffer' - op_def.input.extend([input_name]) - op_def.output.extend([output_name]) - - arg = op_def.arg.add() - arg.name = 'buffer_type' - arg.i = buffer_type_map[input_type] - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - return output_name - - def add_gpu_input_transform(self, names): - for name in names: - new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0" - op_def = self.net_def.op.add() - op_def.name = name - op_def.type = 'BufferToImage' - op_def.input.extend([new_input_name]) - op_def.output.extend([name+':0']) - - epsilon_arg = op_def.arg.add() - epsilon_arg.name = 'buffer_type' - epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL'] - - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - - def add_neon_input_transform(self, names): - for name in names: - new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0" - op_def = self.net_def.op.add() - op_def.name = name - op_def.type = 'Transpose' - op_def.input.extend([new_input_name]) - op_def.output.extend([name+':0']) - - dims_arg = op_def.arg.add() - dims_arg.name = 'dims' - dims_arg.ints.extend([0, 3, 1, 2]) - - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - - def add_gpu_output_transform(self, names): - for name in names: - output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0" - op_def = self.net_def.op.add() - op_def.name 
= output_name[:-2] - op_def.type = 'ImageToBuffer' - op_def.input.extend([name+':0']) - op_def.output.extend([output_name]) - - epsilon_arg = op_def.arg.add() - epsilon_arg.name = 'buffer_type' - epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL'] - - def add_neon_output_transform(self, names): - for name in names: - output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0" - op_def = self.net_def.op.add() - op_def.name = output_name[:-2] - op_def.type = 'Transpose' - op_def.input.extend([name+':0']) - op_def.output.extend([output_name]) - - dims_arg = op_def.arg.add() - dims_arg.name = 'dims' - dims_arg.ints.extend([0, 2, 3, 1]) - - @staticmethod - def add_output_shape(outputs, op): - output_shapes = [] - for output in outputs: - output_shape = mace_pb2.OutputShape() - if isinstance(output, list): - output_shape.dims.extend(output) - elif isinstance(output, tf.Tensor): - if output.shape.num_elements() is not None: - output_shape.dims.extend(output.shape.as_list()) - else: - raise ValueError('output type not supported: ', type(output)) - output_shapes.append(output_shape) - op.output_shape.extend(output_shapes) - - def add_tensor(self, name, shape, tf_dt, value): - tensor = self.net_def.tensors.add() - tensor.name = name - - shape = list(shape) - tensor.dims.extend(shape) - - if tf_dt == tf.float32: - tensor.data_type = mace_pb2.DT_FLOAT - tensor.float_data.extend(value.flat) - elif tf_dt == tf.int32: - tensor.data_type = mace_pb2.DT_INT32 - tensor.int32_data.extend(value.flat) - else: - raise Exception("Not supported tensor type: " + tf_dt.name) - - def convert_reshape(self, op): - input_tensor = get_input_tensor(op, 0) - shape_tensor = get_input_tensor(op, 1) - shape_value = shape_tensor.eval().astype(np.int32) - self.unused_tensor.add(shape_tensor.name) - self.reshape_tensor[input_tensor.name] = shape_value - self.resolved_ops[op.name] = 1 - - def convert_tensor(self, op): - output_name = op.outputs[0].name - if output_name not in self.unused_tensor: - tensor = self.net_def.tensors.add() - tf_tensor = op.outputs[0].eval() - if output_name in self.transpose_filter_tensor: - tf_tensor = tf_tensor.transpose(self.transpose_filter_tensor[output_name]) - if output_name in self.reshape_tensor: - tf_tensor = tf_tensor.reshape(self.reshape_tensor[output_name]) - tensor.name = op.outputs[0].name - - shape = list(tf_tensor.shape) - tensor.dims.extend(shape) - - tf_dt = op.get_attr('dtype') - if tf_dt == tf.float32: - tensor.data_type = mace_pb2.DT_FLOAT - tensor.float_data.extend(tf_tensor.astype(np.float32).flat) - elif tf_dt == tf.int32: - tensor.data_type = mace_pb2.DT_INT32 - tensor.int32_data.extend(tf_tensor.astype(np.int32).flat) - else: - raise Exception("Not supported tensor type: " + tf_dt.name) - self.resolved_ops[op.name] = 1 - - def check_winograd_conv(self, op): - filter_shape = get_input_tensor(op, 1).shape.as_list() - strides = op.get_attr('strides')[1:3] - output_shape = op.outputs[0].shape.as_list() - if len(output_shape) == 0 or output_shape[0] is None: - return False - width = output_shape[0] * ((output_shape[1] + 1)/2) * ((output_shape[2]+1)/2) - return self.winograd and op.type != 'DepthwiseConv2dNative' and self.device == 'gpu' and \ - filter_shape[0] == 3 and (filter_shape[0] == filter_shape[1]) and \ + def __init__(self, tf_ops, net_def, dt, device, winograd): + self.net_def = net_def + self.tf_ops = tf_ops + self.dt = dt + self.device = device + self.winograd = winograd + self.tf_graph = {} + self.tf_parents = {} + self.resolved_ops = {} + self.unused_tensor = set() + 
self.transpose_filter_tensor = {} + self.reshape_tensor = {} + self.ops = {} + + for op in tf_ops: + self.ops[op.name] = op + + for op in tf_ops: + self.resolved_ops[op.name] = 0 + for input in op.inputs: + input_name = input.name[:-2] + if input_name not in self.tf_graph: + self.tf_graph[input_name] = [] + self.tf_graph[input_name].append(op) + if op.name not in self.tf_parents: + self.tf_parents[op.name] = [] + self.tf_parents[op.name].append(self.ops[input_name]) + + def add_buffer_to_image(self, input_name, input_type): + output_name = input_name[:-2] + "_b2i" + input_name[-2:] + op_def = self.net_def.op.add() + op_def.name = output_name[:-2] + op_def.type = 'BufferToImage' + op_def.input.extend([input_name]) + op_def.output.extend([output_name]) + + arg = op_def.arg.add() + arg.name = 'buffer_type' + arg.i = buffer_type_map[input_type] + arg = op_def.arg.add() + arg.name = 'mode' + arg.i = 0 + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + return output_name + + def add_image_to_buffer(self, input_name, input_type): + output_name = input_name[:-2] + "_i2b" + input_name[-2:] + op_def = self.net_def.op.add() + op_def.name = output_name[:-2] + op_def.type = 'ImageToBuffer' + op_def.input.extend([input_name]) + op_def.output.extend([output_name]) + + arg = op_def.arg.add() + arg.name = 'buffer_type' + arg.i = buffer_type_map[input_type] + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + return output_name + + def add_gpu_input_transform(self, names): + for name in names: + new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0" + op_def = self.net_def.op.add() + op_def.name = name + op_def.type = 'BufferToImage' + op_def.input.extend([new_input_name]) + op_def.output.extend([name + ':0']) + + epsilon_arg = op_def.arg.add() + epsilon_arg.name = 'buffer_type' + epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL'] + + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + + def add_neon_input_transform(self, names): + for name in names: + new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0" + op_def = self.net_def.op.add() + op_def.name = name + op_def.type = 'Transpose' + op_def.input.extend([new_input_name]) + op_def.output.extend([name + ':0']) + + dims_arg = op_def.arg.add() + dims_arg.name = 'dims' + dims_arg.ints.extend([0, 3, 1, 2]) + + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + + def add_gpu_output_transform(self, names): + for name in names: + output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0" + op_def = self.net_def.op.add() + op_def.name = output_name[:-2] + op_def.type = 'ImageToBuffer' + op_def.input.extend([name + ':0']) + op_def.output.extend([output_name]) + + epsilon_arg = op_def.arg.add() + epsilon_arg.name = 'buffer_type' + epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL'] + + def add_neon_output_transform(self, names): + for name in names: + output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0" + op_def = self.net_def.op.add() + op_def.name = output_name[:-2] + op_def.type = 'Transpose' + op_def.input.extend([name + ':0']) + op_def.output.extend([output_name]) + + dims_arg = op_def.arg.add() + dims_arg.name = 'dims' + dims_arg.ints.extend([0, 2, 3, 1]) + + @staticmethod + def add_output_shape(outputs, op): + output_shapes = [] + for output in outputs: + output_shape = mace_pb2.OutputShape() + if isinstance(output, list): + output_shape.dims.extend(output) + elif isinstance(output, tf.Tensor): + if output.shape.num_elements() is not None: + output_shape.dims.extend(output.shape.as_list()) + else: + raise 
ValueError('output type not supported: ', type(output)) + output_shapes.append(output_shape) + op.output_shape.extend(output_shapes) + + def add_tensor(self, name, shape, tf_dt, value): + tensor = self.net_def.tensors.add() + tensor.name = name + + shape = list(shape) + tensor.dims.extend(shape) + + if tf_dt == tf.float32: + tensor.data_type = mace_pb2.DT_FLOAT + tensor.float_data.extend(value.flat) + elif tf_dt == tf.int32: + tensor.data_type = mace_pb2.DT_INT32 + tensor.int32_data.extend(value.flat) + else: + raise Exception("Not supported tensor type: " + tf_dt.name) + + def convert_reshape(self, op): + input_tensor = get_input_tensor(op, 0) + shape_tensor = get_input_tensor(op, 1) + shape_value = shape_tensor.eval().astype(np.int32) + self.unused_tensor.add(shape_tensor.name) + self.reshape_tensor[input_tensor.name] = shape_value + self.resolved_ops[op.name] = 1 + + def convert_tensor(self, op): + output_name = op.outputs[0].name + if output_name not in self.unused_tensor: + tensor = self.net_def.tensors.add() + tf_tensor = op.outputs[0].eval() + if output_name in self.transpose_filter_tensor: + tf_tensor = tf_tensor.transpose( + self.transpose_filter_tensor[output_name]) + if output_name in self.reshape_tensor: + tf_tensor = tf_tensor.reshape(self.reshape_tensor[output_name]) + tensor.name = op.outputs[0].name + + shape = list(tf_tensor.shape) + tensor.dims.extend(shape) + + tf_dt = op.get_attr('dtype') + if tf_dt == tf.float32: + tensor.data_type = mace_pb2.DT_FLOAT + tensor.float_data.extend(tf_tensor.astype(np.float32).flat) + elif tf_dt == tf.int32: + tensor.data_type = mace_pb2.DT_INT32 + tensor.int32_data.extend(tf_tensor.astype(np.int32).flat) + else: + raise Exception("Not supported tensor type: " + tf_dt.name) + self.resolved_ops[op.name] = 1 + + def check_winograd_conv(self, op): + filter_shape = get_input_tensor(op, 1).shape.as_list() + strides = op.get_attr('strides')[1:3] + output_shape = op.outputs[0].shape.as_list() + if len(output_shape) == 0 or output_shape[0] is None: + return False + width = output_shape[0] * ((output_shape[1] + 1) / 2) * (( + output_shape[2] + 1) / 2) + return self.winograd and op.type != 'DepthwiseConv2dNative' and \ + self.device == 'gpu' and filter_shape[0] == 3 and \ + (filter_shape[0] == filter_shape[1]) and \ (strides[0] == 1) and (strides[0] == strides[1]) and \ (16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \ (16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \ (width < OPENCL_IMAGE_MAX_SIZE) - def convert_winograd_conv(self, op): - filter_tensor = get_input_tensor(op, 1) - filter_shape = filter_tensor.shape.as_list() - output_shape = op.outputs[0].shape.as_list() - - self.transpose_filter_tensor[filter_tensor.name] = (3, 2, 0, 1) - filter_name = self.add_buffer_to_image(op.inputs[1].name, "WINOGRAD_FILTER") - - # Input transform - wt_op = mace_pb2.OperatorDef() - arg = wt_op.arg.add() - arg.name = 'T' - arg.i = self.dt - padding_arg = wt_op.arg.add() - padding_arg.name = 'padding' - padding_arg.i = padding_mode[op.get_attr('padding')] - wt_op.name = op.name + '_input_transform' - wt_op.type = 'WinogradTransform' - wt_op.input.extend([op.inputs[0].name]) - wt_output_name = wt_op.name + ":0" - wt_op.output.extend([wt_output_name]) - wt_output_shape = mace_pb2.OutputShape() - wt_output_width = output_shape[0] * ((output_shape[1] + 1)/2) * ((output_shape[2]+1)/2) - wt_output_shape.dims.extend([16, filter_shape[2], wt_output_width, 1]) - wt_op.output_shape.extend([wt_output_shape]) - - # MatMul - matmul_op = mace_pb2.OperatorDef() - arg = 
matmul_op.arg.add() - arg.name = 'T' - arg.i = self.dt - matmul_op.name = op.name + '_matmul' - matmul_op.type = 'MatMul' - matmul_op.input.extend([filter_name, wt_output_name]) - matmul_output_name = matmul_op.name + ":0" - matmul_op.output.extend([matmul_output_name]) - matmul_output_shape = mace_pb2.OutputShape() - matmul_output_shape.dims.extend([16, filter_shape[3], wt_output_width, 1]) - matmul_op.output_shape.extend([matmul_output_shape]) - - # Inverse transform - iwt_op = mace_pb2.OperatorDef() - arg = iwt_op.arg.add() - arg.name = 'T' - arg.i = self.dt - batch_arg = iwt_op.arg.add() - batch_arg.name = 'batch' - batch_arg.i = output_shape[0] - height_arg = iwt_op.arg.add() - height_arg.name = 'height' - height_arg.i = output_shape[1] - width_arg = iwt_op.arg.add() - width_arg.name = 'width' - width_arg.i = output_shape[2] - iwt_op.name = op.name + '_inverse_transform' - iwt_op.type = 'WinogradInverseTransform' - iwt_op.input.extend([matmul_output_name]) - - final_op = op - self.resolved_ops[op.name] = 1 - - if len(self.tf_graph[op.name]) == 1 and self.tf_graph[op.name][0].type == 'BiasAdd' : - bias_add_op = self.tf_graph[op.name][0] - output_name = self.add_buffer_to_image(get_input_tensor(bias_add_op, 1).name, "ARGUMENT") - iwt_op.input.extend([output_name]) - final_op = bias_add_op - self.resolved_ops[bias_add_op.name] = 1 - - if len(self.tf_graph[final_op.name]) == 1 \ - and self.tf_graph[final_op.name][0].type in activation_name_map: - activation_op = self.tf_graph[final_op.name][0] - fused_act_arg = iwt_op.arg.add() - fused_act_arg.name = 'activation' - fused_act_arg.s = activation_name_map[activation_op.type] - if activation_op.type == 'Relu6': - max_limit_arg = iwt_op.arg.add() - max_limit_arg.name = 'max_limit' - max_limit_arg.f = 6 - final_op = activation_op - self.resolved_ops[activation_op.name] = 1 - - iwt_op.output.extend([output.name for output in final_op.outputs]) - self.add_output_shape(final_op.outputs, iwt_op) - self.net_def.op.extend([wt_op, matmul_op, iwt_op]) - - - def convert_conv2d(self, op): - op_def = mace_pb2.OperatorDef() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - op_def.name = op.name - if op.type == 'DepthwiseConv2dNative': - op_def.type = 'DepthwiseConv2d' - if self.device == 'neon': - self.transpose_filter_tensor[get_input_tensor(op, 1).name] = (3, 2, 0, 1) - else: - op_def.type = op.type - if self.device == 'neon': - self.transpose_filter_tensor[get_input_tensor(op, 1).name] = (3, 2, 0, 1) - else: - self.transpose_filter_tensor[get_input_tensor(op, 1).name] = (0, 1, 3, 2) - if self.device == 'gpu': - op_def.input.extend([op.inputs[0].name]) - buffer_type = "DW_CONV2D_FILTER" if op_def.type == 'DepthwiseConv2d' else "CONV2D_FILTER" - output_name = self.add_buffer_to_image(get_input_tensor(op, 1).name, buffer_type) - op_def.input.extend([output_name]) - else: - op_def.input.extend([get_input_tensor(op, i).name for i in range(len(op.inputs))]) - - padding_arg = op_def.arg.add() - padding_arg.name = 'padding' - padding_arg.i = padding_mode[op.get_attr('padding')] - strides_arg = op_def.arg.add() - strides_arg.name = 'strides' - strides_arg.ints.extend(op.get_attr('strides')[1:3]) - data_format_arg = op_def.arg.add() - data_format_arg.name = 'data_format' - if self.device == 'neon': - data_format_arg.s = 'NCHW' - else: - data_format_arg.s = 'NHWC' - final_op = op - self.resolved_ops[op.name] = 1 - - if len(self.tf_graph.get(op.name, [])) == 1 and self.tf_graph[op.name][0].type == 'BiasAdd': - bias_add_op = self.tf_graph[op.name][0] - 
if self.device == 'gpu': - output_name = self.add_buffer_to_image(get_input_tensor(bias_add_op, 1).name, "ARGUMENT") - op_def.input.extend([output_name]) - else: - op_def.input.extend([get_input_tensor(bias_add_op, 1).name]) - final_op = bias_add_op - self.resolved_ops[bias_add_op.name] = 1 - - if len(self.tf_graph.get(final_op.name, [])) == 1 \ - and self.tf_graph[final_op.name][0].type in activation_name_map: - activation_op = self.tf_graph[final_op.name][0] - if op_def.type == "Conv2D": - op_def.type = "FusedConv2D" - fused_act_arg = op_def.arg.add() - fused_act_arg.name = 'activation' - fused_act_arg.s = activation_name_map[activation_op.type] - if activation_op.type == 'Relu6': - max_limit_arg = op_def.arg.add() - max_limit_arg.name = 'max_limit' - max_limit_arg.f = 6 - final_op = activation_op - self.resolved_ops[activation_op.name] = 1 - - op_def.output.extend([output.name for output in final_op.outputs]) - self.add_output_shape(final_op.outputs, op_def) - self.net_def.op.extend([op_def]) - - def convert_fused_batchnorm(self, op): - op_def = mace_pb2.OperatorDef() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - data_format_arg = op_def.arg.add() - data_format_arg.name = 'data_format' - if self.device == 'neon': - data_format_arg.s = 'NCHW' - else: - data_format_arg.s = 'NHWC' - op_def.name = op.name - op_def.type = 'FoldedBatchNorm' - - gamma_tensor = get_input_tensor(op, 1) - for i in range(1, 5): - input_tensor = get_input_tensor(op, i) - assert input_tensor.shape == gamma_tensor.shape - self.unused_tensor.add(input_tensor.name) - - gamma_value = get_input_tensor(op, 1).eval().astype(np.float32) - beta_value = get_input_tensor(op, 2).eval().astype(np.float32) - mean_value = get_input_tensor(op, 3).eval().astype(np.float32) - var_value = get_input_tensor(op, 4).eval().astype(np.float32) - epsilon_value = op.get_attr('epsilon') - - scale_value = ( - (1.0 / np.vectorize(math.sqrt)(var_value + epsilon_value)) * - gamma_value) - offset_value = (-mean_value * scale_value) + beta_value - idx = gamma_tensor.name.rfind('/') - name_prefix = gamma_tensor.name[:idx] + '/' - input_names = [name_prefix+'scale:0', name_prefix+'offset:0'] - self.add_tensor(input_names[0], gamma_value.shape, - gamma_tensor.dtype, scale_value) - self.add_tensor(input_names[1], gamma_value.shape, - gamma_tensor.dtype, offset_value) - - op_def.input.extend([op.inputs[0].name]) - if self.device == 'gpu': - for name in input_names: - output_name = self.add_buffer_to_image(name, "ARGUMENT") - op_def.input.extend([output_name]) - else: - op_def.input.extend([name for name in input_names]) - - self.resolved_ops[op.name] = 1 - final_op = op - - if len(self.tf_graph[op.name]) == 1 \ - and self.tf_graph[op.name][0].type in activation_name_map: - activation_op = self.tf_graph[op.name][0] - fused_act_arg = op_def.arg.add() - fused_act_arg.name = 'activation' - fused_act_arg.s = activation_name_map[activation_op.type] - if activation_op.type == 'Relu6': + def convert_winograd_conv(self, op): + filter_tensor = get_input_tensor(op, 1) + filter_shape = filter_tensor.shape.as_list() + output_shape = op.outputs[0].shape.as_list() + + self.transpose_filter_tensor[filter_tensor.name] = (3, 2, 0, 1) + filter_name = self.add_buffer_to_image(op.inputs[1].name, + "WINOGRAD_FILTER") + + # Input transform + wt_op = mace_pb2.OperatorDef() + arg = wt_op.arg.add() + arg.name = 'T' + arg.i = self.dt + padding_arg = wt_op.arg.add() + padding_arg.name = 'padding' + padding_arg.i = padding_mode[op.get_attr('padding')] + wt_op.name = 
op.name + '_input_transform' + wt_op.type = 'WinogradTransform' + wt_op.input.extend([op.inputs[0].name]) + wt_output_name = wt_op.name + ":0" + wt_op.output.extend([wt_output_name]) + wt_output_shape = mace_pb2.OutputShape() + wt_output_width = output_shape[0] * ((output_shape[1] + 1) / 2) * (( + output_shape[2] + 1) / 2) + wt_output_shape.dims.extend([16, filter_shape[2], wt_output_width, 1]) + wt_op.output_shape.extend([wt_output_shape]) + + # MatMul + matmul_op = mace_pb2.OperatorDef() + arg = matmul_op.arg.add() + arg.name = 'T' + arg.i = self.dt + matmul_op.name = op.name + '_matmul' + matmul_op.type = 'MatMul' + matmul_op.input.extend([filter_name, wt_output_name]) + matmul_output_name = matmul_op.name + ":0" + matmul_op.output.extend([matmul_output_name]) + matmul_output_shape = mace_pb2.OutputShape() + matmul_output_shape.dims.extend( + [16, filter_shape[3], wt_output_width, 1]) + matmul_op.output_shape.extend([matmul_output_shape]) + + # Inverse transform + iwt_op = mace_pb2.OperatorDef() + arg = iwt_op.arg.add() + arg.name = 'T' + arg.i = self.dt + batch_arg = iwt_op.arg.add() + batch_arg.name = 'batch' + batch_arg.i = output_shape[0] + height_arg = iwt_op.arg.add() + height_arg.name = 'height' + height_arg.i = output_shape[1] + width_arg = iwt_op.arg.add() + width_arg.name = 'width' + width_arg.i = output_shape[2] + iwt_op.name = op.name + '_inverse_transform' + iwt_op.type = 'WinogradInverseTransform' + iwt_op.input.extend([matmul_output_name]) + + final_op = op + self.resolved_ops[op.name] = 1 + + if len(self.tf_graph[op.name] + ) == 1 and self.tf_graph[op.name][0].type == 'BiasAdd': + bias_add_op = self.tf_graph[op.name][0] + output_name = self.add_buffer_to_image( + get_input_tensor(bias_add_op, 1).name, "ARGUMENT") + iwt_op.input.extend([output_name]) + final_op = bias_add_op + self.resolved_ops[bias_add_op.name] = 1 + + if len(self.tf_graph[final_op.name]) == 1 and \ + self.tf_graph[final_op.name][0].type in activation_name_map: + activation_op = self.tf_graph[final_op.name][0] + fused_act_arg = iwt_op.arg.add() + fused_act_arg.name = 'activation' + fused_act_arg.s = activation_name_map[activation_op.type] + if activation_op.type == 'Relu6': + max_limit_arg = iwt_op.arg.add() + max_limit_arg.name = 'max_limit' + max_limit_arg.f = 6 + final_op = activation_op + self.resolved_ops[activation_op.name] = 1 + + iwt_op.output.extend([output.name for output in final_op.outputs]) + self.add_output_shape(final_op.outputs, iwt_op) + self.net_def.op.extend([wt_op, matmul_op, iwt_op]) + + def convert_conv2d(self, op): + op_def = mace_pb2.OperatorDef() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + if op.type == 'DepthwiseConv2dNative': + op_def.type = 'DepthwiseConv2d' + if self.device == 'neon': + self.transpose_filter_tensor[get_input_tensor( + op, 1).name] = (3, 2, 0, 1) + else: + op_def.type = op.type + if self.device == 'neon': + self.transpose_filter_tensor[get_input_tensor( + op, 1).name] = (3, 2, 0, 1) + else: + self.transpose_filter_tensor[get_input_tensor( + op, 1).name] = (0, 1, 3, 2) + if self.device == 'gpu': + op_def.input.extend([op.inputs[0].name]) + if op_def.type == 'DepthwiseConv2d': + buffer_type = "DW_CONV2D_FILTER" + else: + buffer_type = "CONV2D_FILTER" + output_name = self.add_buffer_to_image( + get_input_tensor(op, 1).name, buffer_type) + op_def.input.extend([output_name]) + else: + op_def.input.extend( + [get_input_tensor(op, i).name for i in range(len(op.inputs))]) + + padding_arg = op_def.arg.add() + padding_arg.name = 
'padding' + padding_arg.i = padding_mode[op.get_attr('padding')] + strides_arg = op_def.arg.add() + strides_arg.name = 'strides' + strides_arg.ints.extend(op.get_attr('strides')[1:3]) + data_format_arg = op_def.arg.add() + data_format_arg.name = 'data_format' + if self.device == 'neon': + data_format_arg.s = 'NCHW' + else: + data_format_arg.s = 'NHWC' + final_op = op + self.resolved_ops[op.name] = 1 + + if len(self.tf_graph.get(op.name, [])) == 1 and \ + self.tf_graph[op.name][0].type == 'BiasAdd': + bias_add_op = self.tf_graph[op.name][0] + if self.device == 'gpu': + output_name = self.add_buffer_to_image( + get_input_tensor(bias_add_op, 1).name, "ARGUMENT") + op_def.input.extend([output_name]) + else: + op_def.input.extend([get_input_tensor(bias_add_op, 1).name]) + final_op = bias_add_op + self.resolved_ops[bias_add_op.name] = 1 + + if len(self.tf_graph.get(final_op.name, [])) == 1 and \ + self.tf_graph[final_op.name][0].type in activation_name_map: + activation_op = self.tf_graph[final_op.name][0] + if op_def.type == "Conv2D": + op_def.type = "FusedConv2D" + fused_act_arg = op_def.arg.add() + fused_act_arg.name = 'activation' + fused_act_arg.s = activation_name_map[activation_op.type] + if activation_op.type == 'Relu6': + max_limit_arg = op_def.arg.add() + max_limit_arg.name = 'max_limit' + max_limit_arg.f = 6 + final_op = activation_op + self.resolved_ops[activation_op.name] = 1 + + op_def.output.extend([output.name for output in final_op.outputs]) + self.add_output_shape(final_op.outputs, op_def) + self.net_def.op.extend([op_def]) + + def convert_fused_batchnorm(self, op): + op_def = mace_pb2.OperatorDef() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + data_format_arg = op_def.arg.add() + data_format_arg.name = 'data_format' + if self.device == 'neon': + data_format_arg.s = 'NCHW' + else: + data_format_arg.s = 'NHWC' + op_def.name = op.name + op_def.type = 'FoldedBatchNorm' + + gamma_tensor = get_input_tensor(op, 1) + for i in range(1, 5): + input_tensor = get_input_tensor(op, i) + assert input_tensor.shape == gamma_tensor.shape + self.unused_tensor.add(input_tensor.name) + + gamma_value = get_input_tensor(op, 1).eval().astype(np.float32) + beta_value = get_input_tensor(op, 2).eval().astype(np.float32) + mean_value = get_input_tensor(op, 3).eval().astype(np.float32) + var_value = get_input_tensor(op, 4).eval().astype(np.float32) + epsilon_value = op.get_attr('epsilon') + + scale_value = ((1.0 / np.vectorize(math.sqrt) + (var_value + epsilon_value)) * gamma_value) + offset_value = (-mean_value * scale_value) + beta_value + idx = gamma_tensor.name.rfind('/') + name_prefix = gamma_tensor.name[:idx] + '/' + input_names = [name_prefix + 'scale:0', name_prefix + 'offset:0'] + self.add_tensor(input_names[0], gamma_value.shape, gamma_tensor.dtype, + scale_value) + self.add_tensor(input_names[1], gamma_value.shape, gamma_tensor.dtype, + offset_value) + + op_def.input.extend([op.inputs[0].name]) + if self.device == 'gpu': + for name in input_names: + output_name = self.add_buffer_to_image(name, "ARGUMENT") + op_def.input.extend([output_name]) + else: + op_def.input.extend([name for name in input_names]) + + self.resolved_ops[op.name] = 1 + final_op = op + + if len(self.tf_graph[op.name]) == 1 \ + and self.tf_graph[op.name][0].type in activation_name_map: + activation_op = self.tf_graph[op.name][0] + fused_act_arg = op_def.arg.add() + fused_act_arg.name = 'activation' + fused_act_arg.s = activation_name_map[activation_op.type] + if activation_op.type == 'Relu6': + max_limit_arg = 
op_def.arg.add() + max_limit_arg.name = 'max_limit' + max_limit_arg.f = 6 + final_op = activation_op + self.resolved_ops[activation_op.name] = 1 + + op_def.output.extend([final_op.outputs[0].name]) + self.add_output_shape([final_op.outputs[0]], op_def) + + self.net_def.op.extend([op_def]) + + def convert_batchnorm(self, op): + bn_ops = [] + bn_ops.append(op) + for i in range(1, 3): + if len(self.tf_graph[bn_ops[i-1].name]) == 1 and \ + self.tf_graph[bn_ops[i-1].name][0].type == BATCH_NORM_ORDER[i]: + bn_ops.append(self.tf_graph[bn_ops[i - 1].name][0]) + else: + raise Exception('Invalid BatchNorm Op') + if len(self.tf_graph[bn_ops[2].name]) == 2 and \ + self.tf_graph[bn_ops[2].name][0].type == \ + BATCH_NORM_ORDER[3] and \ + self.tf_graph[bn_ops[2].name][1].type == BATCH_NORM_ORDER[4]: + bn_ops.append(self.tf_graph[bn_ops[2].name][0]) + bn_ops.append(self.tf_graph[bn_ops[2].name][1]) + else: + raise Exception('Invalid BatchNorm Op') + bn_ops.append(self.tf_graph[bn_ops[4].name][0]) + bn_ops.append(self.tf_graph[bn_ops[3].name][0]) + + op_def = mace_pb2.OperatorDef() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + + input_name = get_input_tensor(bn_ops[3], 0).name + gamma = get_input_tensor(bn_ops[2], 1).name + beta = get_input_tensor(bn_ops[5], 0).name + mean = get_input_tensor(bn_ops[4], 0).name + variance = get_input_tensor(bn_ops[0], 0).name + + op_def.name = op.name[:-4] # remove /add + op_def.type = 'BatchNorm' + if self.device == 'gpu': + op_def.input.extend([input_name]) + for tensor_name in [gamma, beta, mean, variance]: + output_name = self.add_buffer_to_image(tensor_name, "ARGUMENT") + op_def.input.extend([output_name]) + else: + op_def.input.extend([input_name, gamma, beta, mean, variance]) + op_def.output.extend([output.name for output in bn_ops[6].outputs]) + self.add_output_shape(bn_ops[6].outputs, op_def) + epsilon_arg = op_def.arg.add() + epsilon_arg.name = 'epsilon' + epsilon_arg.f = get_input_tensor(op, 1).eval().astype(np.float) + data_format_arg = op_def.arg.add() + data_format_arg.name = 'data_format' + if self.device == 'neon': + data_format_arg.s = 'NCHW' + else: + data_format_arg.s = 'NHWC' + self.unused_tensor.add(get_input_tensor(op, 1).name) + + self.net_def.op.extend([op_def]) + for i in range(0, 7): + self.resolved_ops[bn_ops[i].name] = 1 + + def convert_pooling(self, op): + op_def = self.net_def.op.add() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + op_def.type = 'Pooling' + op_def.input.extend([input.name for input in op.inputs]) + op_def.output.extend([output.name for output in op.outputs]) + self.add_output_shape(op.outputs, op_def) + pooling_type_arg = op_def.arg.add() + pooling_type_arg.name = 'pooling_type' + pooling_type_arg.i = pooling_type_mode[op.type] + padding_arg = op_def.arg.add() + padding_arg.name = 'padding' + padding_arg.i = padding_mode[op.get_attr('padding')] + strides_arg = op_def.arg.add() + strides_arg.name = 'strides' + strides_arg.ints.extend(op.get_attr('strides')[1:3]) + kernels_arg = op_def.arg.add() + kernels_arg.name = 'kernels' + kernels_arg.ints.extend(op.get_attr('ksize')[1:3]) + data_format_arg = op_def.arg.add() + data_format_arg.name = 'data_format' + if self.device == 'neon': + data_format_arg.s = 'NCHW' + else: + data_format_arg.s = 'NHWC' + self.resolved_ops[op.name] = 1 + + def convert_global_avg_pooling(self, op): + op_def = self.net_def.op.add() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + op_def.type = 'Pooling' + 
op_def.input.extend([op.inputs[0].name]) + op_def.output.extend([output.name for output in op.outputs]) + self.add_output_shape(op.outputs, op_def) + pooling_type_arg = op_def.arg.add() + pooling_type_arg.name = 'pooling_type' + pooling_type_arg.i = pooling_type_mode['AvgPool'] + padding_arg = op_def.arg.add() + padding_arg.name = 'padding' + padding_arg.i = padding_mode['VALID'] + strides_arg = op_def.arg.add() + strides_arg.name = 'strides' + strides_arg.ints.extend([1, 1]) + kernels_arg = op_def.arg.add() + kernels_arg.name = 'kernels' + kernels_arg.ints.extend(op.inputs[0].shape.as_list()[1:3]) + data_format_arg = op_def.arg.add() + data_format_arg.name = 'data_format' + if self.device == 'neon': + data_format_arg.s = 'NCHW' + else: + data_format_arg.s = 'NHWC' + self.resolved_ops[op.name] = 1 + + def convert_activation(self, op): + op_def = self.net_def.op.add() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + op_def.type = 'Activation' + activation_arg = op_def.arg.add() + activation_arg.name = 'activation' + activation_arg.s = activation_name_map[op.type] + op_def.input.extend([input.name for input in op.inputs]) + op_def.output.extend([output.name for output in op.outputs]) + self.add_output_shape(op.outputs, op_def) + self.resolved_ops[op.name] = 1 + + def convert_relu6(self, op): + op_def = self.net_def.op.add() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + op_def.type = 'Activation' + op_def.input.extend([input.name for input in op.inputs]) + op_def.output.extend([output.name for output in op.outputs]) + self.add_output_shape(op.outputs, op_def) + activation_arg = op_def.arg.add() + activation_arg.name = 'activation' + activation_arg.s = "RELUX" max_limit_arg = op_def.arg.add() max_limit_arg.name = 'max_limit' max_limit_arg.f = 6 - final_op = activation_op - self.resolved_ops[activation_op.name] = 1 - - op_def.output.extend([final_op.outputs[0].name]) - self.add_output_shape([final_op.outputs[0]], op_def) - - self.net_def.op.extend([op_def]) - - def convert_batchnorm(self, op): - bn_ops = [] - bn_ops.append(op) - for i in range(1, 3): - if len(self.tf_graph[bn_ops[i-1].name]) == 1 \ - and self.tf_graph[bn_ops[i-1].name][0].type == BATCH_NORM_ORDER[i]: - bn_ops.append(self.tf_graph[bn_ops[i-1].name][0]) - else: - raise Exception('Invalid BatchNorm Op') - if len(self.tf_graph[bn_ops[2].name]) == 2 \ - and self.tf_graph[bn_ops[2].name][0].type == BATCH_NORM_ORDER[3] \ - and self.tf_graph[bn_ops[2].name][1].type == BATCH_NORM_ORDER[4]: - bn_ops.append(self.tf_graph[bn_ops[2].name][0]) - bn_ops.append(self.tf_graph[bn_ops[2].name][1]) - else: - raise Exception('Invalid BatchNorm Op') - bn_ops.append(self.tf_graph[bn_ops[4].name][0]) - bn_ops.append(self.tf_graph[bn_ops[3].name][0]) - - op_def = mace_pb2.OperatorDef() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - - input_name = get_input_tensor(bn_ops[3], 0).name - gamma = get_input_tensor(bn_ops[2], 1).name - beta = get_input_tensor(bn_ops[5], 0).name - mean = get_input_tensor(bn_ops[4], 0).name - variance = get_input_tensor(bn_ops[0], 0).name - - op_def.name = op.name[:-4] # remove /add - op_def.type = 'BatchNorm' - if self.device == 'gpu': - op_def.input.extend([input_name]) - for tensor_name in [gamma, beta, mean, variance]: - output_name = self.add_buffer_to_image(tensor_name, "ARGUMENT") - op_def.input.extend([output_name]) - else: - op_def.input.extend([input_name, gamma, beta, mean, variance]) - op_def.output.extend([output.name for 
output in bn_ops[6].outputs]) - self.add_output_shape(bn_ops[6].outputs, op_def) - epsilon_arg = op_def.arg.add() - epsilon_arg.name = 'epsilon' - epsilon_arg.f = get_input_tensor(op, 1).eval().astype(np.float) - data_format_arg = op_def.arg.add() - data_format_arg.name = 'data_format' - if self.device == 'neon': - data_format_arg.s = 'NCHW' - else: - data_format_arg.s = 'NHWC' - self.unused_tensor.add(get_input_tensor(op, 1).name) - - self.net_def.op.extend([op_def]) - for i in range(0, 7): - self.resolved_ops[bn_ops[i].name] = 1 - - def convert_pooling(self, op): - op_def = self.net_def.op.add() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - op_def.name = op.name - op_def.type = 'Pooling' - op_def.input.extend([input.name for input in op.inputs]) - op_def.output.extend([output.name for output in op.outputs]) - self.add_output_shape(op.outputs, op_def) - pooling_type_arg = op_def.arg.add() - pooling_type_arg.name = 'pooling_type' - pooling_type_arg.i = pooling_type_mode[op.type] - padding_arg = op_def.arg.add() - padding_arg.name = 'padding' - padding_arg.i = padding_mode[op.get_attr('padding')] - strides_arg = op_def.arg.add() - strides_arg.name = 'strides' - strides_arg.ints.extend(op.get_attr('strides')[1:3]) - kernels_arg = op_def.arg.add() - kernels_arg.name = 'kernels' - kernels_arg.ints.extend(op.get_attr('ksize')[1:3]) - data_format_arg = op_def.arg.add() - data_format_arg.name = 'data_format' - if self.device == 'neon': - data_format_arg.s = 'NCHW' - else: - data_format_arg.s = 'NHWC' - self.resolved_ops[op.name] = 1 - - def convert_global_avg_pooling(self, op): - op_def = self.net_def.op.add() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - op_def.name = op.name - op_def.type = 'Pooling' - op_def.input.extend([op.inputs[0].name]) - op_def.output.extend([output.name for output in op.outputs]) - self.add_output_shape(op.outputs, op_def) - pooling_type_arg = op_def.arg.add() - pooling_type_arg.name = 'pooling_type' - pooling_type_arg.i = pooling_type_mode['AvgPool'] - padding_arg = op_def.arg.add() - padding_arg.name = 'padding' - padding_arg.i = padding_mode['VALID'] - strides_arg = op_def.arg.add() - strides_arg.name = 'strides' - strides_arg.ints.extend([1, 1]) - kernels_arg = op_def.arg.add() - kernels_arg.name = 'kernels' - kernels_arg.ints.extend(op.inputs[0].shape.as_list()[1:3]) - data_format_arg = op_def.arg.add() - data_format_arg.name = 'data_format' - if self.device == 'neon': - data_format_arg.s = 'NCHW' - else: - data_format_arg.s = 'NHWC' - self.resolved_ops[op.name] = 1 - - def convert_activation(self, op): - op_def = self.net_def.op.add() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - op_def.name = op.name - op_def.type = 'Activation' - activation_arg = op_def.arg.add() - activation_arg.name = 'activation' - activation_arg.s = activation_name_map[op.type] - op_def.input.extend([input.name for input in op.inputs]) - op_def.output.extend([output.name for output in op.outputs]) - self.add_output_shape(op.outputs, op_def) - self.resolved_ops[op.name] = 1 - - def convert_relu6(self, op): - op_def = self.net_def.op.add() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - op_def.name = op.name - op_def.type = 'Activation' - op_def.input.extend([input.name for input in op.inputs]) - op_def.output.extend([output.name for output in op.outputs]) - self.add_output_shape(op.outputs, op_def) - activation_arg = op_def.arg.add() - activation_arg.name = 'activation' - activation_arg.s = "RELUX" - max_limit_arg = 
op_def.arg.add() - max_limit_arg.name = 'max_limit' - max_limit_arg.f = 6 - self.resolved_ops[op.name] = 1 - - def convert_add(self, op): - op_def = self.net_def.op.add() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - op_def.name = op.name - op_def.type = "AddN" - op_def.input.extend([input.name for input in op.inputs]) - op_def.output.extend([output.name for output in op.outputs]) - self.add_output_shape(op.outputs, op_def) - self.resolved_ops[op.name] = 1 - - def convert_concat(self, op): - op_def = self.net_def.op.add() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - op_def.name = op.name - op_def.type = "Concat" - op_def.input.extend([input.name for input in op.inputs[:-1]]) - op_def.output.extend([output.name for output in op.outputs]) - axis_arg = op_def.arg.add() - axis_arg.name = 'axis' - axis = get_input_tensor(op, len(op.inputs) - 1).eval().astype(np.int32) - if self.device == 'neon' and axis == 3: - axis = 1 - axis_arg.i = axis - self.add_output_shape(op.outputs, op_def) - self.resolved_ops[op.name] = 1 - self.unused_tensor.add(get_input_tensor(op, len(op.inputs) - 1).name) - - def convert_resize_bilinear(self, op): - op_def = self.net_def.op.add() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - op_def.name = op.name - op_def.type = "ResizeBilinear" - op_def.input.extend([op.inputs[0].name]) - op_def.output.extend([output.name for output in op.outputs]) - size_arg = op_def.arg.add() - size_arg.name = 'size' - size_arg.ints.extend(get_input_tensor(op, 1).eval().astype(np.int32).flat) - size_arg = op_def.arg.add() - size_arg.name = 'align_corners' - size_arg.i = op.get_attr('align_corners') - self.add_output_shape(op.outputs, op_def) - self.resolved_ops[op.name] = 1 - self.unused_tensor.add(get_input_tensor(op, 1).name) - - def convert_math(self, op, math_type): - op_def = self.net_def.op.add() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - op_def.name = op.name - - if len(op.inputs) == 1: - op_def.type = "CWise" - op_def.input.extend([input.name for input in op.inputs]) - x_arg = op_def.arg.add() - x_arg.name = 'x' - x_arg.f = 0 - elif len(op.inputs) >= 2: - input_tensor0 = get_input_tensor(op, 0) - input_tensor1 = get_input_tensor(op, 1) - if input_tensor0.shape == input_tensor1.shape: - op_def.type = "Eltwise" + self.resolved_ops[op.name] = 1 + + def convert_add(self, op): + op_def = self.net_def.op.add() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + op_def.type = "AddN" op_def.input.extend([input.name for input in op.inputs]) - else: - op_def.type = "CWise" - x_value = 0 - if len(input_tensor1.shape)==4: - op_def.input.extend([op.inputs[1].name]) - x_value = get_input_tensor(op, 0).eval().astype(np.float32) + op_def.output.extend([output.name for output in op.outputs]) + self.add_output_shape(op.outputs, op_def) + self.resolved_ops[op.name] = 1 + + def convert_concat(self, op): + op_def = self.net_def.op.add() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + op_def.type = "Concat" + op_def.input.extend([input.name for input in op.inputs[:-1]]) + op_def.output.extend([output.name for output in op.outputs]) + axis_arg = op_def.arg.add() + axis_arg.name = 'axis' + axis = get_input_tensor(op, len(op.inputs) - 1).eval().astype(np.int32) + if self.device == 'neon' and axis == 3: + axis = 1 + axis_arg.i = axis + self.add_output_shape(op.outputs, op_def) + self.resolved_ops[op.name] = 1 + self.unused_tensor.add(get_input_tensor(op, len(op.inputs) - 
1).name) + + def convert_resize_bilinear(self, op): + op_def = self.net_def.op.add() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + op_def.type = "ResizeBilinear" + op_def.input.extend([op.inputs[0].name]) + op_def.output.extend([output.name for output in op.outputs]) + size_arg = op_def.arg.add() + size_arg.name = 'size' + size_arg.ints.extend( + get_input_tensor(op, 1).eval().astype(np.int32).flat) + size_arg = op_def.arg.add() + size_arg.name = 'align_corners' + size_arg.i = op.get_attr('align_corners') + self.add_output_shape(op.outputs, op_def) + self.resolved_ops[op.name] = 1 + self.unused_tensor.add(get_input_tensor(op, 1).name) + + def convert_math(self, op, math_type): + op_def = self.net_def.op.add() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + + if len(op.inputs) == 1: + op_def.type = "CWise" + op_def.input.extend([input.name for input in op.inputs]) + x_arg = op_def.arg.add() + x_arg.name = 'x' + x_arg.f = 0 + elif len(op.inputs) >= 2: + input_tensor0 = get_input_tensor(op, 0) + input_tensor1 = get_input_tensor(op, 1) + if input_tensor0.shape == input_tensor1.shape: + op_def.type = "Eltwise" + op_def.input.extend([input.name for input in op.inputs]) + else: + op_def.type = "CWise" + x_value = 0 + if len(input_tensor1.shape) == 4: + op_def.input.extend([op.inputs[1].name]) + x_value = get_input_tensor(op, 0).eval().astype(np.float32) + else: + op_def.input.extend([op.inputs[0].name]) + x_value = get_input_tensor(op, 1).eval().astype(np.float32) + x_arg = op_def.arg.add() + x_arg.name = 'x' + x_arg.f = x_value + type_arg = op_def.arg.add() + type_arg.name = 'type' + type_arg.i = math_type_mode[math_type] + op_def.output.extend([output.name for output in op.outputs]) + self.add_output_shape(op.outputs, op_def) + self.resolved_ops[op.name] = 1 + + def convert_depth_to_space(self, op, d2s): + op_def = self.net_def.op.add() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + op_def.type = op.type + op_def.input.extend([op.inputs[0].name]) + op_def.output.extend([output.name for output in op.outputs]) + size_arg = op_def.arg.add() + size_arg.name = 'block_size' + size_arg.i = op.get_attr('block_size') + self.add_output_shape(op.outputs, op_def) + self.resolved_ops[op.name] = 1 + + def convert_bias_add(self, op): + op_def = mace_pb2.OperatorDef() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + op_def.type = "BiasAdd" + op_def.input.extend([op.inputs[0].name]) + if self.device == 'gpu': + output_name = self.add_buffer_to_image( + get_input_tensor(op, 1).name, "ARGUMENT") + op_def.input.extend([output_name]) else: - op_def.input.extend([op.inputs[0].name]) - x_value = get_input_tensor(op, 1).eval().astype(np.float32) - x_arg = op_def.arg.add() - x_arg.name = 'x' - x_arg.f = x_value - type_arg = op_def.arg.add() - type_arg.name = 'type' - type_arg.i = math_type_mode[math_type] - op_def.output.extend([output.name for output in op.outputs]) - self.add_output_shape(op.outputs, op_def) - self.resolved_ops[op.name] = 1 - - def convert_depth_to_space(self, op, d2s): - op_def = self.net_def.op.add() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - op_def.name = op.name - op_def.type = op.type - op_def.input.extend([op.inputs[0].name]) - op_def.output.extend([output.name for output in op.outputs]) - size_arg = op_def.arg.add() - size_arg.name = 'block_size' - size_arg.i = op.get_attr('block_size') - self.add_output_shape(op.outputs, 
op_def) - self.resolved_ops[op.name] = 1 - - def convert_bias_add(self, op): - op_def = mace_pb2.OperatorDef() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - op_def.name = op.name - op_def.type = "BiasAdd" - op_def.input.extend([op.inputs[0].name]) - if self.device == 'gpu': - output_name = self.add_buffer_to_image(get_input_tensor(op, 1).name, "ARGUMENT") - op_def.input.extend([output_name]) - else: - op_def.input.extend([get_input_tensor(op, 1).name]) - op_def.output.extend([output.name for output in op.outputs]) - self.add_output_shape(op.outputs, op_def) - self.net_def.op.extend([op_def]) - self.resolved_ops[op.name] = 1 - - def convert_space_to_batch(self, op, b2s): - op_def = self.net_def.op.add() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - op_def.name = op.name - op_def.type = op.type - op_def.input.extend([op.inputs[0].name]) - op_def.output.extend([output.name for output in op.outputs]) - size_arg = op_def.arg.add() - size_arg.name = 'block_shape' - size_arg.ints.extend(get_input_tensor(op, 1).eval().astype(np.int32).flat) - size_arg = op_def.arg.add() - if b2s: - size_arg.name = 'crops' - else: - size_arg.name = 'paddings' - size_arg.ints.extend(get_input_tensor(op, 2).eval().astype(np.int32).flat) - self.add_output_shape(op.outputs, op_def) - self.resolved_ops[op.name] = 1 - self.unused_tensor.add(get_input_tensor(op, 1).name) - self.unused_tensor.add(get_input_tensor(op, 2).name) - - def is_atrous_conv2d(self, op): - return op.type == 'SpaceToBatchND' and\ - len(self.tf_graph[op.name]) == 1 and self.tf_graph[op.name][0].type == 'Conv2D' - - def convert_atrous_conv2d(self, op): - op_def = mace_pb2.OperatorDef() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - conv_op = self.tf_graph[op.name][0] - op_def.name = conv_op.name - op_def.type = conv_op.type - self.transpose_filter_tensor[get_input_tensor(conv_op, 1).name] = (0, 1, 3, 2) - if self.device == 'gpu': - op_def.input.extend([op.inputs[0].name]) - output_name = self.add_buffer_to_image(get_input_tensor(conv_op, 1).name, "CONV2D_FILTER") - op_def.input.extend([output_name]) - else: - op_def.input.extend([get_input_tensor(op, 0).name]) - op_def.input.extend([get_input_tensor(conv_op, 1).name]) - - dilation_arg = op_def.arg.add() - dilation_arg.name = 'dilations' - dilation_arg.ints.extend(get_input_tensor(op, 1).eval().astype(np.int32).flat) - padding_arg = op_def.arg.add() - padding_arg.name = 'padding' - padding_values = get_input_tensor(op, 2).eval().astype(np.int32).flat - if len(padding_values) > 0 and padding_values[0] > 0: - padding_arg.i = padding_mode['SAME'] - else: - padding_arg.i = padding_mode['VALID'] - self.unused_tensor.add(get_input_tensor(op, 1).name) - self.unused_tensor.add(get_input_tensor(op, 2).name) - - strides_arg = op_def.arg.add() - strides_arg.name = 'strides' - strides_arg.ints.extend([1, 1]) - data_format_arg = op_def.arg.add() - data_format_arg.name = 'data_format' - if self.device == 'neon': - data_format_arg.s = 'NCHW' - else: - data_format_arg.s = 'NHWC' - final_op = conv_op - self.resolved_ops[op.name] = 1 - self.resolved_ops[conv_op.name] = 1 - - if len(self.tf_graph[final_op.name]) == 1 and self.tf_graph[final_op.name][0].type == 'BiasAdd' : - bias_add_op = self.tf_graph[final_op.name][0] - if self.device == 'gpu': - output_name = self.add_buffer_to_image(get_input_tensor(bias_add_op, 1).name, "ARGUMENT") - op_def.input.extend([output_name]) - else: - op_def.input.extend([get_input_tensor(bias_add_op, 1).name]) - final_op = bias_add_op - 
self.resolved_ops[bias_add_op.name] = 1 - - if len(self.tf_graph[final_op.name]) == 1 \ - and self.tf_graph[final_op.name][0].type == 'BatchToSpaceND': - final_op = self.tf_graph[final_op.name][0] - self.resolved_ops[final_op.name] = 1 - self.unused_tensor.add(get_input_tensor(final_op, 1).name) - self.unused_tensor.add(get_input_tensor(final_op, 2).name) - else: - raise Exception('Convert atrous conv error: no BatchToSpaceND op') - - if len(self.tf_graph[final_op.name]) == 1 \ - and self.tf_graph[final_op.name][0].type == 'Relu': - relu_op = self.tf_graph[final_op.name][0] - op_def.type = "FusedConv2D" - fused_relu_arg = op_def.arg.add() - fused_relu_arg.name = 'activation' - fused_relu_arg.s = "RELU" - final_op = relu_op - self.resolved_ops[relu_op.name] = 1 - - op_def.output.extend([output.name for output in final_op.outputs]) - self.add_output_shape(final_op.outputs, op_def) - self.net_def.op.extend([op_def]) - - def is_softmax(self, op): - return op.type == 'Softmax' and \ - len(self.tf_parents[op.name]) == 1 and self.tf_parents[op.name][0].type == 'Reshape' and \ - len(self.tf_graph[op.name]) == 1 and self.tf_graph[op.name][0].type == 'Reshape' - - def convert_softmax(self, softmax_op): - op_def = self.net_def.op.add() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - - # deal with first Reshape op - parent_reshape_op = self.tf_parents[softmax_op.name][0] - self.unused_tensor.add(get_input_tensor(parent_reshape_op, 1).name) - self.resolved_ops[parent_reshape_op.name] = 1 - - # FIXME: hardcode for inception_v3 - # remove squeeze if exist - squeeze_op = self.tf_parents[parent_reshape_op.name][0] - if squeeze_op.type == 'Squeeze': - op_def.input.extend([squeeze_op.inputs[0].name]) - self.resolved_ops[squeeze_op.name] = 1 - # remove shape if exist - children_ops = self.tf_graph[squeeze_op.name] - print children_ops - if len(children_ops) > 1 and children_ops[0].type == 'Shape': - self.unused_tensor.add(get_input_tensor(children_ops[1], 0).name) - self.resolved_ops[children_ops[1].name] = 1 - else: - op_def.input.extend([parent_reshape_op.inputs[0].name]) - - # deal with Softmax op - op_def.name = softmax_op.name - op_def.type = softmax_op.type - self.resolved_ops[softmax_op.name] = 1 - - # deal with last Reshape op - reshape_op = self.tf_graph[softmax_op.name][0] - self.unused_tensor.add(get_input_tensor(reshape_op, 1).name) - - shape = [dim.value for dim in reshape_op.outputs[0].shape] - if len(shape) == 2: - shape = [1, 1, shape[0], shape[1]] - op_def.output.extend([output.name for output in reshape_op.outputs]) - self.add_output_shape([shape], op_def) - self.resolved_ops[reshape_op.name] = 1 - - def convert_normal_op(self, op): - op_def = self.net_def.op.add() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - op_def.name = op.name - op_def.type = op.type - op_def.input.extend([input.name for input in op.inputs]) - op_def.output.extend([output.name for output in op.outputs]) - self.add_output_shape(op.outputs, op_def) - self.resolved_ops[op.name] = 1 - - def replace_in_out_name(self, input_names, output_names): - in_names = set([input_name + ":0" for input_name in input_names]) - out_names = set([output_name + ":0" for output_name in output_names]) - for op in self.net_def.op: - if op.input[0] in in_names: - op.input[0] = MACE_INPUT_NODE_NAME + '_' + op.input[0] - if op.output[0] in out_names: - op.output[0] = MACE_OUTPUT_NODE_NAME + '_' + op.output[0] - - def convert(self, input_nodes, output_nodes): - if self.device == 'gpu': - 
self.add_gpu_input_transform(input_nodes) - if self.device == 'neon': - self.add_neon_input_transform(input_nodes) - - for op in self.tf_ops: - if self.resolved_ops[op.name] == 1: - continue - if op.type in ['Placeholder', 'Identity']: + op_def.input.extend([get_input_tensor(op, 1).name]) + op_def.output.extend([output.name for output in op.outputs]) + self.add_output_shape(op.outputs, op_def) + self.net_def.op.extend([op_def]) self.resolved_ops[op.name] = 1 - pass - elif op.type == 'Const': - pass - elif op.type == 'Reshape': - self.convert_reshape(op) - elif self.is_atrous_conv2d(op): - self.convert_atrous_conv2d(op) - elif op.type == 'Conv2D' or op.type == 'DepthwiseConv2dNative': - if self.check_winograd_conv(op): - self.convert_winograd_conv(op) + + def convert_space_to_batch(self, op, b2s): + op_def = self.net_def.op.add() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + op_def.type = op.type + op_def.input.extend([op.inputs[0].name]) + op_def.output.extend([output.name for output in op.outputs]) + size_arg = op_def.arg.add() + size_arg.name = 'block_shape' + size_arg.ints.extend( + get_input_tensor(op, 1).eval().astype(np.int32).flat) + size_arg = op_def.arg.add() + if b2s: + size_arg.name = 'crops' else: - self.convert_conv2d(op) - elif op.type == 'FusedBatchNorm': - self.convert_fused_batchnorm(op) - elif op.type == 'Add' and op.name.endswith('batchnorm/add'): - self.convert_batchnorm(op) - elif op.type == 'AvgPool' or op.type == 'MaxPool': - self.convert_pooling(op) - elif op.type == 'Relu6': - self.convert_relu6(op) - elif op.type == 'Add': - self.convert_add(op) - elif op.type == 'ConcatV2': - self.convert_concat(op) - elif op.type == 'ResizeBilinear': - self.convert_resize_bilinear(op) - elif op.type == 'BiasAdd': - self.convert_bias_add(op) - elif op.type == 'SpaceToBatchND': - self.convert_space_to_batch(op, False) - elif op.type == 'BatchToSpaceND': - self.convert_space_to_batch(op, True) - elif op.type == 'DepthToSpace': - self.convert_depth_to_space(op, True) - elif op.type == 'SpaceToDepth': - self.convert_depth_to_space(op, False) - elif op.type in ['Neg', 'neg', 'Negative', 'negative']: - self.convert_math(op, 'NEG') - elif op.type == 'Mul': - self.convert_math(op, 'MUL') - elif op.type == 'Sub': - self.convert_math(op, 'SUB') - elif self.is_softmax(op): - self.convert_softmax(op) - elif op.type in ['Relu', 'Sigmoid', 'Tanh']: - self.convert_activation(op) - # FIXME: hardcode for inception_v3 - elif op.type in ['Squeeze', 'Shape']: + size_arg.name = 'paddings' + size_arg.ints.extend( + get_input_tensor(op, 2).eval().astype(np.int32).flat) + self.add_output_shape(op.outputs, op_def) self.resolved_ops[op.name] = 1 - elif op.type == 'Mean': - # Global avg pooling - reduce_dims = op.inputs[1].eval() - if reduce_dims[0] == 1 and reduce_dims[1] == 2: - self.convert_global_avg_pooling(op) - self.unused_tensor.add(op.inputs[1].name) + self.unused_tensor.add(get_input_tensor(op, 1).name) + self.unused_tensor.add(get_input_tensor(op, 2).name) + + def is_atrous_conv2d(self, op): + return op.type == 'SpaceToBatchND' and \ + len(self.tf_graph[op.name]) == 1 and \ + self.tf_graph[op.name][0].type == 'Conv2D' + + def convert_atrous_conv2d(self, op): + op_def = mace_pb2.OperatorDef() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + conv_op = self.tf_graph[op.name][0] + op_def.name = conv_op.name + op_def.type = conv_op.type + self.transpose_filter_tensor[get_input_tensor(conv_op, + 1).name] = (0, 1, 3, 2) + if self.device == 'gpu': + 
op_def.input.extend([op.inputs[0].name]) + output_name = self.add_buffer_to_image( + get_input_tensor(conv_op, 1).name, "CONV2D_FILTER") + op_def.input.extend([output_name]) + else: + op_def.input.extend([get_input_tensor(op, 0).name]) + op_def.input.extend([get_input_tensor(conv_op, 1).name]) + + dilation_arg = op_def.arg.add() + dilation_arg.name = 'dilations' + dilation_arg.ints.extend( + get_input_tensor(op, 1).eval().astype(np.int32).flat) + padding_arg = op_def.arg.add() + padding_arg.name = 'padding' + padding_values = get_input_tensor(op, 2).eval().astype(np.int32).flat + if len(padding_values) > 0 and padding_values[0] > 0: + padding_arg.i = padding_mode['SAME'] else: - raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type)) - #elif op.type in ['']: - # self.convert_normal_op(op) - else: - raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type)) + padding_arg.i = padding_mode['VALID'] + self.unused_tensor.add(get_input_tensor(op, 1).name) + self.unused_tensor.add(get_input_tensor(op, 2).name) + + strides_arg = op_def.arg.add() + strides_arg.name = 'strides' + strides_arg.ints.extend([1, 1]) + data_format_arg = op_def.arg.add() + data_format_arg.name = 'data_format' + if self.device == 'neon': + data_format_arg.s = 'NCHW' + else: + data_format_arg.s = 'NHWC' + final_op = conv_op + self.resolved_ops[op.name] = 1 + self.resolved_ops[conv_op.name] = 1 + + if len(self.tf_graph[final_op.name] + ) == 1 and self.tf_graph[final_op.name][0].type == 'BiasAdd': + bias_add_op = self.tf_graph[final_op.name][0] + if self.device == 'gpu': + output_name = self.add_buffer_to_image( + get_input_tensor(bias_add_op, 1).name, "ARGUMENT") + op_def.input.extend([output_name]) + else: + op_def.input.extend([get_input_tensor(bias_add_op, 1).name]) + final_op = bias_add_op + self.resolved_ops[bias_add_op.name] = 1 + + if len(self.tf_graph[final_op.name]) == 1 and \ + self.tf_graph[final_op.name][0].type == 'BatchToSpaceND': + final_op = self.tf_graph[final_op.name][0] + self.resolved_ops[final_op.name] = 1 + self.unused_tensor.add(get_input_tensor(final_op, 1).name) + self.unused_tensor.add(get_input_tensor(final_op, 2).name) + else: + raise Exception('Convert atrous conv error: no BatchToSpaceND op') + + if len(self.tf_graph[final_op.name]) == 1 and \ + self.tf_graph[final_op.name][0].type == 'Relu': + relu_op = self.tf_graph[final_op.name][0] + op_def.type = "FusedConv2D" + fused_relu_arg = op_def.arg.add() + fused_relu_arg.name = 'activation' + fused_relu_arg.s = "RELU" + final_op = relu_op + self.resolved_ops[relu_op.name] = 1 + + op_def.output.extend([output.name for output in final_op.outputs]) + self.add_output_shape(final_op.outputs, op_def) + self.net_def.op.extend([op_def]) + + def is_softmax(self, op): + return op.type == 'Softmax' and \ + len(self.tf_parents[op.name]) == 1 and \ + self.tf_parents[op.name][0].type == 'Reshape' and \ + len(self.tf_graph[op.name]) == 1 and \ + self.tf_graph[op.name][0].type == 'Reshape' + + def convert_softmax(self, softmax_op): + op_def = self.net_def.op.add() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + + # deal with first Reshape op + parent_reshape_op = self.tf_parents[softmax_op.name][0] + self.unused_tensor.add(get_input_tensor(parent_reshape_op, 1).name) + self.resolved_ops[parent_reshape_op.name] = 1 + + # FIXME: hardcode for inception_v3 + # remove squeeze if exist + squeeze_op = self.tf_parents[parent_reshape_op.name][0] + if squeeze_op.type == 'Squeeze': + op_def.input.extend([squeeze_op.inputs[0].name]) + 
self.resolved_ops[squeeze_op.name] = 1 + # remove shape if exist + children_ops = self.tf_graph[squeeze_op.name] + print children_ops + if len(children_ops) > 1 and children_ops[0].type == 'Shape': + self.unused_tensor.add( + get_input_tensor(children_ops[1], 0).name) + self.resolved_ops[children_ops[1].name] = 1 + else: + op_def.input.extend([parent_reshape_op.inputs[0].name]) + + # deal with Softmax op + op_def.name = softmax_op.name + op_def.type = softmax_op.type + self.resolved_ops[softmax_op.name] = 1 + + # deal with last Reshape op + reshape_op = self.tf_graph[softmax_op.name][0] + self.unused_tensor.add(get_input_tensor(reshape_op, 1).name) + + shape = [dim.value for dim in reshape_op.outputs[0].shape] + if len(shape) == 2: + shape = [1, 1, shape[0], shape[1]] + op_def.output.extend([output.name for output in reshape_op.outputs]) + self.add_output_shape([shape], op_def) + self.resolved_ops[reshape_op.name] = 1 + + def convert_normal_op(self, op): + op_def = self.net_def.op.add() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + op_def.type = op.type + op_def.input.extend([input.name for input in op.inputs]) + op_def.output.extend([output.name for output in op.outputs]) + self.add_output_shape(op.outputs, op_def) + self.resolved_ops[op.name] = 1 - for op in self.tf_ops: - if self.resolved_ops[op.name] == 1: - continue - elif op.type == 'Const': - self.convert_tensor(op) - else: - raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type)) + def replace_in_out_name(self, input_names, output_names): + in_names = set([input_name + ":0" for input_name in input_names]) + out_names = set([output_name + ":0" for output_name in output_names]) + for op in self.net_def.op: + if op.input[0] in in_names: + op.input[0] = MACE_INPUT_NODE_NAME + '_' + op.input[0] + if op.output[0] in out_names: + op.output[0] = MACE_OUTPUT_NODE_NAME + '_' + op.output[0] - if self.device == 'gpu': - self.add_gpu_output_transform(output_nodes) + def convert(self, input_nodes, output_nodes): + if self.device == 'gpu': + self.add_gpu_input_transform(input_nodes) + if self.device == 'neon': + self.add_neon_input_transform(input_nodes) + + for op in self.tf_ops: + if self.resolved_ops[op.name] == 1: + continue + if op.type in ['Placeholder', 'Identity']: + self.resolved_ops[op.name] = 1 + pass + elif op.type == 'Const': + pass + elif op.type == 'Reshape': + self.convert_reshape(op) + elif self.is_atrous_conv2d(op): + self.convert_atrous_conv2d(op) + elif op.type == 'Conv2D' or op.type == 'DepthwiseConv2dNative': + if self.check_winograd_conv(op): + self.convert_winograd_conv(op) + else: + self.convert_conv2d(op) + elif op.type == 'FusedBatchNorm': + self.convert_fused_batchnorm(op) + elif op.type == 'Add' and op.name.endswith('batchnorm/add'): + self.convert_batchnorm(op) + elif op.type == 'AvgPool' or op.type == 'MaxPool': + self.convert_pooling(op) + elif op.type == 'Relu6': + self.convert_relu6(op) + elif op.type == 'Add': + self.convert_add(op) + elif op.type == 'ConcatV2': + self.convert_concat(op) + elif op.type == 'ResizeBilinear': + self.convert_resize_bilinear(op) + elif op.type == 'BiasAdd': + self.convert_bias_add(op) + elif op.type == 'SpaceToBatchND': + self.convert_space_to_batch(op, False) + elif op.type == 'BatchToSpaceND': + self.convert_space_to_batch(op, True) + elif op.type == 'DepthToSpace': + self.convert_depth_to_space(op, True) + elif op.type == 'SpaceToDepth': + self.convert_depth_to_space(op, False) + elif op.type in ['Neg', 'neg', 'Negative', 
'negative']: + self.convert_math(op, 'NEG') + elif op.type == 'Mul': + self.convert_math(op, 'MUL') + elif op.type == 'Sub': + self.convert_math(op, 'SUB') + elif self.is_softmax(op): + self.convert_softmax(op) + elif op.type in ['Relu', 'Sigmoid', 'Tanh']: + self.convert_activation(op) + # FIXME: hardcode for inception_v3 + elif op.type in ['Squeeze', 'Shape']: + self.resolved_ops[op.name] = 1 + elif op.type == 'Mean': + # Global avg pooling + reduce_dims = op.inputs[1].eval() + if reduce_dims[0] == 1 and reduce_dims[1] == 2: + self.convert_global_avg_pooling(op) + self.unused_tensor.add(op.inputs[1].name) + else: + raise Exception('Unknown Op: %s, type: %s' % (op.name, + op.type)) + # elif op.type in ['']: + # self.convert_normal_op(op) + else: + raise Exception('Unknown Op: %s, type: %s' % (op.name, + op.type)) + + for op in self.tf_ops: + if self.resolved_ops[op.name] == 1: + continue + elif op.type == 'Const': + self.convert_tensor(op) + else: + raise Exception('Unknown Op: %s, type: %s' % (op.name, + op.type)) + + if self.device == 'gpu': + self.add_gpu_output_transform(output_nodes) - if self.device == 'neon': - self.add_neon_output_transform(output_nodes) + if self.device == 'neon': + self.add_neon_output_transform(output_nodes) - if self.device == 'cpu': - self.replace_in_out_name(input_nodes, output_nodes) + if self.device == 'cpu': + self.replace_in_out_name(input_nodes, output_nodes) - for key in self.resolved_ops: - if self.resolved_ops[key] != 1: - print 'Unresolve Op: %s' % key + for key in self.resolved_ops: + if self.resolved_ops[key] != 1: + print 'Unresolve Op: %s' % key -class Optimizer: - def __init__(self, net_def, device): - self.net_def = net_def - self.device = device - self.mace_graph = {} - self.tensor_map = {} - for op in net_def.op: - for input_name in op.input: - if input_name not in self.mace_graph: - self.mace_graph[input_name] = [] - self.mace_graph[input_name].append(op) - - for tensor in net_def.tensors: - self.tensor_map[tensor.name] = tensor - - def get_buffer_tensor_name(self, name): - if self.device == 'gpu': - return name[:-6] + name[-2:] - else: - return name - - def fold_batch_norm(self): - unused_tensors = set() - new_tensors = [] - new_net = mace_pb2.NetDef() - resolved_ops = set() - - for op in self.net_def.op: - if op.name in resolved_ops: - pass - elif op.type == 'DepthwiseConv2d' and len(op.output) == 1 \ - and self.mace_graph[op.output[0]][0].type == 'FoldedBatchNorm': - depthwise_conv2d_op = op - folded_bn_op = self.mace_graph[op.output[0]][0] - weight_buffer_name = self.get_buffer_tensor_name(depthwise_conv2d_op.input[1]) - weight_tensor = self.tensor_map[weight_buffer_name] - scale_buffer_name = self.get_buffer_tensor_name(folded_bn_op.input[1]) - offset_buffer_name = self.get_buffer_tensor_name(folded_bn_op.input[2]) - scale_tensor = self.tensor_map[scale_buffer_name] - weight_shape = weight_tensor.dims - idx = 0 - if self.device == 'neon': # OIHW - for oc in range(weight_shape[0]): - for ic in range(weight_shape[1]): - for i in range(weight_shape[2]): - for j in range(weight_shape[3]): - weight_tensor.float_data[idx] *= scale_tensor.float_data[ic * weight_shape[0] + oc] - idx += 1 - else: # HWIO - for i in range(weight_shape[0]): - for j in range(weight_shape[1]): - for ic in range(weight_shape[2]): - for oc in range(weight_shape[3]): - weight_tensor.float_data[idx] *= scale_tensor.float_data[ic * weight_shape[3] + oc] - idx += 1 - - new_tensors.append(weight_tensor) - unused_tensors.add(weight_tensor.name) - 
unused_tensors.add(scale_tensor.name) +class Optimizer: + def __init__(self, net_def, device): + self.net_def = net_def + self.device = device + self.mace_graph = {} + self.tensor_map = {} + for op in net_def.op: + for input_name in op.input: + if input_name not in self.mace_graph: + self.mace_graph[input_name] = [] + self.mace_graph[input_name].append(op) + + for tensor in net_def.tensors: + self.tensor_map[tensor.name] = tensor + + def get_buffer_tensor_name(self, name): if self.device == 'gpu': - scale_b2i_op = self.mace_graph[scale_buffer_name][0] - offset_b2i_op = self.mace_graph[offset_buffer_name][0] - resolved_ops.add(scale_b2i_op.name) - resolved_ops.add(offset_b2i_op.name) - new_net.op.extend([offset_b2i_op]) - - resolved_ops.add(depthwise_conv2d_op.name) - resolved_ops.add(folded_bn_op.name) - - offset_tensor_name = folded_bn_op.input[2] - depthwise_conv2d_op.input.extend([offset_tensor_name]) - - for arg in folded_bn_op.arg: - if arg.name == 'activation': - act_arg = depthwise_conv2d_op.arg.add() - act_arg.name = arg.name - act_arg.s = arg.s - elif arg.name == 'max_limit': - act_arg = depthwise_conv2d_op.arg.add() - act_arg.name = arg.name - act_arg.f = arg.f - - depthwise_conv2d_op.output[0] = folded_bn_op.output[0] - new_net.op.extend([depthwise_conv2d_op]) - else: - new_net.op.extend([op]) - - for tensor in self.net_def.tensors: - if tensor.name in unused_tensors: - pass - else: - new_net.tensors.extend([tensor]) - - for tensor in new_tensors: - new_net.tensors.extend([tensor]) - - return new_net - - def optimize(self): - new_net = self.fold_batch_norm() - return new_net + return name[:-6] + name[-2:] + else: + return name + + def fold_batch_norm(self): + unused_tensors = set() + new_tensors = [] + new_net = mace_pb2.NetDef() + resolved_ops = set() + + for op in self.net_def.op: + if op.name in resolved_ops: + pass + elif op.type == 'DepthwiseConv2d' and len(op.output) == 1 and \ + self.mace_graph[op.output[0]][0].type == 'FoldedBatchNorm': + depthwise_conv2d_op = op + folded_bn_op = self.mace_graph[op.output[0]][0] + weight_buffer_name = self.get_buffer_tensor_name( + depthwise_conv2d_op.input[1]) + weight_tensor = self.tensor_map[weight_buffer_name] + scale_buffer_name = self.get_buffer_tensor_name( + folded_bn_op.input[1]) + offset_buffer_name = self.get_buffer_tensor_name( + folded_bn_op.input[2]) + scale_tensor = self.tensor_map[scale_buffer_name] + weight_shape = weight_tensor.dims + idx = 0 + if self.device == 'neon': # OIHW + for oc in range(weight_shape[0]): + for ic in range(weight_shape[1]): + for i in range(weight_shape[2]): + for j in range(weight_shape[3]): + weight_tensor.float_data[ + idx] *= scale_tensor.float_data[ + ic * weight_shape[0] + oc] + idx += 1 + else: # HWIO + for i in range(weight_shape[0]): + for j in range(weight_shape[1]): + for ic in range(weight_shape[2]): + for oc in range(weight_shape[3]): + weight_tensor.float_data[ + idx] *= scale_tensor.float_data[ + ic * weight_shape[3] + oc] + idx += 1 + + new_tensors.append(weight_tensor) + unused_tensors.add(weight_tensor.name) + unused_tensors.add(scale_tensor.name) + + if self.device == 'gpu': + scale_b2i_op = self.mace_graph[scale_buffer_name][0] + offset_b2i_op = self.mace_graph[offset_buffer_name][0] + resolved_ops.add(scale_b2i_op.name) + resolved_ops.add(offset_b2i_op.name) + new_net.op.extend([offset_b2i_op]) + + resolved_ops.add(depthwise_conv2d_op.name) + resolved_ops.add(folded_bn_op.name) + + offset_tensor_name = folded_bn_op.input[2] + 
depthwise_conv2d_op.input.extend([offset_tensor_name]) + + for arg in folded_bn_op.arg: + if arg.name == 'activation': + act_arg = depthwise_conv2d_op.arg.add() + act_arg.name = arg.name + act_arg.s = arg.s + elif arg.name == 'max_limit': + act_arg = depthwise_conv2d_op.arg.add() + act_arg.name = arg.name + act_arg.f = arg.f + + depthwise_conv2d_op.output[0] = folded_bn_op.output[0] + new_net.op.extend([depthwise_conv2d_op]) + else: + new_net.op.extend([op]) + + for tensor in self.net_def.tensors: + if tensor.name in unused_tensors: + pass + else: + new_net.tensors.extend([tensor]) + + for tensor in new_tensors: + new_net.tensors.extend([tensor]) + + return new_net + + def optimize(self): + new_net = self.fold_batch_norm() + return new_net + def add_shape_info(input_graph_def, input_nodes, input_shapes): - inputs_replaced_graph = graph_pb2.GraphDef() - for node in input_graph_def.node: - if node.name in input_nodes: - idx = input_nodes.index(node.name) - input_shape = input_shapes[idx] - placeholder_node = copy.deepcopy(node) - placeholder_node.attr.clear() - placeholder_node.attr['shape'].shape.dim.extend([ - tensor_shape_pb2.TensorShapeProto.Dim(size=i) for i in input_shape - ]) - placeholder_node.attr['dtype'].CopyFrom(node.attr['dtype']) - inputs_replaced_graph.node.extend([placeholder_node]) - else: - inputs_replaced_graph.node.extend([copy.deepcopy(node)]) - return inputs_replaced_graph - - -def convert_to_mace_pb(model_file, input_node, input_shape, output_node, data_type, device, winograd): - net_def = mace_pb2.NetDef() - dt = data_type_map[data_type] - - input_graph_def = tf.GraphDef() - with gfile.Open(model_file, "rb") as f: - data = f.read() - input_graph_def.ParseFromString(data) - - input_nodes = [x for x in input_node.split(',')] - input_shapes = [] - if input_shape != "": - input_shape_strs = [x for x in input_shape.split(':')] - for shape_str in input_shape_strs: - input_shapes.extend([[int(x) for x in shape_str.split(',')]]) - output_nodes = [x for x in output_node.split(',')] - assert len(input_nodes) == len(input_shapes) - - input_graph_def = add_shape_info(input_graph_def, input_nodes, input_shapes) - with tf.Session() as session: - with session.graph.as_default() as graph: - tf.import_graph_def(input_graph_def, name="") - ops = graph.get_operations() - converter = TFConverter(ops, net_def, dt, device, winograd) - converter.convert(input_nodes, output_nodes) - optimizer = Optimizer(net_def, device) - net_def = optimizer.optimize() - print "Model Converted." - if device == 'gpu': - print "start optimize memory." - mem_optimizer = memory_optimizer.MemoryOptimizer(net_def) - mem_optimizer.optimize() - print "Memory optimization done." 
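Note on the fold_batch_norm pass above: the flat float_data indexing is easier to follow as an array operation. The snippet below is only an illustrative sketch, not part of this patch; it uses NumPy with made-up shapes (a 3x3 filter, 4 input channels, multiplier 2) and assumes the HWIO layout commented in the converter, where the last weight dimension acts as the depthwise multiplier.

import numpy as np

# Hypothetical HWIO depthwise filter and a folded-BN scale of length IC * M.
weight = np.random.rand(3, 3, 4, 2).astype(np.float32)
scale = np.random.rand(4 * 2).astype(np.float32)

# Same effect as the converter's loop: weight[h, w, ic, m] *= scale[ic * M + m]
folded = weight * scale.reshape(4, 2)

# Cross-check against the explicit flat-index loop used in fold_batch_norm.
expected = weight.copy()
for h in range(3):
    for w in range(3):
        for ic in range(4):
            for m in range(2):
                expected[h, w, ic, m] *= scale[ic * 2 + m]
assert np.allclose(folded, expected)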
- - return net_def + inputs_replaced_graph = graph_pb2.GraphDef() + for node in input_graph_def.node: + if node.name in input_nodes: + idx = input_nodes.index(node.name) + input_shape = input_shapes[idx] + placeholder_node = copy.deepcopy(node) + placeholder_node.attr.clear() + placeholder_node.attr['shape'].shape.dim.extend([ + tensor_shape_pb2.TensorShapeProto.Dim(size=i) + for i in input_shape + ]) + placeholder_node.attr['dtype'].CopyFrom(node.attr['dtype']) + inputs_replaced_graph.node.extend([placeholder_node]) + else: + inputs_replaced_graph.node.extend([copy.deepcopy(node)]) + return inputs_replaced_graph + + +def convert_to_mace_pb(model_file, input_node, input_shape, output_node, + data_type, device, winograd): + net_def = mace_pb2.NetDef() + dt = data_type_map[data_type] + + input_graph_def = tf.GraphDef() + with gfile.Open(model_file, "rb") as f: + data = f.read() + input_graph_def.ParseFromString(data) + + input_nodes = [x for x in input_node.split(',')] + input_shapes = [] + if input_shape != "": + input_shape_strs = [x for x in input_shape.split(':')] + for shape_str in input_shape_strs: + input_shapes.extend([[int(x) for x in shape_str.split(',')]]) + output_nodes = [x for x in output_node.split(',')] + assert len(input_nodes) == len(input_shapes) + + input_graph_def = add_shape_info(input_graph_def, input_nodes, + input_shapes) + with tf.Session() as session: + with session.graph.as_default() as graph: + tf.import_graph_def(input_graph_def, name="") + ops = graph.get_operations() + converter = TFConverter(ops, net_def, dt, device, winograd) + converter.convert(input_nodes, output_nodes) + optimizer = Optimizer(net_def, device) + net_def = optimizer.optimize() + print "Model Converted." + if device == 'gpu': + print "start optimize memory." + mem_optimizer = memory_optimizer.MemoryOptimizer(net_def) + mem_optimizer.optimize() + print "Memory optimization done." 
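For orientation only, a call into this entry point might look like the sketch below. The import path, model path, node names, input shape, and the 'DT_FLOAT' key into data_type_map are assumptions for illustration; they are not values taken from this patch.

from mace.python.tools import tf_converter_lib

# Hypothetical arguments; adjust to the real model and data_type_map keys.
net_def = tf_converter_lib.convert_to_mace_pb(
    model_file='model.pb',
    input_node='input',
    input_shape='1,224,224,3',
    output_node='output',
    data_type='DT_FLOAT',
    device='gpu',
    winograd=False)
print 'converted %d ops' % len(net_def.op)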
+ + return net_def diff --git a/mace/python/tools/tf_dsp_converter_lib.py b/mace/python/tools/tf_dsp_converter_lib.py index 7c2da02a..62269cf0 100644 --- a/mace/python/tools/tf_dsp_converter_lib.py +++ b/mace/python/tools/tf_dsp_converter_lib.py @@ -6,452 +6,521 @@ from dsp_ops import DspOps from mace.python.tools import graph_util from mace.python.tools.convert_util import tf_dtype_2_mace_dtype -# converter --input ../libcv/quantized_model.pb --output quantized_model_dsp.pb \ -# --runtime dsp --input_node input_node --output_node output_node +# converter --input ../libcv/quantized_model.pb \ +# --output quantized_model_dsp.pb \ +# --runtime dsp --input_node input_node \ +# --output_node output_node padding_mode = { - 'NA': 0, - 'SAME': 1, - 'VALID': 2, - 'MIRROR_REFLECT': 3, - 'MIRROR_SYMMETRIC': 4, - 'SAME_CAFFE': 5 + 'NA': 0, + 'SAME': 1, + 'VALID': 2, + 'MIRROR_REFLECT': 3, + 'MIRROR_SYMMETRIC': 4, + 'SAME_CAFFE': 5 } + def get_tensor_name_from_op(op_name, port): - return op_name + ':' + str(port) + return op_name + ':' + str(port) + def get_node_from_map(op_map, op_or_tensor_name): - op_name = op_or_tensor_name.split(':')[0] - return op_map[op_name] + op_name = op_or_tensor_name.split(':')[0] + return op_map[op_name] + def get_op_and_port_from_tensor(tensor_name): - op, port = tensor_name.split(':') - port = int(port) - return op, port + op, port = tensor_name.split(':') + port = int(port) + return op, port + def max_elem_size(tensor): - if len(tensor.shape.as_list()) == 0: - return tensor.dtype.size - else: - return reduce(mul, tensor.shape.as_list()) * tensor.dtype.size + if len(tensor.shape.as_list()) == 0: + return tensor.dtype.size + else: + return reduce(mul, tensor.shape.as_list()) * tensor.dtype.size + def find_dtype(tensor_dtype): - if tensor_dtype == tf.float32: - return mace_pb2.DT_FLOAT - elif tensor_dtype == tf.uint8 or tensor_dtype == tf.quint8: - return mace_pb2.DT_UINT8 - elif tensor_dtype == tf.int32 or tensor_dtype == tf.qint32: - return mace_pb2.DT_INT32 - else: - raise Exception('Unsupported data type: ', tensor_dtype) + if tensor_dtype == tf.float32: + return mace_pb2.DT_FLOAT + elif tensor_dtype == tf.uint8 or tensor_dtype == tf.quint8: + return mace_pb2.DT_UINT8 + elif tensor_dtype == tf.int32 or tensor_dtype == tf.qint32: + return mace_pb2.DT_INT32 + else: + raise Exception('Unsupported data type: ', tensor_dtype) + def has_padding_and_strides(op): - return 'padding' in op.node_def.attr and 'strides' in op.node_def.attr + return 'padding' in op.node_def.attr and 'strides' in op.node_def.attr + def is_node_flatten_reshape(op): - return op.type == 'Reshape' and len(op.outputs[0].shape) == 1 + return op.type == 'Reshape' and len(op.outputs[0].shape) == 1 + def get_input_tensor(op, index): - input_tensor = op.inputs[index] - if input_tensor.op.type == 'Reshape': - input_tensor = get_input_tensor(input_tensor.op, 0) - return input_tensor + input_tensor = op.inputs[index] + if input_tensor.op.type == 'Reshape': + input_tensor = get_input_tensor(input_tensor.op, 0) + return input_tensor + def add_shape_const_node(net_def, op, values, name): - print ('Add const node: ', op.name + '/' + name) - tensor = net_def.tensors.add() - node_name = op.name + '/' + name - tensor.name = node_name + ':0' - tensor.data_type = mace_pb2.DT_INT32 - tensor.dims.extend(values) - return tensor.name + print('Add const node: ', op.name + '/' + name) + tensor = net_def.tensors.add() + node_name = op.name + '/' + name + tensor.name = node_name + ':0' + tensor.data_type = mace_pb2.DT_INT32 + 
tensor.dims.extend(values) + return tensor.name def convert_op_outputs(mace_op_def, tf_op): - mace_op_def.output_type.extend([tf_dtype_2_mace_dtype(output.dtype) - for output in tf_op.outputs]) - output_shapes = [] - for output in tf_op.outputs: - output_shape = mace_pb2.OutputShape() - output_shape.dims.extend(output.shape.as_list()) - output_shapes.append(output_shape) - mace_op_def.output_shape.extend(output_shapes) + mace_op_def.output_type.extend( + [tf_dtype_2_mace_dtype(output.dtype) for output in tf_op.outputs]) + output_shapes = [] + for output in tf_op.outputs: + output_shape = mace_pb2.OutputShape() + output_shape.dims.extend(output.shape.as_list()) + output_shapes.append(output_shape) + mace_op_def.output_shape.extend(output_shapes) def convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops): - first_op = unresolved_ops[0] - print ('Op: ', first_op.name, first_op.type, first_op.outputs[0].shape) + first_op = unresolved_ops[0] + print('Op: ', first_op.name, first_op.type, first_op.outputs[0].shape) + + if first_op.name in resolved_ops: + pass + + elif first_op.type == 'Const': + print('Add const node: ', first_op.name) + tf_tensor = first_op.outputs[0].eval() + tensor = net_def.tensors.add() + tensor.name = first_op.outputs[0].name + tensor.data_type = find_dtype(first_op.outputs[0].dtype) + shape = list(tf_tensor.shape) + if len(shape) > 0: + tensor.dims.extend(shape) + if first_op.outputs[0].dtype == tf.float32: + tensor.float_data.extend(tf_tensor.astype(float).flat) + elif first_op.outputs[0].dtype == tf.int32 or \ + first_op.outputs[0].dtype == tf.int8 or \ + first_op.outputs[0].dtype == tf.int16 or \ + first_op.outputs[0].dtype == tf.quint8 or \ + first_op.outputs[0].dtype == tf.quint16: + tensor.int32_data.extend(tf_tensor.astype(int).flat) - if first_op.name in resolved_ops: - pass - - elif first_op.type == 'Const': - print ('Add const node: ', first_op.name) - tf_tensor = first_op.outputs[0].eval() - tensor = net_def.tensors.add() - tensor.name = first_op.outputs[0].name - tensor.data_type = find_dtype(first_op.outputs[0].dtype) - shape = list(tf_tensor.shape) - if len(shape) > 0: - tensor.dims.extend(shape) - if first_op.outputs[0].dtype == tf.float32: - tensor.float_data.extend(tf_tensor.astype(float).flat) - elif first_op.outputs[0].dtype == tf.int32 or \ - first_op.outputs[0].dtype == tf.int8 or \ - first_op.outputs[0].dtype == tf.int16 or \ - first_op.outputs[0].dtype == tf.quint8 or \ - first_op.outputs[0].dtype == tf.quint16: - tensor.int32_data.extend(tf_tensor.astype(int).flat) - - else: - op_def = net_def.op.add() - op_def.name = first_op.name - op_def.type = dsp_ops.map_nn_op(first_op.type) - op_def.padding = padding_mode['NA'] - - if len(first_op.outputs) > 0 and first_op.type == 'Dequantize' \ - and len(first_op.outputs[0].consumers()) > 0 \ - and (first_op.outputs[0].consumers()[0].type == 'SpaceToBatchND' \ - or first_op.outputs[0].consumers()[0].type == 'BatchToSpaceND'): - input_tensor = first_op.inputs[0] - min_tensor = first_op.inputs[1] - max_tensor = first_op.inputs[2] - s2b_op = first_op.outputs[0].consumers()[0] - reshape_op = s2b_op.outputs[0].consumers()[0] - min_op = reshape_op.outputs[0].consumers()[0] - max_op = reshape_op.outputs[0].consumers()[1] - quantize_op = min_op.outputs[0].consumers()[0] - resolved_ops.add(s2b_op.name) - resolved_ops.add(reshape_op.name) - resolved_ops.add(min_op.name) - resolved_ops.add(max_op.name) - resolved_ops.add(quantize_op.name) - - op_def.name = quantize_op.name - op_def.type = 
dsp_ops.map_nn_op('Quantized' + s2b_op.type) - op_def.input.append(input_tensor.name) - op_def.input.extend([t.name for t in s2b_op.inputs[1:]]) - op_def.input.extend([min_tensor.name, max_tensor.name]) - op_def.out_max_byte_size.extend([max_elem_size(out) for out in quantize_op.outputs]) - convert_op_outputs(op_def, quantize_op) - elif len(first_op.outputs) > 0 and first_op.type == 'QuantizedReshape' \ - and len(first_op.outputs[0].consumers()) > 0 \ - and first_op.outputs[0].consumers()[0].type == 'Dequantize' \ - and len(first_op.outputs[0].consumers()[0].outputs[0].consumers()) > 0 \ - and first_op.outputs[0].consumers()[0].outputs[0].consumers()[0].type == 'Softmax': - input_tensor = first_op.inputs[0] - min_tensor = first_op.inputs[2] - max_tensor = first_op.inputs[3] - dequantize_op = first_op.outputs[0].consumers()[0] - softmax_op = dequantize_op.outputs[0].consumers()[0] - reshape_op = softmax_op.outputs[0].consumers()[0] - min_op = reshape_op.outputs[0].consumers()[0] - max_op = reshape_op.outputs[0].consumers()[1] - quantize_op = min_op.outputs[0].consumers()[0] - quantize_reshape_op = quantize_op.outputs[0].consumers()[0] - - resolved_ops.add(dequantize_op.name) - resolved_ops.add(softmax_op.name) - resolved_ops.add(reshape_op.name) - resolved_ops.add(min_op.name) - resolved_ops.add(max_op.name) - resolved_ops.add(quantize_op.name) - resolved_ops.add(quantize_reshape_op.name) - - op_def.name = quantize_reshape_op.name - op_def.type = dsp_ops.map_nn_op('QuantizedSoftmax') - op_def.input.extend([input_tensor.name, min_tensor.name, max_tensor.name]) - op_def.out_max_byte_size.extend([max_elem_size(out) for out in quantize_reshape_op.outputs]) - convert_op_outputs(op_def, quantize_reshape_op) - elif len(first_op.outputs) > 0 and first_op.type == 'Dequantize' \ - and len(first_op.outputs[0].consumers()) > 0 \ - and first_op.outputs[0].consumers()[0].type == 'Tanh': - input_tensor = first_op.inputs[0] - min_tensor = first_op.inputs[1] - max_tensor = first_op.inputs[2] - tanh_op = first_op.outputs[0].consumers()[0] - - # if not last op - resolved_ops.add(tanh_op.name) - if tanh_op.outputs[0].consumers(): - reshape_op = tanh_op.outputs[0].consumers()[0] - min_op = reshape_op.outputs[0].consumers()[0] - max_op = reshape_op.outputs[0].consumers()[1] - quantize_op = min_op.outputs[0].consumers()[0] - resolved_ops.add(reshape_op.name) - resolved_ops.add(min_op.name) - resolved_ops.add(max_op.name) - resolved_ops.add(quantize_op.name) - - op_def.name = quantize_op.name - op_def.type = dsp_ops.map_nn_op('Quantized' + tanh_op.type) - op_def.input.extend([input_tensor.name, min_tensor.name, max_tensor.name]) - op_def.out_max_byte_size.extend([max_elem_size(out) for out in quantize_op.outputs]) - convert_op_outputs(op_def, quantize_op) - # tanh is last op - else: - op_def.name = tanh_op.name + '/QuantizedTanh' - op_def.type = dsp_ops.map_nn_op('Quantized' + tanh_op.type) - op_def.input.extend([input_tensor.name, min_tensor.name, max_tensor.name]) - op_def.out_max_byte_size.extend([max_elem_size(input_tensor), - max_elem_size(min_tensor), - max_elem_size(max_tensor)]) - op_def.output_type.extend([mace_pb2.DT_UINT8, mace_pb2.DT_FLOAT, mace_pb2.DT_FLOAT]) - output_shapes = [] - for output in first_op.inputs: - output_shape = mace_pb2.OutputShape() - output_shape.dims.extend(output.shape.as_list()) - output_shapes.append(output_shape) - op_def.output_shape.extend(output_shapes) - - new_tanh_op_def = net_def.op.add() - new_tanh_op_def.name = tanh_op.name - new_tanh_op_def.type = 
dsp_ops.map_nn_op('Dequantize') - new_tanh_op_def.input.extend([get_tensor_name_from_op(op_def.name, 0), - get_tensor_name_from_op(op_def.name, 1), - get_tensor_name_from_op(op_def.name, 2)]) - new_tanh_op_def.out_max_byte_size.extend([max_elem_size(tanh_op.outputs[0])]) - convert_op_outputs(new_tanh_op_def, tanh_op) - elif has_padding_and_strides(first_op): - op_def.padding = padding_mode[first_op.get_attr('padding')] - op_def.input.extend([t.name for t in first_op.inputs]) - if 'ksize' in first_op.node_def.attr: - ksize = first_op.get_attr('ksize') - ksize_tensor = add_shape_const_node(net_def, first_op, ksize, 'ksize') - op_def.input.extend([ksize_tensor]) - strides = first_op.get_attr('strides') - strides_tensor = add_shape_const_node(net_def, first_op, strides, 'strides') - op_def.input.extend([strides_tensor]) - op_def.out_max_byte_size.extend([max_elem_size(out) for out in first_op.outputs]) - convert_op_outputs(op_def, first_op) - elif is_node_flatten_reshape(first_op): - op_def.type = 'Flatten' - op_def.input.extend([t.name for t in first_op.inputs]) - op_def.out_max_byte_size.extend([max_elem_size(out) for out in first_op.outputs]) - convert_op_outputs(op_def, first_op) - elif dsp_ops.has_op(first_op.type): - op_def.input.extend([t.name for t in first_op.inputs]) - op_def.out_max_byte_size.extend([max_elem_size(out) for out in first_op.outputs]) - convert_op_outputs(op_def, first_op) else: - raise Exception('Unsupported op: ', first_op) + op_def = net_def.op.add() + op_def.name = first_op.name + op_def.type = dsp_ops.map_nn_op(first_op.type) + op_def.padding = padding_mode['NA'] + + if len(first_op.outputs) > 0 and first_op.type == 'Dequantize' \ + and len(first_op.outputs[0].consumers()) > 0 \ + and (first_op.outputs[0].consumers()[0].type == 'SpaceToBatchND' or + first_op.outputs[0].consumers()[0].type == 'BatchToSpaceND'): + input_tensor = first_op.inputs[0] + min_tensor = first_op.inputs[1] + max_tensor = first_op.inputs[2] + s2b_op = first_op.outputs[0].consumers()[0] + reshape_op = s2b_op.outputs[0].consumers()[0] + min_op = reshape_op.outputs[0].consumers()[0] + max_op = reshape_op.outputs[0].consumers()[1] + quantize_op = min_op.outputs[0].consumers()[0] + resolved_ops.add(s2b_op.name) + resolved_ops.add(reshape_op.name) + resolved_ops.add(min_op.name) + resolved_ops.add(max_op.name) + resolved_ops.add(quantize_op.name) + + op_def.name = quantize_op.name + op_def.type = dsp_ops.map_nn_op('Quantized' + s2b_op.type) + op_def.input.append(input_tensor.name) + op_def.input.extend([t.name for t in s2b_op.inputs[1:]]) + op_def.input.extend([min_tensor.name, max_tensor.name]) + op_def.out_max_byte_size.extend( + [max_elem_size(out) for out in quantize_op.outputs]) + convert_op_outputs(op_def, quantize_op) + elif len(first_op.outputs) > 0 and \ + first_op.type == 'QuantizedReshape' and \ + len(first_op.outputs[0].consumers()) > 0 and \ + first_op.outputs[0].consumers()[0].type == 'Dequantize' and \ + len(first_op.outputs[0].consumers()[0].outputs[0].consumers()) \ + > 0 and \ + first_op.outputs[0].consumers()[0].outputs[0].consumers()[0].type \ + == 'Softmax': + input_tensor = first_op.inputs[0] + min_tensor = first_op.inputs[2] + max_tensor = first_op.inputs[3] + dequantize_op = first_op.outputs[0].consumers()[0] + softmax_op = dequantize_op.outputs[0].consumers()[0] + reshape_op = softmax_op.outputs[0].consumers()[0] + min_op = reshape_op.outputs[0].consumers()[0] + max_op = reshape_op.outputs[0].consumers()[1] + quantize_op = min_op.outputs[0].consumers()[0] + 
quantize_reshape_op = quantize_op.outputs[0].consumers()[0] + + resolved_ops.add(dequantize_op.name) + resolved_ops.add(softmax_op.name) + resolved_ops.add(reshape_op.name) + resolved_ops.add(min_op.name) + resolved_ops.add(max_op.name) + resolved_ops.add(quantize_op.name) + resolved_ops.add(quantize_reshape_op.name) + + op_def.name = quantize_reshape_op.name + op_def.type = dsp_ops.map_nn_op('QuantizedSoftmax') + op_def.input.extend( + [input_tensor.name, min_tensor.name, max_tensor.name]) + op_def.out_max_byte_size.extend( + [max_elem_size(out) for out in quantize_reshape_op.outputs]) + convert_op_outputs(op_def, quantize_reshape_op) + elif len(first_op.outputs) > 0 and first_op.type == 'Dequantize' and \ + len(first_op.outputs[0].consumers()) > 0 and \ + first_op.outputs[0].consumers()[0].type == 'Tanh': + input_tensor = first_op.inputs[0] + min_tensor = first_op.inputs[1] + max_tensor = first_op.inputs[2] + tanh_op = first_op.outputs[0].consumers()[0] + + # if not last op + resolved_ops.add(tanh_op.name) + if tanh_op.outputs[0].consumers(): + reshape_op = tanh_op.outputs[0].consumers()[0] + min_op = reshape_op.outputs[0].consumers()[0] + max_op = reshape_op.outputs[0].consumers()[1] + quantize_op = min_op.outputs[0].consumers()[0] + resolved_ops.add(reshape_op.name) + resolved_ops.add(min_op.name) + resolved_ops.add(max_op.name) + resolved_ops.add(quantize_op.name) + + op_def.name = quantize_op.name + op_def.type = dsp_ops.map_nn_op('Quantized' + tanh_op.type) + op_def.input.extend( + [input_tensor.name, min_tensor.name, max_tensor.name]) + op_def.out_max_byte_size.extend( + [max_elem_size(out) for out in quantize_op.outputs]) + convert_op_outputs(op_def, quantize_op) + # tanh is last op + else: + op_def.name = tanh_op.name + '/QuantizedTanh' + op_def.type = dsp_ops.map_nn_op('Quantized' + tanh_op.type) + op_def.input.extend( + [input_tensor.name, min_tensor.name, max_tensor.name]) + op_def.out_max_byte_size.extend([ + max_elem_size(input_tensor), + max_elem_size(min_tensor), + max_elem_size(max_tensor) + ]) + op_def.output_type.extend( + [mace_pb2.DT_UINT8, mace_pb2.DT_FLOAT, mace_pb2.DT_FLOAT]) + output_shapes = [] + for output in first_op.inputs: + output_shape = mace_pb2.OutputShape() + output_shape.dims.extend(output.shape.as_list()) + output_shapes.append(output_shape) + op_def.output_shape.extend(output_shapes) + + new_tanh_op_def = net_def.op.add() + new_tanh_op_def.name = tanh_op.name + new_tanh_op_def.type = dsp_ops.map_nn_op('Dequantize') + new_tanh_op_def.input.extend([ + get_tensor_name_from_op(op_def.name, 0), + get_tensor_name_from_op(op_def.name, 1), + get_tensor_name_from_op(op_def.name, 2) + ]) + new_tanh_op_def.out_max_byte_size.extend( + [max_elem_size(tanh_op.outputs[0])]) + convert_op_outputs(new_tanh_op_def, tanh_op) + elif has_padding_and_strides(first_op): + op_def.padding = padding_mode[first_op.get_attr('padding')] + op_def.input.extend([t.name for t in first_op.inputs]) + if 'ksize' in first_op.node_def.attr: + ksize = first_op.get_attr('ksize') + ksize_tensor = add_shape_const_node(net_def, first_op, ksize, + 'ksize') + op_def.input.extend([ksize_tensor]) + strides = first_op.get_attr('strides') + strides_tensor = add_shape_const_node(net_def, first_op, strides, + 'strides') + op_def.input.extend([strides_tensor]) + op_def.out_max_byte_size.extend( + [max_elem_size(out) for out in first_op.outputs]) + convert_op_outputs(op_def, first_op) + elif is_node_flatten_reshape(first_op): + op_def.type = 'Flatten' + op_def.input.extend([t.name for t in 
first_op.inputs]) + op_def.out_max_byte_size.extend( + [max_elem_size(out) for out in first_op.outputs]) + convert_op_outputs(op_def, first_op) + elif dsp_ops.has_op(first_op.type): + op_def.input.extend([t.name for t in first_op.inputs]) + op_def.out_max_byte_size.extend( + [max_elem_size(out) for out in first_op.outputs]) + convert_op_outputs(op_def, first_op) + else: + raise Exception('Unsupported op: ', first_op) + + resolved_ops.add(first_op.name) + + del unresolved_ops[0] - resolved_ops.add(first_op.name) - - del unresolved_ops[0] def add_output_node(net_def, output_node): - op_def = net_def.op.add() - op_def.name = '__output__' - op_def.type = 'OUTPUT' - op_def.input.extend([get_tensor_name_from_op(output_node, 0)]) + op_def = net_def.op.add() + op_def.name = '__output__' + op_def.type = 'OUTPUT' + op_def.input.extend([get_tensor_name_from_op(output_node, 0)]) + def reverse_batch_to_space_and_biasadd(net_def): - tensor_map = {} - for tensor in net_def.tensors: - tensor_map[tensor.name] = tensor - op_map = {} - for op in net_def.op: - op_map[op.name] = op - consumers = {} - for op in net_def.op: - for ipt in op.input: - if ipt not in consumers: - consumers[ipt] = [] - consumers[ipt].append(op) - - new_ops = [] - skip_ops = set() - visited_ops = set() - - for op in net_def.op: - if op.name in visited_ops: - pass - # pattern: QConv -> RR -> R -> QB2S -> QBiasAdd -> RR -> R - success = False - if op.type == 'Requantize_32to8': - biasadd_requantize_op = op - biasadd_op = get_node_from_map(op_map, biasadd_requantize_op.input[0]) - if biasadd_op.type == 'QuantizedBiasAdd_8p8to32': - b2s_op = get_node_from_map(op_map, biasadd_op.input[0]) - if b2s_op.type == 'QuantizedBatchToSpaceND_8': - conv_requantize_op = get_node_from_map(op_map, b2s_op.input[0]) - conv_op = get_node_from_map(op_map, conv_requantize_op.input[0]) - if conv_op.type == 'QuantizedConv2d_8x8to32': - new_biasadd_op = mace_pb2.OperatorDef() - new_biasadd_op.CopyFrom(biasadd_op) - new_biasadd_op.input[0] = get_tensor_name_from_op(conv_requantize_op.name, 0) - new_biasadd_op.input[2] = get_tensor_name_from_op(conv_requantize_op.name, 1) - new_biasadd_op.input[3] = get_tensor_name_from_op(conv_requantize_op.name, 2) - new_biasadd_op.out_max_byte_size[0] = conv_requantize_op.out_max_byte_size[0] * 4 - - new_biasadd_requantize_op = mace_pb2.OperatorDef() - new_biasadd_requantize_op.CopyFrom(biasadd_requantize_op) - new_biasadd_requantize_op.out_max_byte_size[0] = new_biasadd_op.out_max_byte_size[0] / 4 - - new_b2s_op = mace_pb2.OperatorDef() - new_b2s_op.CopyFrom(b2s_op) - new_b2s_op.input[0] = get_tensor_name_from_op(biasadd_requantize_op.name, 0) - new_b2s_op.input[3] = get_tensor_name_from_op(biasadd_requantize_op.name, 1) - new_b2s_op.input[4] = get_tensor_name_from_op(biasadd_requantize_op.name, 2) - - new_ops.extend([new_biasadd_op, new_biasadd_requantize_op, new_b2s_op]) - skip_ops = skip_ops.union([biasadd_op.name, biasadd_requantize_op.name, b2s_op.name]) - visited_ops.add(op.name) - - follow_ops = consumers[get_tensor_name_from_op(biasadd_requantize_op.name, 0)] - for follow_op in follow_ops: - new_follow_op = mace_pb2.OperatorDef() - new_follow_op.CopyFrom(follow_op) - for i in xrange(len(follow_op.input)): - for k in xrange(3): - if new_follow_op.input[i] == get_tensor_name_from_op(biasadd_requantize_op.name, k): - new_follow_op.input[i] = get_tensor_name_from_op(b2s_op.name, k) - new_ops.append(new_follow_op) - skip_ops.add(follow_op.name) - visited_ops.add(follow_op.name) - - visited_ops.add(op.name) - - new_net_def 
= mace_pb2.NetDef() - new_net_def.tensors.extend(tensor_map.values()) - new_net_def.op.extend([op for op in net_def.op if op.name not in skip_ops]) - new_net_def.op.extend(new_ops) - - return new_net_def + tensor_map = {} + for tensor in net_def.tensors: + tensor_map[tensor.name] = tensor + op_map = {} + for op in net_def.op: + op_map[op.name] = op + consumers = {} + for op in net_def.op: + for ipt in op.input: + if ipt not in consumers: + consumers[ipt] = [] + consumers[ipt].append(op) + + new_ops = [] + skip_ops = set() + visited_ops = set() + + for op in net_def.op: + if op.name in visited_ops: + pass + # pattern: QConv -> RR -> R -> QB2S -> QBiasAdd -> RR -> R + success = False + if op.type == 'Requantize_32to8': + biasadd_requantize_op = op + biasadd_op = get_node_from_map(op_map, + biasadd_requantize_op.input[0]) + if biasadd_op.type == 'QuantizedBiasAdd_8p8to32': + b2s_op = get_node_from_map(op_map, biasadd_op.input[0]) + if b2s_op.type == 'QuantizedBatchToSpaceND_8': + conv_requantize_op = get_node_from_map( + op_map, b2s_op.input[0]) + conv_op = get_node_from_map(op_map, + conv_requantize_op.input[0]) + if conv_op.type == 'QuantizedConv2d_8x8to32': + new_biasadd_op = mace_pb2.OperatorDef() + new_biasadd_op.CopyFrom(biasadd_op) + new_biasadd_op.input[0] = get_tensor_name_from_op( + conv_requantize_op.name, 0) + new_biasadd_op.input[2] = get_tensor_name_from_op( + conv_requantize_op.name, 1) + new_biasadd_op.input[3] = get_tensor_name_from_op( + conv_requantize_op.name, 2) + new_biasadd_op.out_max_byte_size[ + 0] = conv_requantize_op.out_max_byte_size[0] * 4 + + new_biasadd_requantize_op = mace_pb2.OperatorDef() + new_biasadd_requantize_op.CopyFrom( + biasadd_requantize_op) + new_biasadd_requantize_op.out_max_byte_size[ + 0] = new_biasadd_op.out_max_byte_size[0] / 4 + + new_b2s_op = mace_pb2.OperatorDef() + new_b2s_op.CopyFrom(b2s_op) + new_b2s_op.input[0] = get_tensor_name_from_op( + biasadd_requantize_op.name, 0) + new_b2s_op.input[3] = get_tensor_name_from_op( + biasadd_requantize_op.name, 1) + new_b2s_op.input[4] = get_tensor_name_from_op( + biasadd_requantize_op.name, 2) + + new_ops.extend([ + new_biasadd_op, new_biasadd_requantize_op, + new_b2s_op + ]) + skip_ops = skip_ops.union([ + biasadd_op.name, biasadd_requantize_op.name, + b2s_op.name + ]) + visited_ops.add(op.name) + + follow_ops = consumers[get_tensor_name_from_op( + biasadd_requantize_op.name, 0)] + for follow_op in follow_ops: + new_follow_op = mace_pb2.OperatorDef() + new_follow_op.CopyFrom(follow_op) + for i in xrange(len(follow_op.input)): + for k in xrange(3): + if new_follow_op.input[ + i] == get_tensor_name_from_op( + biasadd_requantize_op.name, k): + new_follow_op.input[ + i] = get_tensor_name_from_op( + b2s_op.name, k) + new_ops.append(new_follow_op) + skip_ops.add(follow_op.name) + visited_ops.add(follow_op.name) + + visited_ops.add(op.name) + + new_net_def = mace_pb2.NetDef() + new_net_def.tensors.extend(tensor_map.values()) + new_net_def.op.extend([op for op in net_def.op if op.name not in skip_ops]) + new_net_def.op.extend(new_ops) + + return new_net_def + def add_node_id(net_def): - node_id_counter = 0 - node_id_map = {} - for tensor in net_def.tensors: - tensor.node_id = node_id_counter - node_id_counter += 1 - tensor_op, port = get_op_and_port_from_tensor(tensor.name) - node_id_map[tensor_op] = tensor.node_id - - for op in net_def.op: - op.node_id = node_id_counter - node_id_counter += 1 - node_id_map[op.name] = op.node_id - for ipt in op.input: - op_name, port = get_op_and_port_from_tensor(ipt) - 
node_id = node_id_map[op_name] - node_input = op.node_input.add() - node_input.node_id = node_id - node_input.output_port = int(port) - - return net_def + node_id_counter = 0 + node_id_map = {} + for tensor in net_def.tensors: + tensor.node_id = node_id_counter + node_id_counter += 1 + tensor_op, port = get_op_and_port_from_tensor(tensor.name) + node_id_map[tensor_op] = tensor.node_id + + for op in net_def.op: + op.node_id = node_id_counter + node_id_counter += 1 + node_id_map[op.name] = op.node_id + for ipt in op.input: + op_name, port = get_op_and_port_from_tensor(ipt) + node_id = node_id_map[op_name] + node_input = op.node_input.add() + node_input.node_id = node_id + node_input.output_port = int(port) + + return net_def + def add_input_output_info(net_def, input_node, output_node, graph, dtype): - input_tensor = graph.get_tensor_by_name(get_tensor_name_from_op(input_node, 0)) - output_tensor = graph.get_tensor_by_name(get_tensor_name_from_op(output_node, 0)) - - input_info = net_def.input_info.add() - input_info.dims.extend(input_tensor.shape.as_list()) - input_info.data_type = dtype - if dtype == mace_pb2.DT_UINT8: - for i in xrange(2): - input_info = net_def.input_info.add() - input_info.dims.extend([1,1,1,1]) - input_info.data_type = mace_pb2.DT_FLOAT - - output_info = net_def.output_info.add() - output_info.dims.extend(output_tensor.shape.as_list()) - output_info.data_type = dtype - if dtype == mace_pb2.DT_UINT8: - for i in xrange(2): - output_info = net_def.output_info.add() - output_info.dims.extend([1,1,1,1]) - output_info.data_type = mace_pb2.DT_FLOAT - - return net_def + input_tensor = graph.get_tensor_by_name( + get_tensor_name_from_op(input_node, 0)) + output_tensor = graph.get_tensor_by_name( + get_tensor_name_from_op(output_node, 0)) + + input_info = net_def.input_info.add() + input_info.dims.extend(input_tensor.shape.as_list()) + input_info.data_type = dtype + if dtype == mace_pb2.DT_UINT8: + for i in xrange(2): + input_info = net_def.input_info.add() + input_info.dims.extend([1, 1, 1, 1]) + input_info.data_type = mace_pb2.DT_FLOAT + + output_info = net_def.output_info.add() + output_info.dims.extend(output_tensor.shape.as_list()) + output_info.data_type = dtype + if dtype == mace_pb2.DT_UINT8: + for i in xrange(2): + output_info = net_def.output_info.add() + output_info.dims.extend([1, 1, 1, 1]) + output_info.data_type = mace_pb2.DT_FLOAT + + return net_def + def fuse_quantize(net_def, input_node, output_node): - tensor_map = {} - for tensor in net_def.tensors: - tensor_map[tensor.name] = tensor - op_map = {} - for op in net_def.op: - op_map[op.name] = op - consumers = {} - for op in net_def.op: - for ipt in op.input: - if ipt not in consumers: - consumers[ipt] = [] - consumers[ipt].append(op) - - skip_ops = set() - new_ops = [] - skip_tensors = set() - - # INPUT->Flatten->Minf, Maxf->Quantize - for op in net_def.op: - if op.type == 'INPUT': - input_op = op - flatten_op = None - quantize_op = None - for o in consumers[get_tensor_name_from_op(input_op.name, 0)]: - if o.type == 'Flatten': - flatten_op = o - elif o.type == 'Quantize': - quantize_op = o - if quantize_op is not None: - minf_op, maxf_op = consumers[get_tensor_name_from_op(flatten_op.name, 0)] - skip_ops = skip_ops.union([flatten_op.name, minf_op.name, maxf_op.name]) - skip_tensors = skip_tensors.union([flatten_op.input[1], minf_op.input[1], maxf_op.input[1]]) - quantize_op.type = 'AutoQuantize' - del quantize_op.input[1:] - - new_net_def = mace_pb2.NetDef() - new_net_def.tensors.extend([tensor for tensor in 
net_def.tensors if tensor.name not in skip_tensors]) - new_net_def.op.extend([op for op in net_def.op if op.name not in skip_ops]) - new_net_def.op.extend(new_ops) - return new_net_def + tensor_map = {} + for tensor in net_def.tensors: + tensor_map[tensor.name] = tensor + op_map = {} + for op in net_def.op: + op_map[op.name] = op + consumers = {} + for op in net_def.op: + for ipt in op.input: + if ipt not in consumers: + consumers[ipt] = [] + consumers[ipt].append(op) + + skip_ops = set() + new_ops = [] + skip_tensors = set() + + # INPUT->Flatten->Minf, Maxf->Quantize + for op in net_def.op: + if op.type == 'INPUT': + input_op = op + flatten_op = None + quantize_op = None + for o in consumers[get_tensor_name_from_op(input_op.name, 0)]: + if o.type == 'Flatten': + flatten_op = o + elif o.type == 'Quantize': + quantize_op = o + if quantize_op is not None: + minf_op, maxf_op = consumers[get_tensor_name_from_op( + flatten_op.name, 0)] + skip_ops = skip_ops.union( + [flatten_op.name, minf_op.name, maxf_op.name]) + skip_tensors = skip_tensors.union( + [flatten_op.input[1], minf_op.input[1], maxf_op.input[1]]) + quantize_op.type = 'AutoQuantize' + del quantize_op.input[1:] + + new_net_def = mace_pb2.NetDef() + new_net_def.tensors.extend([ + tensor for tensor in net_def.tensors if tensor.name not in skip_tensors + ]) + new_net_def.op.extend([op for op in net_def.op if op.name not in skip_ops]) + new_net_def.op.extend(new_ops) + return new_net_def + def convert_to_mace_pb(model_file, input_node, output_node, dsp_mode): - """ + """ nnlib does not have batch norm, so use tensorflow optimizer to fold batch norm with convolution. The fold optimization reorders ops, so we sort ops first by topology. """ - input_graph_def = tf.GraphDef() - with gfile.Open(model_file, "rb") as f: - data = f.read() - input_graph_def.ParseFromString(data) - - input_graph_def = graph_util.sort_tf_graph(input_graph_def) - net_def = mace_pb2.NetDef() - - with tf.Session() as session: - with session.graph.as_default() as graph: - tf.import_graph_def(input_graph_def, name="") - ops = graph.get_operations() - dsp_ops = DspOps() - resolved_ops = set() - # convert const node - unresolved_ops = [op for op in ops if op.type == 'Const'] - while len(unresolved_ops) > 0: - convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops) - - # convert op node - unresolved_ops = [op for op in ops if op.type != 'Const'] - while len(unresolved_ops) > 0: - convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops) - - add_output_node(net_def, output_node) - net_def = reverse_batch_to_space_and_biasadd(net_def) - net_def = fuse_quantize(net_def, input_node, output_node) - - sorted_net_def = graph_util.sort_mace_graph(net_def, '__output__') - net_def_with_node_id = add_node_id(sorted_net_def) - - dtype = mace_pb2.DT_FLOAT - final_net_def = add_input_output_info(net_def_with_node_id, input_node, output_node, graph, dtype) - - arg = final_net_def.arg.add() - arg.name = 'dsp_mode' - arg.i = dsp_mode - - return final_net_def - + input_graph_def = tf.GraphDef() + with gfile.Open(model_file, "rb") as f: + data = f.read() + input_graph_def.ParseFromString(data) + + input_graph_def = graph_util.sort_tf_graph(input_graph_def) + net_def = mace_pb2.NetDef() + + with tf.Session() as session: + with session.graph.as_default() as graph: + tf.import_graph_def(input_graph_def, name="") + ops = graph.get_operations() + dsp_ops = DspOps() + resolved_ops = set() + # convert const node + unresolved_ops = [op for op in ops if op.type == 
'Const'] + while len(unresolved_ops) > 0: + convert_ops(unresolved_ops, resolved_ops, net_def, output_node, + dsp_ops) + + # convert op node + unresolved_ops = [op for op in ops if op.type != 'Const'] + while len(unresolved_ops) > 0: + convert_ops(unresolved_ops, resolved_ops, net_def, output_node, + dsp_ops) + + add_output_node(net_def, output_node) + net_def = reverse_batch_to_space_and_biasadd(net_def) + net_def = fuse_quantize(net_def, input_node, output_node) + + sorted_net_def = graph_util.sort_mace_graph(net_def, '__output__') + net_def_with_node_id = add_node_id(sorted_net_def) + + dtype = mace_pb2.DT_FLOAT + final_net_def = add_input_output_info( + net_def_with_node_id, input_node, output_node, graph, dtype) + + arg = final_net_def.arg.add() + arg.name = 'dsp_mode' + arg.i = dsp_mode + + return final_net_def diff --git a/mace/python/tools/tf_ops_stats.py b/mace/python/tools/tf_ops_stats.py index d60487a9..303ec77b 100644 --- a/mace/python/tools/tf_ops_stats.py +++ b/mace/python/tools/tf_ops_stats.py @@ -10,148 +10,174 @@ from tensorflow import gfile FLAGS = None + def hist_inc(hist, key): - if key in hist: - hist[key] += 1 - else: - hist[key] = 1 + if key in hist: + hist[key] += 1 + else: + hist[key] = 1 + def to_int_list(long_list): - int_list = [] - for value in long_list: - int_list.append(int(value)) - return int_list + int_list = [] + for value in long_list: + int_list.append(int(value)) + return int_list + def main(unused_args): - if not FLAGS.input or not gfile.Exists(FLAGS.input): - print('Input graph file ' + FLAGS.input + ' does not exist!') - return -1 - - input_graph_def = tf.GraphDef() - with gfile.Open(FLAGS.input, 'rb') as f: - data = f.read() - input_graph_def.ParseFromString(data) - - with tf.Session() as session: - with session.graph.as_default() as graph: - tf.import_graph_def(input_graph_def, name='') - - stats = {} - ops = graph.get_operations() - # extract kernel size for conv_2d - tensor_shapes = {} - tensor_values = {} - print("=========================consts============================") - for op in ops: - if op.type == 'Const': - for output in op.outputs: - tensor_name = output.name - tensor = output.eval() - tensor_shape = list(tensor.shape) - tensor_shapes[tensor_name] = tensor_shape - print("Const %s: %s, %d" % (tensor_name, tensor_shape, functools.reduce(operator.mul, tensor_shape, 1))) - if len(tensor_shape) == 1 and tensor_shape[0] < 10: - tensor_values[tensor_name] = list(tensor) - - print("=========================ops============================") - for op in ops: - if op.type in ['Conv2D']: - padding = op.get_attr('padding') - strides = to_int_list(op.get_attr('strides')) - data_format = op.get_attr('data_format') - ksize = 'Unknown' - for input in op.inputs: - input_name = input.name - if input_name.endswith('weights/read:0'): - ksize = input.shape.as_list() - break - if input_name.endswith('weights:0') and input_name in tensor_shapes: - ksize = tensor_shapes[input_name] - break - print('%s(padding=%s, strides=%s, ksize=%s, format=%s) %s => %s' % (op.type, padding, strides, ksize, data_format, op.inputs[0].shape, op.outputs[0].shape)) - key = '%s(padding=%s, strides=%s, ksize=%s, format=%s)' % (op.type, padding, strides, ksize, data_format) - hist_inc(stats, key) - elif op.type in ['FusedResizeAndPadConv2D']: - padding = op.get_attr('padding') - strides = to_int_list(op.get_attr('strides')) - resize_align_corners = op.get_attr('resize_align_corners') - ksize = 'Unknown' - for input in op.inputs: - input_name = input.name - if 
input_name.endswith('weights:0') and input_name in tensor_shapes: - ksize = tensor_shapes[input_name] - break - key = '%s(padding=%s, strides=%s, ksize=%s, resize_align_corners=%s)' % (op.type, padding, strides, ksize, resize_align_corners) - hist_inc(stats, key) - elif op.type in ['ResizeBilinear']: - align_corners = op.get_attr('align_corners') - size = 'Unknown' - for input in op.inputs: - input_name = input.name - if input_name.endswith('size:0') and input_name in tensor_values: - size = tensor_values[input_name] - break - key = '%s(size=%s, align_corners=%s)' % (op.type, size, align_corners) - print(key) - hist_inc(stats, key) - elif op.type in ['AvgPool', 'MaxPool']: - padding = op.get_attr('padding') - strides = to_int_list(op.get_attr('strides')) - ksize = to_int_list(op.get_attr('ksize')) - data_format = op.get_attr('data_format') - key = '%s(padding=%s, strides=%s, ksize=%s)' % (op.type, padding, strides, ksize) - hist_inc(stats, key) - elif op.type in ['SpaceToBatchND', 'BatchToSpaceND']: - block_shape = 'Unknown' - for input in op.inputs: - input_name = input.name - if input_name.endswith('block_shape:0') and input_name in tensor_values: - block_shape = tensor_values[input_name] - break - paddings = 'Unknown' - for input in op.inputs: - input_name = input.name - if input_name.endswith('paddings:0') and input_name in tensor_values: - paddings = tensor_values[input_name] - break - crops = 'Unknown' - for input in op.inputs: - input_name = input.name - if input_name.endswith('crops:0') and input_name in tensor_values: - paddings = tensor_values[input_name] - break - if op.type == 'SpaceToBatchND': - key = '%s(block_shape=%s, paddings=%s)' % (op.type, block_shape, paddings) - else: - key = '%s(block_shape=%s, crops=%s)' % (op.type, block_shape, crops) - print(key) - hist_inc(stats, key) - elif op.type == 'Pad': - paddings = 'Unknown' - for input in op.inputs: - input_name = input.name - if input_name.endswith('paddings:0') and input_name in tensor_values: - paddings = tensor_values[input_name] - break - key = '%s(paddings=%s)' % (op.type, paddings) - hist_inc(stats, key) - else: - hist_inc(stats, op.type) - - print("=========================stats============================") - for key, value in sorted(six.iteritems(stats)): - print('%s: %d' % (key, value)) + if not FLAGS.input or not gfile.Exists(FLAGS.input): + print('Input graph file ' + FLAGS.input + ' does not exist!') + return -1 + + input_graph_def = tf.GraphDef() + with gfile.Open(FLAGS.input, 'rb') as f: + data = f.read() + input_graph_def.ParseFromString(data) + + with tf.Session() as session: + with session.graph.as_default() as graph: + tf.import_graph_def(input_graph_def, name='') + + stats = {} + ops = graph.get_operations() + # extract kernel size for conv_2d + tensor_shapes = {} + tensor_values = {} + print("=========================consts============================") + for op in ops: + if op.type == 'Const': + for output in op.outputs: + tensor_name = output.name + tensor = output.eval() + tensor_shape = list(tensor.shape) + tensor_shapes[tensor_name] = tensor_shape + print("Const %s: %s, %d" % + (tensor_name, tensor_shape, + functools.reduce(operator.mul, tensor_shape, 1))) + if len(tensor_shape) == 1 and tensor_shape[0] < 10: + tensor_values[tensor_name] = list(tensor) + + print("=========================ops============================") + for op in ops: + if op.type in ['Conv2D']: + padding = op.get_attr('padding') + strides = to_int_list(op.get_attr('strides')) + data_format = op.get_attr('data_format') + 
ksize = 'Unknown' + for input in op.inputs: + input_name = input.name + if input_name.endswith('weights/read:0'): + ksize = input.shape.as_list() + break + if input_name.endswith( + 'weights:0') and input_name in tensor_shapes: + ksize = tensor_shapes[input_name] + break + print( + '%s(padding=%s, strides=%s, ksize=%s, format=%s) %s => %s' + % (op.type, padding, strides, ksize, data_format, + op.inputs[0].shape, op.outputs[0].shape)) + key = '%s(padding=%s, strides=%s, ksize=%s, format=%s)' % ( + op.type, padding, strides, ksize, data_format) + hist_inc(stats, key) + elif op.type in ['FusedResizeAndPadConv2D']: + padding = op.get_attr('padding') + strides = to_int_list(op.get_attr('strides')) + resize_align_corners = op.get_attr('resize_align_corners') + ksize = 'Unknown' + for input in op.inputs: + input_name = input.name + if input_name.endswith( + 'weights:0') and input_name in tensor_shapes: + ksize = tensor_shapes[input_name] + break + key = '%s(padding=%s, strides=%s, ksize=%s, ' \ + 'resize_align_corners=%s)' % (op.type, padding, strides, + ksize, resize_align_corners) + hist_inc(stats, key) + elif op.type in ['ResizeBilinear']: + align_corners = op.get_attr('align_corners') + size = 'Unknown' + for input in op.inputs: + input_name = input.name + if input_name.endswith( + 'size:0') and input_name in tensor_values: + size = tensor_values[input_name] + break + key = '%s(size=%s, align_corners=%s)' % (op.type, size, + align_corners) + print(key) + hist_inc(stats, key) + elif op.type in ['AvgPool', 'MaxPool']: + padding = op.get_attr('padding') + strides = to_int_list(op.get_attr('strides')) + ksize = to_int_list(op.get_attr('ksize')) + data_format = op.get_attr('data_format') + key = '%s(padding=%s, strides=%s, ksize=%s)' % (op.type, + padding, + strides, ksize) + hist_inc(stats, key) + elif op.type in ['SpaceToBatchND', 'BatchToSpaceND']: + block_shape = 'Unknown' + for input in op.inputs: + input_name = input.name + if input_name.endswith( + 'block_shape:0') and input_name in tensor_values: + block_shape = tensor_values[input_name] + break + paddings = 'Unknown' + for input in op.inputs: + input_name = input.name + if input_name.endswith( + 'paddings:0') and input_name in tensor_values: + paddings = tensor_values[input_name] + break + crops = 'Unknown' + for input in op.inputs: + input_name = input.name + if input_name.endswith( + 'crops:0') and input_name in tensor_values: + paddings = tensor_values[input_name] + break + if op.type == 'SpaceToBatchND': + key = '%s(block_shape=%s, paddings=%s)' % (op.type, + block_shape, + paddings) + else: + key = '%s(block_shape=%s, crops=%s)' % (op.type, + block_shape, crops) + print(key) + hist_inc(stats, key) + elif op.type == 'Pad': + paddings = 'Unknown' + for input in op.inputs: + input_name = input.name + if input_name.endswith( + 'paddings:0') and input_name in tensor_values: + paddings = tensor_values[input_name] + break + key = '%s(paddings=%s)' % (op.type, paddings) + hist_inc(stats, key) + else: + hist_inc(stats, op.type) + + print("=========================stats============================") + for key, value in sorted(six.iteritems(stats)): + print('%s: %d' % (key, value)) + def parse_args(): - '''Parses command line arguments.''' - parser = argparse.ArgumentParser() - parser.add_argument( - '--input', - type=str, - default='', - help='TensorFlow \'GraphDef\' file to load.') - return parser.parse_known_args() + '''Parses command line arguments.''' + parser = argparse.ArgumentParser() + parser.add_argument( + '--input', + type=str, + 
default='', + help='TensorFlow \'GraphDef\' file to load.') + return parser.parse_known_args() + if __name__ == '__main__': - FLAGS, unparsed = parse_args() - main(unused_args=[sys.argv[0]] + unparsed) + FLAGS, unparsed = parse_args() + main(unused_args=[sys.argv[0]] + unparsed) diff --git a/tools/bazel_adb_run.py b/tools/bazel_adb_run.py index c8c9ddb4..88d8c181 100644 --- a/tools/bazel_adb_run.py +++ b/tools/bazel_adb_run.py @@ -7,7 +7,6 @@ # --target=//mace/ops:ops_test # --stdout_processor=stdout_processor - import argparse import random import re @@ -15,104 +14,113 @@ import sys import sh_commands + def stdout_processor(stdout, device_properties, abi): - pass + pass + def ops_test_stdout_processor(stdout, device_properties, abi): - stdout_lines = stdout.split("\n") - for line in stdout_lines: - if "Aborted" in line or "FAILED" in line: - raise Exception("Command failed") + stdout_lines = stdout.split("\n") + for line in stdout_lines: + if "Aborted" in line or "FAILED" in line: + raise Exception("Command failed") + def ops_benchmark_stdout_processor(stdout, device_properties, abi): - stdout_lines = stdout.split("\n") - metrics = {} - for line in stdout_lines: - if "Aborted" in line: - raise Exception("Command failed") - line = line.strip() - parts = line.split() - if len(parts) == 5 and parts[0].startswith("BM_"): - metrics["%s.time_ms" % parts[0]] = str(float(parts[1])/1e6) - metrics["%s.input_mb_per_sec" % parts[0]] = parts[3] - metrics["%s.gmacc_per_sec" % parts[0]] = parts[4] - - platform = device_properties["ro.board.platform"].replace(" ", "-") - model = device_properties["ro.product.model"].replace(" ", "-") - tags = {"ro.board.platform": platform, - "ro.product.model": model, - "abi": abi} - sh_commands.falcon_push_metrics(metrics, tags=tags, - endpoint="mace_ops_benchmark") + stdout_lines = stdout.split("\n") + metrics = {} + for line in stdout_lines: + if "Aborted" in line: + raise Exception("Command failed") + line = line.strip() + parts = line.split() + if len(parts) == 5 and parts[0].startswith("BM_"): + metrics["%s.time_ms" % parts[0]] = str(float(parts[1]) / 1e6) + metrics["%s.input_mb_per_sec" % parts[0]] = parts[3] + metrics["%s.gmacc_per_sec" % parts[0]] = parts[4] + + platform = device_properties["ro.board.platform"].replace(" ", "-") + model = device_properties["ro.product.model"].replace(" ", "-") + tags = { + "ro.board.platform": platform, + "ro.product.model": model, + "abi": abi + } + sh_commands.falcon_push_metrics( + metrics, tags=tags, endpoint="mace_ops_benchmark") + def parse_args(): - """Parses command line arguments.""" - parser = argparse.ArgumentParser() - parser.add_argument( - "--target_abis", - type=str, - default="armeabi-v7a", - help="Target ABIs, comma seperated list") - parser.add_argument( - "--target_socs", - type=str, - default="all", - help="SoCs(ro.board.platform) to build, comma seperated list or all/random") - parser.add_argument( - "--target", - type=str, - default="//...", - help="Bazel target to build") - parser.add_argument( - "--run_target", - type=bool, - default=False, - help="Whether to run the target") - parser.add_argument( - "--args", - type=str, - default="", - help="Command args") - parser.add_argument( - "--stdout_processor", - type=str, - default="stdout_processor", - help="Stdout processing function, default: stdout_processor") - return parser.parse_known_args() + """Parses command line arguments.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--target_abis", + type=str, + default="armeabi-v7a", + 
help="Target ABIs, comma seperated list") + parser.add_argument( + "--target_socs", + type=str, + default="all", + help="SoCs (ro.board.platform from getprop) to build, " + "comma seperated list or all/random") + parser.add_argument( + "--target", type=str, default="//...", help="Bazel target to build") + parser.add_argument( + "--run_target", + type=bool, + default=False, + help="Whether to run the target") + parser.add_argument("--args", type=str, default="", help="Command args") + parser.add_argument( + "--stdout_processor", + type=str, + default="stdout_processor", + help="Stdout processing function, default: stdout_processor") + return parser.parse_known_args() + def main(unused_args): - target_socs = None - if FLAGS.target_socs != "all" and FLAGS.target_socs != "random": - target_socs = set(FLAGS.target_socs.split(',')) - target_devices = sh_commands.adb_devices(target_socs=target_socs) - if FLAGS.target_socs == "random": - target_devices = [random.choice(target_devices)] - - target = FLAGS.target - host_bin_path, bin_name = sh_commands.bazel_target_to_bin(target) - target_abis = FLAGS.target_abis.split(',') - - # generate sources - sh_commands.gen_encrypted_opencl_source() - sh_commands.gen_compiled_opencl_source() - sh_commands.gen_mace_version() - - for target_abi in target_abis: - sh_commands.bazel_build(target, abi=target_abi) - if FLAGS.run_target: - for serialno in target_devices: - if target_abi not in set(sh_commands.adb_supported_abis(serialno)): - print("Skip device %s which does not support ABI %s" % (serialno, target_abi)) - continue - stdouts = sh_commands.adb_run(serialno, host_bin_path, bin_name, - args=FLAGS.args, - opencl_profiling=1, - vlog_level=0, - device_bin_path="/data/local/tmp/mace", - out_of_range_check=1) - device_properties = sh_commands.adb_getprop_by_serialno(serialno) - globals()[FLAGS.stdout_processor](stdouts, device_properties, target_abi) + target_socs = None + if FLAGS.target_socs != "all" and FLAGS.target_socs != "random": + target_socs = set(FLAGS.target_socs.split(',')) + target_devices = sh_commands.adb_devices(target_socs=target_socs) + if FLAGS.target_socs == "random": + target_devices = [random.choice(target_devices)] + + target = FLAGS.target + host_bin_path, bin_name = sh_commands.bazel_target_to_bin(target) + target_abis = FLAGS.target_abis.split(',') + + # generate sources + sh_commands.gen_encrypted_opencl_source() + sh_commands.gen_compiled_opencl_source() + sh_commands.gen_mace_version() + + for target_abi in target_abis: + sh_commands.bazel_build(target, abi=target_abi) + if FLAGS.run_target: + for serialno in target_devices: + if target_abi not in set( + sh_commands.adb_supported_abis(serialno)): + print("Skip device %s which does not support ABI %s" % + (serialno, target_abi)) + continue + stdouts = sh_commands.adb_run( + serialno, + host_bin_path, + bin_name, + args=FLAGS.args, + opencl_profiling=1, + vlog_level=0, + device_bin_path="/data/local/tmp/mace", + out_of_range_check=1) + device_properties = sh_commands.adb_getprop_by_serialno( + serialno) + globals()[FLAGS.stdout_processor](stdouts, device_properties, + target_abi) + if __name__ == "__main__": - FLAGS, unparsed = parse_args() - main(unused_args=[sys.argv[0]] + unparsed) + FLAGS, unparsed = parse_args() + main(unused_args=[sys.argv[0]] + unparsed) diff --git a/tools/falcon_cli.py b/tools/falcon_cli.py index 7debe05c..de0fa298 100644 --- a/tools/falcon_cli.py +++ b/tools/falcon_cli.py @@ -1,9 +1,9 @@ -#-*- coding:utf8 -*- +import json +import socket +import itertools 
-import json, socket, itertools class FalconCli(object): - def __init__(self, addr, debug=True, buf_size=1000): self.socket_ = socket.create_connection(addr) self.stream = self.socket_.makefile() @@ -16,16 +16,19 @@ class FalconCli(object): self.stream.close() @classmethod - def connect(cls, server="transfer.falcon.miliao.srv", port=8433, debug=True, buf_size=1000): + def connect(cls, + server="transfer.falcon.miliao.srv", + port=8433, + debug=True, + buf_size=1000): try: return FalconCli((server, port), debug, buf_size) except socket.error, exc: - print "error: connect to %s:%s error: %s" %(server, port, exc) + print "error: connect to %s:%s error: %s" % (server, port, exc) def call(self, name, *params): - request = dict(id=next(self.id_counter), - params=list(params), - method=name) + request = dict( + id=next(self.id_counter), params=list(params), method=name) payload = json.dumps(request).encode() if self.debug: print "--> req:", payload @@ -49,7 +52,7 @@ class FalconCli(object): resp = [] while True: - buf = lines[s:s+self.buf_size] + buf = lines[s:s + self.buf_size] s = s + self.buf_size if len(buf) == 0: break @@ -57,4 +60,3 @@ class FalconCli(object): resp.append(r) return resp - diff --git a/tools/generate_data.py b/tools/generate_data.py index 8feff823..d9dda24b 100644 --- a/tools/generate_data.py +++ b/tools/generate_data.py @@ -11,45 +11,40 @@ import re # --input_file input_file # + def generate_data(name, shape): - np.random.seed() - data = np.random.random(shape) * 2 - 1 - input_file_name = FLAGS.input_file + "_" + re.sub('[^0-9a-zA-Z]+', '_', name) - print 'Generate input file: ', input_file_name - data.astype(np.float32).tofile(input_file_name) + np.random.seed() + data = np.random.random(shape) * 2 - 1 + input_file_name = FLAGS.input_file + "_" + re.sub('[^0-9a-zA-Z]+', '_', + name) + print 'Generate input file: ', input_file_name + data.astype(np.float32).tofile(input_file_name) + def main(unused_args): - input_names = [name for name in FLAGS.input_node.split(',')] - input_shapes = [shape for shape in FLAGS.input_shape.split(':')] - assert len(input_names) == len(input_shapes) - for i in range(len(input_names)): - shape = [int(x) for x in input_shapes[i].split(',')] - generate_data(input_names[i], shape) - print "Generate input file done." + input_names = [name for name in FLAGS.input_node.split(',')] + input_shapes = [shape for shape in FLAGS.input_shape.split(':')] + assert len(input_names) == len(input_shapes) + for i in range(len(input_names)): + shape = [int(x) for x in input_shapes[i].split(',')] + generate_data(input_names[i], shape) + print "Generate input file done." 
+ def parse_args(): - """Parses command line arguments.""" - parser = argparse.ArgumentParser() - parser.register("type", "bool", lambda v: v.lower() == "true") - parser.add_argument( - "--input_file", - type=str, - default="", - help="input file.") - parser.add_argument( - "--input_node", - type=str, - default="input_node", - help="input node") - parser.add_argument( - "--input_shape", - type=str, - default="1,64,64,3", - help="input shape.") - - return parser.parse_known_args() + """Parses command line arguments.""" + parser = argparse.ArgumentParser() + parser.register("type", "bool", lambda v: v.lower() == "true") + parser.add_argument( + "--input_file", type=str, default="", help="input file.") + parser.add_argument( + "--input_node", type=str, default="input_node", help="input node") + parser.add_argument( + "--input_shape", type=str, default="1,64,64,3", help="input shape.") + + return parser.parse_known_args() -if __name__ == '__main__': - FLAGS, unparsed = parse_args() - main(unused_args=[sys.argv[0]] + unparsed) +if __name__ == '__main__': + FLAGS, unparsed = parse_args() + main(unused_args=[sys.argv[0]] + unparsed) diff --git a/tools/mace_tools.py b/tools/mace_tools.py index d229573a..7f2ef056 100644 --- a/tools/mace_tools.py +++ b/tools/mace_tools.py @@ -23,124 +23,135 @@ from ConfigParser import ConfigParser def run_command(command): - print("Run command: {}".format(command)) - result = subprocess.Popen( - command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - out, err = result.communicate() + print("Run command: {}".format(command)) + result = subprocess.Popen( + command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + out, err = result.communicate() - if out: - print("Stdout msg:\n{}".format(out)) - if err: - print("Stderr msg:\n{}".format(err)) + if out: + print("Stdout msg:\n{}".format(out)) + if err: + print("Stderr msg:\n{}".format(err)) - if result.returncode != 0: - raise Exception("Exit not 0 from bash with code: {}, command: {}".format( - result.returncode, command)) + if result.returncode != 0: + raise Exception( + "Exit not 0 from bash with code: {}, command: {}".format( + result.returncode, command)) def get_global_runtime(configs): - runtime_list = [] - for model_name in configs["models"]: - model_runtime = configs["models"][model_name]["runtime"] - runtime_list.append(model_runtime.lower()) - - global_runtime = "" - if "dsp" in runtime_list: - global_runtime = "dsp" - elif "gpu" in runtime_list: - global_runtime = "gpu" - elif "cpu" in runtime_list: - global_runtime = "cpu" - elif "neon" in runtime_list: - global_runtime = "neon" - else: - raise Exception("Not found available RUNTIME in config files!") - - return global_runtime + runtime_list = [] + for model_name in configs["models"]: + model_runtime = configs["models"][model_name]["runtime"] + runtime_list.append(model_runtime.lower()) + + global_runtime = "" + if "dsp" in runtime_list: + global_runtime = "dsp" + elif "gpu" in runtime_list: + global_runtime = "gpu" + elif "cpu" in runtime_list: + global_runtime = "cpu" + elif "neon" in runtime_list: + global_runtime = "neon" + else: + raise Exception("Not found available RUNTIME in config files!") + + return global_runtime def generate_version_code(): - command = "bash tools/generate_version_code.sh" - run_command(command) + command = "bash tools/generate_version_code.sh" + run_command(command) + def generate_opencl_source_code(): - command = "bash tools/generate_opencl_code.sh source" - run_command(command) + command = "bash 
tools/generate_opencl_code.sh source" + run_command(command) + def generate_opencl_binay_code(target_soc, model_output_dirs, pull_or_not): - cl_bin_dirs = [] - for d in model_output_dirs: - cl_bin_dirs.append(os.path.join(d, "opencl_bin")) - cl_bin_dirs_str = ",".join(cl_bin_dirs) - if not cl_bin_dirs: - command = "bash tools/generate_opencl_code.sh binary" - else: - command = "bash tools/generate_opencl_code.sh {} {} {} {}".format( - 'binary', target_soc, cl_bin_dirs_str, int(pull_or_not)) - run_command(command) + cl_bin_dirs = [] + for d in model_output_dirs: + cl_bin_dirs.append(os.path.join(d, "opencl_bin")) + cl_bin_dirs_str = ",".join(cl_bin_dirs) + if not cl_bin_dirs: + command = "bash tools/generate_opencl_code.sh binary" + else: + command = "bash tools/generate_opencl_code.sh {} {} {} {}".format( + 'binary', target_soc, cl_bin_dirs_str, int(pull_or_not)) + run_command(command) + def generate_tuning_param_code(target_soc, model_output_dirs, pull_or_not): - cl_bin_dirs = [] - for d in model_output_dirs: - cl_bin_dirs.append(os.path.join(d, "opencl_bin")) - cl_bin_dirs_str = ",".join(cl_bin_dirs) - if not cl_bin_dirs: - command = "bash tools/generate_tuning_param_code.sh" - else: - command = "bash tools/generate_tuning_param_code.sh {} {} {}".format( - target_soc, cl_bin_dirs_str, int(pull_or_not)) - run_command(command) + cl_bin_dirs = [] + for d in model_output_dirs: + cl_bin_dirs.append(os.path.join(d, "opencl_bin")) + cl_bin_dirs_str = ",".join(cl_bin_dirs) + if not cl_bin_dirs: + command = "bash tools/generate_tuning_param_code.sh" + else: + command = "bash tools/generate_tuning_param_code.sh {} {} {}".format( + target_soc, cl_bin_dirs_str, int(pull_or_not)) + run_command(command) + def generate_code(target_soc, model_output_dirs, pull_or_not): - generate_opencl_binay_code(target_soc, model_output_dirs, pull_or_not) - generate_tuning_param_code(target_soc, model_output_dirs, pull_or_not) + generate_opencl_binay_code(target_soc, model_output_dirs, pull_or_not) + generate_tuning_param_code(target_soc, model_output_dirs, pull_or_not) + def clear_env(target_soc): - command = "bash tools/clear_env.sh {}".format(target_soc) - run_command(command) + command = "bash tools/clear_env.sh {}".format(target_soc) + run_command(command) + def input_file_name(input_name): - return os.environ['INPUT_FILE_NAME'] + '_' + \ - re.sub('[^0-9a-zA-Z]+', '_', input_name) - -def generate_random_input(target_soc, model_output_dir, - input_names, input_files): - generate_data_or_not = True - command = "bash tools/validate_tools.sh {} {} {}".format( - target_soc, model_output_dir, int(generate_data_or_not)) - run_command(command) - - input_file_list = [] - if isinstance(input_files, list): - input_file_list.extend(input_files) - else: - input_file_list.append(input_files) - if len(input_file_list) != 0: - input_name_list = [] - if isinstance(input_names, list): - input_name_list.extend(input_names) + return os.environ['INPUT_FILE_NAME'] + '_' + \ + re.sub('[^0-9a-zA-Z]+', '_', input_name) + + +def generate_random_input(target_soc, model_output_dir, input_names, + input_files): + generate_data_or_not = True + command = "bash tools/validate_tools.sh {} {} {}".format( + target_soc, model_output_dir, int(generate_data_or_not)) + run_command(command) + + input_file_list = [] + if isinstance(input_files, list): + input_file_list.extend(input_files) else: - input_name_list.append(input_names) - if len(input_file_list) != len(input_name_list): - raise Exception('If input_files set, the input files should match the 
input names.') - for i in range(len(input_file_list)): - if input_file_list[i] is not None: - dst_input_file = model_output_dir + '/' + input_file_name(input_name_list[i]) - if input_file_list[i].startswith("http://") or \ - input_file_list[i].startswith("https://"): - urllib.urlretrieve(input_file_list[i], dst_input_file) + input_file_list.append(input_files) + if len(input_file_list) != 0: + input_name_list = [] + if isinstance(input_names, list): + input_name_list.extend(input_names) else: - shutil.copy(input_file_list[i], dst_input_file) + input_name_list.append(input_names) + if len(input_file_list) != len(input_name_list): + raise Exception('If input_files set, the input files should ' + 'match the input names.') + for i in range(len(input_file_list)): + if input_file_list[i] is not None: + dst_input_file = model_output_dir + '/' + input_file_name( + input_name_list[i]) + if input_file_list[i].startswith("http://") or \ + input_file_list[i].startswith("https://"): + urllib.urlretrieve(input_file_list[i], dst_input_file) + else: + shutil.copy(input_file_list[i], dst_input_file) + def generate_model_code(): - command = "bash tools/generate_model_code.sh" - run_command(command) + command = "bash tools/generate_model_code.sh" + run_command(command) def build_mace_run(production_mode, model_output_dir, hexagon_mode): - command = "bash tools/build_mace_run.sh {} {} {}".format( - int(production_mode), model_output_dir, int(hexagon_mode)) - run_command(command) + command = "bash tools/build_mace_run.sh {} {} {}".format( + int(production_mode), model_output_dir, int(hexagon_mode)) + run_command(command) def tuning_run(model_name, @@ -152,301 +163,328 @@ def tuning_run(model_name, tuning, restart_round, option_args=''): - # TODO(yejianwu) refactoring the hackish code - stdout_buff = [] - process_output = sh_commands.make_output_processor(stdout_buff) - p = sh.bash("tools/tuning_run.sh", target_soc, model_output_dir, - running_round, int(tuning), - restart_round, option_args, _out=process_output, - _bg=True, _err_to_out=True) - p.wait() - metrics = {} - for line in stdout_buff: - line = line.strip() - parts = line.split() - if len(parts) == 6 and parts[0].startswith("time"): - metrics["%s.create_net_ms" % model_name] = str(float(parts[1])) - metrics["%s.mace_engine_ctor_ms" % model_name] = str(float(parts[2])) - metrics["%s.init_ms" % model_name] = str(float(parts[3])) - metrics["%s.warmup_ms" % model_name] = str(float(parts[4])) - if float(parts[5]) > 0: - metrics["%s.avg_latency_ms" % model_name] = str(float(parts[5])) - tags = {"ro.board.platform": target_soc, - "abi": target_abi, - # "runtime": target_runtime, # TODO(yejianwu) Add the actual runtime - "round": running_round, # TODO(yejianwu) change this to source/binary - "tuning": tuning} - sh_commands.falcon_push_metrics(metrics, endpoint="mace_model_benchmark", - tags=tags) + # TODO(yejianwu) refactoring the hackish code + stdout_buff = [] + process_output = sh_commands.make_output_processor(stdout_buff) + p = sh.bash( + "tools/tuning_run.sh", + target_soc, + model_output_dir, + running_round, + int(tuning), + restart_round, + option_args, + _out=process_output, + _bg=True, + _err_to_out=True) + p.wait() + metrics = {} + for line in stdout_buff: + line = line.strip() + parts = line.split() + if len(parts) == 6 and parts[0].startswith("time"): + metrics["%s.create_net_ms" % model_name] = str(float(parts[1])) + metrics["%s.mace_engine_ctor_ms" % model_name] = str( + float(parts[2])) + metrics["%s.init_ms" % model_name] = 
str(float(parts[3])) + metrics["%s.warmup_ms" % model_name] = str(float(parts[4])) + if float(parts[5]) > 0: + metrics["%s.avg_latency_ms" % model_name] = str( + float(parts[5])) + tags = { + "ro.board.platform": target_soc, + "abi": target_abi, + # "runtime": target_runtime, # TODO(yejianwu) Add the actual runtime + "round": running_round, # TODO(yejianwu) change this to source/binary + "tuning": tuning + } + sh_commands.falcon_push_metrics( + metrics, endpoint="mace_model_benchmark", tags=tags) + def benchmark_model(target_soc, model_output_dir, option_args=''): - command = "bash tools/benchmark.sh {} {} \"{}\"".format( - target_soc, model_output_dir, option_args) - run_command(command) + command = "bash tools/benchmark.sh {} {} \"{}\"".format( + target_soc, model_output_dir, option_args) + run_command(command) def run_model(model_name, target_runtime, target_abi, target_soc, model_output_dir, running_round, restart_round, option_args): - tuning_run(model_name, target_runtime, target_abi, target_soc, - model_output_dir, running_round, False, - restart_round, option_args) + tuning_run(model_name, target_runtime, target_abi, target_soc, + model_output_dir, running_round, False, restart_round, + option_args) def generate_production_code(target_soc, model_output_dirs, pull_or_not): - cl_bin_dirs = [] - for d in model_output_dirs: - cl_bin_dirs.append(os.path.join(d, "opencl_bin")) - cl_bin_dirs_str = ",".join(cl_bin_dirs) - command = "bash tools/generate_production_code.sh {} {} {}".format( - target_soc, cl_bin_dirs_str, int(pull_or_not)) - run_command(command) + cl_bin_dirs = [] + for d in model_output_dirs: + cl_bin_dirs.append(os.path.join(d, "opencl_bin")) + cl_bin_dirs_str = ",".join(cl_bin_dirs) + command = "bash tools/generate_production_code.sh {} {} {}".format( + target_soc, cl_bin_dirs_str, int(pull_or_not)) + run_command(command) def build_mace_run_prod(model_name, target_runtime, target_abi, target_soc, model_output_dir, tuning): - if "dsp" == target_runtime: - hexagon_mode = True - else: - hexagon_mode = False - - generate_code(target_soc, [], False) - production_or_not = False - build_mace_run(production_or_not, model_output_dir, hexagon_mode) - tuning_run( - model_name, - target_runtime, - target_abi, - target_soc, - model_output_dir, - running_round=0, - tuning=tuning, - restart_round=1) - - generate_code(target_soc, [model_output_dir], True) - production_or_not = True - build_mace_run(production_or_not, model_output_dir, hexagon_mode) + if "dsp" == target_runtime: + hexagon_mode = True + else: + hexagon_mode = False + + generate_code(target_soc, [], False) + production_or_not = False + build_mace_run(production_or_not, model_output_dir, hexagon_mode) + tuning_run( + model_name, + target_runtime, + target_abi, + target_soc, + model_output_dir, + running_round=0, + tuning=tuning, + restart_round=1) + + generate_code(target_soc, [model_output_dir], True) + production_or_not = True + build_mace_run(production_or_not, model_output_dir, hexagon_mode) def build_run_throughput_test(target_soc, run_seconds, merged_lib_file, model_input_dir): - command = "bash tools/build_run_throughput_test.sh {} {} {} {}".format( - target_soc, run_seconds, merged_lib_file, model_input_dir) - run_command(command) + command = "bash tools/build_run_throughput_test.sh {} {} {} {}".format( + target_soc, run_seconds, merged_lib_file, model_input_dir) + run_command(command) def validate_model(target_soc, model_output_dir): - generate_data_or_not = False - command = "bash tools/validate_tools.sh {} {} 
{}".format( - target_soc, model_output_dir, int(generate_data_or_not)) - run_command(command) + generate_data_or_not = False + command = "bash tools/validate_tools.sh {} {} {}".format( + target_soc, model_output_dir, int(generate_data_or_not)) + run_command(command) def build_production_code(): - command = "bash tools/build_production_code.sh" - run_command(command) + command = "bash tools/build_production_code.sh" + run_command(command) def merge_libs_and_tuning_results(target_soc, output_dir, model_output_dirs): - generate_code(target_soc, model_output_dirs, False) - build_production_code() + generate_code(target_soc, model_output_dirs, False) + build_production_code() - model_output_dirs_str = ",".join(model_output_dirs) - command = "bash tools/merge_libs.sh {} {} {}".format(target_soc, output_dir, - model_output_dirs_str) - run_command(command) + model_output_dirs_str = ",".join(model_output_dirs) + command = "bash tools/merge_libs.sh {} {} {}".format( + target_soc, output_dir, model_output_dirs_str) + run_command(command) def packaging_lib_file(output_dir): - command = "bash tools/packaging_lib.sh {}".format(output_dir) - run_command(command) + command = "bash tools/packaging_lib.sh {}".format(output_dir) + run_command(command) + def download_model_files(model_file_path, model_output_dir, weight_file_path=""): - if model_file_path.startswith("http://") or \ - model_file_path.startswith("https://"): - os.environ["MODEL_FILE_PATH"] = model_output_dir + "/model.pb" - urllib.urlretrieve(model_file_path, os.environ["MODEL_FILE_PATH"]) - - if weight_file_path.startswith("http://") or \ - weight_file_path.startswith("https://"): - os.environ[ - "WEIGHT_FILE_PATH"] = model_output_dir + "/model.caffemodel" - urllib.urlretrieve(weight_file_path, - os.environ["WEIGHT_FILE_PATH"]) + if model_file_path.startswith("http://") or \ + model_file_path.startswith("https://"): + os.environ["MODEL_FILE_PATH"] = model_output_dir + "/model.pb" + urllib.urlretrieve(model_file_path, os.environ["MODEL_FILE_PATH"]) + + if weight_file_path.startswith("http://") or \ + weight_file_path.startswith("https://"): + os.environ["WEIGHT_FILE_PATH"] = model_output_dir + "/model.caffemodel" + urllib.urlretrieve(weight_file_path, os.environ["WEIGHT_FILE_PATH"]) + def md5sum(str): - md5 = hashlib.md5() - md5.update(str) - return md5.hexdigest() + md5 = hashlib.md5() + md5.update(str) + return md5.hexdigest() def parse_model_configs(): - with open(FLAGS.config) as f: - configs = yaml.load(f) - return configs + with open(FLAGS.config) as f: + configs = yaml.load(f) + return configs def parse_args(): - """Parses command line arguments.""" - parser = argparse.ArgumentParser() - parser.register("type", "bool", lambda v: v.lower() == "true") - parser.add_argument( - "--config", - type=str, - default="./tool/config", - help="The global config file of models.") - parser.add_argument( - "--output_dir", type=str, default="build", help="The output dir.") - parser.add_argument( - "--round", type=int, default=1, help="The model running round.") - parser.add_argument( - "--run_seconds", - type=int, - default=10, - help="The model throughput test running seconds.") - parser.add_argument( - "--restart_round", type=int, default=1, help="The model restart round.") - parser.add_argument( - "--tuning", type="bool", default="true", help="Tune opencl params.") - parser.add_argument( - "--mode", - type=str, - default="all", - help="[build|run|validate|merge|all|throughput_test].") - parser.add_argument( - "--target_socs", - type=str, - 
default="all", - help="SoCs to build, comma seperated list (getprop ro.board.platform)") - return parser.parse_known_args() + """Parses command line arguments.""" + parser = argparse.ArgumentParser() + parser.register("type", "bool", lambda v: v.lower() == "true") + parser.add_argument( + "--config", + type=str, + default="./tool/config", + help="The global config file of models.") + parser.add_argument( + "--output_dir", type=str, default="build", help="The output dir.") + parser.add_argument( + "--round", type=int, default=1, help="The model running round.") + parser.add_argument( + "--run_seconds", + type=int, + default=10, + help="The model throughput test running seconds.") + parser.add_argument( + "--restart_round", + type=int, + default=1, + help="The model restart round.") + parser.add_argument( + "--tuning", type="bool", default="true", help="Tune opencl params.") + parser.add_argument( + "--mode", + type=str, + default="all", + help="[build|run|validate|merge|all|throughput_test].") + parser.add_argument( + "--target_socs", + type=str, + default="all", + help="SoCs to build, comma seperated list (getprop ro.board.platform)") + return parser.parse_known_args() + def set_environment(configs): - os.environ["EMBED_MODEL_DATA"] = str(configs["embed_model_data"]) - os.environ["VLOG_LEVEL"] = str(configs["vlog_level"]) - os.environ["PROJECT_NAME"] = os.path.splitext(os.path.basename( - FLAGS.config))[0] - os.environ['INPUT_FILE_NAME'] = "model_input" - os.environ['OUTPUT_FILE_NAME'] = "model_out" + os.environ["EMBED_MODEL_DATA"] = str(configs["embed_model_data"]) + os.environ["VLOG_LEVEL"] = str(configs["vlog_level"]) + os.environ["PROJECT_NAME"] = os.path.splitext( + os.path.basename(FLAGS.config))[0] + os.environ['INPUT_FILE_NAME'] = "model_input" + os.environ['OUTPUT_FILE_NAME'] = "model_out" + def main(unused_args): - configs = parse_model_configs() - - if FLAGS.mode == "validate": - FLAGS.round = 1 - FLAGS.restart_round = 1 - - set_environment(configs) - - if FLAGS.mode == "build" or FLAGS.mode == "all": - # Remove previous output dirs - if not os.path.exists(FLAGS.output_dir): - os.makedirs(FLAGS.output_dir) - elif os.path.exists(os.path.join(FLAGS.output_dir, "libmace")): - shutil.rmtree(os.path.join(FLAGS.output_dir, os.environ["PROJECT_NAME"])) - os.makedirs(os.path.join(FLAGS.output_dir, os.environ["PROJECT_NAME"])) - - generate_version_code() - generate_opencl_source_code() - - option_args = ' '.join([arg for arg in unused_args if arg.startswith('--')]) - - available_socs = sh_commands.adb_get_all_socs() - target_socs = available_socs - if hasattr(configs, "target_socs"): - target_socs = set(configs["target_socs"]) - target_socs = target_socs & available_socs - - if FLAGS.target_socs != "all": - socs = set(FLAGS.target_socs.split(',')) - target_socs = target_socs & socs - missing_socs = socs.difference(target_socs) - if len(missing_socs) > 0: - print("Error: devices with SoCs are not connected %s" % missing_socs) - exit(1) - - - for target_soc in target_socs: - for target_abi in configs["target_abis"]: - global_runtime = get_global_runtime(configs) - # Transfer params by environment - os.environ["TARGET_ABI"] = target_abi - model_output_dirs = [] - for model_name in configs["models"]: - print '=======================', model_name, '=======================' - # Transfer params by environment - os.environ["MODEL_TAG"] = model_name - model_config = configs["models"][model_name] - input_file_list = model_config.get("validation_inputs_data", []) - for key in model_config: - if key 
in ['input_nodes', 'output_nodes'] and isinstance( - model_config[key], list): - os.environ[key.upper()] = ",".join(model_config[key]) - elif key in ['input_shapes', 'output_shapes'] and isinstance( - model_config[key], list): - os.environ[key.upper()] = ":".join(model_config[key]) - else: - os.environ[key.upper()] = str(model_config[key]) - - # Create model build directory - model_path_digest = md5sum(model_config["model_file_path"]) - model_output_dir = "%s/%s/%s/%s/%s/%s/%s" % (FLAGS.output_dir, - os.environ["PROJECT_NAME"], - "build", model_name, - model_path_digest, - target_soc, target_abi) - model_output_dirs.append(model_output_dir) - - if FLAGS.mode == "build" or FLAGS.mode == "all": - if os.path.exists(model_output_dir): - shutil.rmtree(model_output_dir) - os.makedirs(model_output_dir) - clear_env(target_soc) - - download_model_files(model_config["model_file_path"], - model_output_dir, model_config.get("weight_file_path", "")) - - if FLAGS.mode == "build" or FLAGS.mode == "run" or FLAGS.mode == "validate"\ - or FLAGS.mode == "benchmark" or FLAGS.mode == "all": - generate_random_input(target_soc, model_output_dir, - model_config['input_nodes'], input_file_list) - - if FLAGS.mode == "build" or FLAGS.mode == "all": - generate_model_code() - build_mace_run_prod(model_name, global_runtime, target_abi, - target_soc, model_output_dir, FLAGS.tuning) - - if FLAGS.mode == "run" or FLAGS.mode == "validate" or FLAGS.mode == "all": - run_model(model_name, global_runtime, target_abi, target_soc, - model_output_dir, FLAGS.round, FLAGS.restart_round, - option_args) - - if FLAGS.mode == "benchmark": - benchmark_model(target_soc, model_output_dir, option_args) - - if FLAGS.mode == "validate" or FLAGS.mode == "all": - validate_model(target_soc, model_output_dir) - - if FLAGS.mode == "build" or FLAGS.mode == "merge" or FLAGS.mode == "all": - merge_libs_and_tuning_results( - target_soc, FLAGS.output_dir + "/" + os.environ["PROJECT_NAME"], - model_output_dirs) - - if FLAGS.mode == "throughput_test": - merged_lib_file = FLAGS.output_dir + "/%s/%s/libmace_%s.%s.a" % \ - (os.environ["PROJECT_NAME"], target_abi, os.environ["PROJECT_NAME"], target_soc) - generate_random_input(target_soc, FLAGS.output_dir, [], []) - for model_name in configs["models"]: - runtime = configs["models"][model_name]["runtime"] - os.environ["%s_MODEL_TAG" % runtime.upper()] = model_name - build_run_throughput_test(target_soc, FLAGS.run_seconds, - merged_lib_file, FLAGS.output_dir) - - if FLAGS.mode == "build" or FLAGS.mode == "all": - packaging_lib_file(FLAGS.output_dir) + configs = parse_model_configs() + + if FLAGS.mode == "validate": + FLAGS.round = 1 + FLAGS.restart_round = 1 + + set_environment(configs) + + if FLAGS.mode == "build" or FLAGS.mode == "all": + # Remove previous output dirs + if not os.path.exists(FLAGS.output_dir): + os.makedirs(FLAGS.output_dir) + elif os.path.exists(os.path.join(FLAGS.output_dir, "libmace")): + shutil.rmtree( + os.path.join(FLAGS.output_dir, os.environ["PROJECT_NAME"])) + os.makedirs( + os.path.join(FLAGS.output_dir, os.environ["PROJECT_NAME"])) + + generate_version_code() + generate_opencl_source_code() + + option_args = ' '.join( + [arg for arg in unused_args if arg.startswith('--')]) + + available_socs = sh_commands.adb_get_all_socs() + target_socs = available_socs + if hasattr(configs, "target_socs"): + target_socs = set(configs["target_socs"]) + target_socs = target_socs & available_socs + + if FLAGS.target_socs != "all": + socs = set(FLAGS.target_socs.split(',')) + target_socs = 
target_socs & socs + missing_socs = socs.difference(target_socs) + if len(missing_socs) > 0: + print( + "Error: devices with SoCs are not connected %s" % missing_socs) + exit(1) + + for target_soc in target_socs: + for target_abi in configs["target_abis"]: + global_runtime = get_global_runtime(configs) + # Transfer params by environment + os.environ["TARGET_ABI"] = target_abi + model_output_dirs = [] + for model_name in configs["models"]: + print '===================', model_name, '===================' + # Transfer params by environment + os.environ["MODEL_TAG"] = model_name + model_config = configs["models"][model_name] + input_file_list = model_config.get("validation_inputs_data", + []) + for key in model_config: + if key in ['input_nodes', 'output_nodes'] and isinstance( + model_config[key], list): + os.environ[key.upper()] = ",".join(model_config[key]) + elif key in ['input_shapes', 'output_shapes' + ] and isinstance(model_config[key], list): + os.environ[key.upper()] = ":".join(model_config[key]) + else: + os.environ[key.upper()] = str(model_config[key]) + + # Create model build directory + model_path_digest = md5sum(model_config["model_file_path"]) + model_output_dir = "%s/%s/%s/%s/%s/%s/%s" % ( + FLAGS.output_dir, os.environ["PROJECT_NAME"], "build", + model_name, model_path_digest, target_soc, target_abi) + model_output_dirs.append(model_output_dir) + + if FLAGS.mode == "build" or FLAGS.mode == "all": + if os.path.exists(model_output_dir): + shutil.rmtree(model_output_dir) + os.makedirs(model_output_dir) + clear_env(target_soc) + + download_model_files(model_config["model_file_path"], + model_output_dir, + model_config.get("weight_file_path", "")) + + if FLAGS.mode == "build" or FLAGS.mode == "run" or \ + FLAGS.mode == "validate" or \ + FLAGS.mode == "benchmark" or FLAGS.mode == "all": + generate_random_input(target_soc, model_output_dir, + model_config['input_nodes'], + input_file_list) + + if FLAGS.mode == "build" or FLAGS.mode == "all": + generate_model_code() + build_mace_run_prod(model_name, global_runtime, target_abi, + target_soc, model_output_dir, + FLAGS.tuning) + + if FLAGS.mode == "run" or FLAGS.mode == "validate" or \ + FLAGS.mode == "all": + run_model(model_name, global_runtime, target_abi, + target_soc, model_output_dir, FLAGS.round, + FLAGS.restart_round, option_args) + + if FLAGS.mode == "benchmark": + benchmark_model(target_soc, model_output_dir, option_args) + + if FLAGS.mode == "validate" or FLAGS.mode == "all": + validate_model(target_soc, model_output_dir) + + if FLAGS.mode == "build" or FLAGS.mode == "merge" or \ + FLAGS.mode == "all": + merge_libs_and_tuning_results( + target_soc, + FLAGS.output_dir + "/" + os.environ["PROJECT_NAME"], + model_output_dirs) + + if FLAGS.mode == "throughput_test": + merged_lib_file = FLAGS.output_dir + \ + "/%s/%s/libmace_%s.%s.a" % \ + (os.environ["PROJECT_NAME"], target_abi, + os.environ["PROJECT_NAME"], target_soc) + generate_random_input(target_soc, FLAGS.output_dir, [], []) + for model_name in configs["models"]: + runtime = configs["models"][model_name]["runtime"] + os.environ["%s_MODEL_TAG" % runtime.upper()] = model_name + build_run_throughput_test(target_soc, FLAGS.run_seconds, + merged_lib_file, FLAGS.output_dir) + + if FLAGS.mode == "build" or FLAGS.mode == "all": + packaging_lib_file(FLAGS.output_dir) if __name__ == "__main__": - FLAGS, unparsed = parse_args() - main(unused_args=[sys.argv[0]] + unparsed) - + FLAGS, unparsed = parse_args() + main(unused_args=[sys.argv[0]] + unparsed) diff --git a/tools/sh_commands.py 
b/tools/sh_commands.py index 666ce7b0..8c1bd423 100644 --- a/tools/sh_commands.py +++ b/tools/sh_commands.py @@ -3,172 +3,205 @@ import re import time import falcon_cli + ################################ # common ################################ def strip_invalid_utf8(str): - return sh.iconv(str, "-c", "-t", "UTF-8") + return sh.iconv(str, "-c", "-t", "UTF-8") + def make_output_processor(buff): - def process_output(line): - print(line.strip()) - buff.append(line) - return process_output + def process_output(line): + print(line.strip()) + buff.append(line) + + return process_output + ################################ # adb commands ################################ def adb_split_stdout(stdout_str): - stdout_str = strip_invalid_utf8(stdout_str) - # Filter out last empty line - return [l.strip() for l in stdout_str.split('\n') if len(l.strip()) > 0] + stdout_str = strip_invalid_utf8(stdout_str) + # Filter out last empty line + return [l.strip() for l in stdout_str.split('\n') if len(l.strip()) > 0] + def adb_devices(target_socs=None): - outputs = sh.grep(sh.adb("devices"), "^[A-Za-z0-9]\+[[:space:]]\+device$") - raw_lists = sh.cut(outputs, "-f1") - device_ids = adb_split_stdout(raw_lists) - if target_socs != None: - target_socs_set = set(target_socs) - target_devices = [] - for serialno in device_ids: - props = adb_getprop_by_serialno(serialno) - if props["ro.board.platform"] in target_socs_set: - target_devices.append(serialno) - return target_devices - else: - return device_ids + outputs = sh.grep(sh.adb("devices"), "^[A-Za-z0-9]\+[[:space:]]\+device$") + raw_lists = sh.cut(outputs, "-f1") + device_ids = adb_split_stdout(raw_lists) + if target_socs is not None: + target_socs_set = set(target_socs) + target_devices = [] + for serialno in device_ids: + props = adb_getprop_by_serialno(serialno) + if props["ro.board.platform"] in target_socs_set: + target_devices.append(serialno) + return target_devices + else: + return device_ids + def adb_getprop_by_serialno(serialno): - outputs = sh.adb("-s", serialno, "shell", "getprop") - raw_props = adb_split_stdout(outputs) - props = {} - p = re.compile("\[(.+)\]: \[(.+)\]") - for raw_prop in raw_props: - m = p.match(raw_prop) - if m: - props[m.group(1)] = m.group(2) - return props + outputs = sh.adb("-s", serialno, "shell", "getprop") + raw_props = adb_split_stdout(outputs) + props = {} + p = re.compile("\[(.+)\]: \[(.+)\]") + for raw_prop in raw_props: + m = p.match(raw_prop) + if m: + props[m.group(1)] = m.group(2) + return props + def adb_supported_abis(serialno): - props = adb_getprop_by_serialno(serialno) - abilist_str = props["ro.product.cpu.abilist"] - abis = [abi.strip() for abi in abilist_str.split(',')] - return abis + props = adb_getprop_by_serialno(serialno) + abilist_str = props["ro.product.cpu.abilist"] + abis = [abi.strip() for abi in abilist_str.split(',')] + return abis + def adb_get_all_socs(): - socs = [] - for d in adb_devices(): - props = adb_getprop_by_serialno(d) - socs.append(props["ro.board.platform"]) - return set(socs) + socs = [] + for d in adb_devices(): + props = adb_getprop_by_serialno(d) + socs.append(props["ro.board.platform"]) + return set(socs) -def adb_run(serialno, host_bin_path, bin_name, + +def adb_run(serialno, + host_bin_path, + bin_name, args="", opencl_profiling=1, vlog_level=0, device_bin_path="/data/local/tmp/mace", out_of_range_check=1): - host_bin_full_path = "%s/%s" % (host_bin_path, bin_name) - device_bin_full_path = "%s/%s" % (device_bin_path, bin_name) - props = adb_getprop_by_serialno(serialno) - 
print("=====================================================================") - print("Run on device: %s, %s, %s" % (serialno, props["ro.board.platform"], - props["ro.product.model"])) - sh.adb("-s", serialno, "shell", "rm -rf %s" % device_bin_path) - sh.adb("-s", serialno, "shell", "mkdir -p %s" % device_bin_path) - print("Push %s to %s" % (host_bin_full_path, device_bin_full_path)) - sh.adb("-s", serialno, "push", host_bin_full_path, device_bin_full_path) - print("Run %s" % device_bin_full_path) - stdout_buff=[] - process_output = make_output_processor(stdout_buff) - p = sh.adb("-s", serialno, "shell", - "MACE_OUT_OF_RANGE_CHECK=%d MACE_OPENCL_PROFILING=%d MACE_CPP_MIN_VLOG_LEVEL=%d %s %s" % - (out_of_range_check, opencl_profiling, vlog_level, device_bin_full_path, args), - _out=process_output, _bg=True, _err_to_out=True) - p.wait() - return "".join(stdout_buff) + host_bin_full_path = "%s/%s" % (host_bin_path, bin_name) + device_bin_full_path = "%s/%s" % (device_bin_path, bin_name) + props = adb_getprop_by_serialno(serialno) + print( + "=====================================================================" + ) + print("Run on device: %s, %s, %s" % (serialno, props["ro.board.platform"], + props["ro.product.model"])) + sh.adb("-s", serialno, "shell", "rm -rf %s" % device_bin_path) + sh.adb("-s", serialno, "shell", "mkdir -p %s" % device_bin_path) + print("Push %s to %s" % (host_bin_full_path, device_bin_full_path)) + sh.adb("-s", serialno, "push", host_bin_full_path, device_bin_full_path) + print("Run %s" % device_bin_full_path) + stdout_buff = [] + process_output = make_output_processor(stdout_buff) + p = sh.adb( + "-s", + serialno, + "shell", + "MACE_OUT_OF_RANGE_CHECK=%d MACE_OPENCL_PROFILING=%d " + "MACE_CPP_MIN_VLOG_LEVEL=%d %s %s" % + (out_of_range_check, opencl_profiling, vlog_level, + device_bin_full_path, args), + _out=process_output, + _bg=True, + _err_to_out=True) + p.wait() + return "".join(stdout_buff) ################################ # bazel commands ################################ def bazel_build(target, strip="always", abi="armeabi-v7a"): - print("Build %s with ABI %s" % (target, abi)) - stdout_buff=[] - process_output = make_output_processor(stdout_buff) - p= sh.bazel("build", - "-c", "opt", - "--strip", strip, - "--verbose_failures", - target, - "--crosstool_top=//external:android/crosstool", - "--host_crosstool_top=@bazel_tools//tools/cpp:toolchain", - "--cpu=%s" % abi, - "--copt=-std=c++11", - "--copt=-D_GLIBCXX_USE_C99_MATH_TR1", - "--copt=-DMACE_DISABLE_NO_TUNING_WARNING", - "--copt=-Werror=return-type", - "--copt=-O3", - "--define", "neon=true", - "--define", "openmp=true", - _out=process_output, _bg=True, _err_to_out=True) - p.wait() - return "".join(stdout_buff) + print("Build %s with ABI %s" % (target, abi)) + stdout_buff = [] + process_output = make_output_processor(stdout_buff) + p = sh.bazel( + "build", + "-c", + "opt", + "--strip", + strip, + "--verbose_failures", + target, + "--crosstool_top=//external:android/crosstool", + "--host_crosstool_top=@bazel_tools//tools/cpp:toolchain", + "--cpu=%s" % abi, + "--copt=-std=c++11", + "--copt=-D_GLIBCXX_USE_C99_MATH_TR1", + "--copt=-DMACE_DISABLE_NO_TUNING_WARNING", + "--copt=-Werror=return-type", + "--copt=-O3", + "--define", + "neon=true", + "--define", + "openmp=true", + _out=process_output, + _bg=True, + _err_to_out=True) + p.wait() + return "".join(stdout_buff) + def bazel_target_to_bin(target): - # change //mace/a/b:c to bazel-bin/mace/a/b/c - prefix, bin_name = target.split(':') - prefix = prefix.replace('//', 
'/') - if prefix.startswith('/'): - prefix = prefix[1:] - host_bin_path = "bazel-bin/%s" % prefix - return host_bin_path, bin_name + # change //mace/a/b:c to bazel-bin/mace/a/b/c + prefix, bin_name = target.split(':') + prefix = prefix.replace('//', '/') + if prefix.startswith('/'): + prefix = prefix[1:] + host_bin_path = "bazel-bin/%s" % prefix + return host_bin_path, bin_name + ################################ # mace commands ################################ # TODO this should be refactored def gen_encrypted_opencl_source(codegen_path="mace/codegen"): - sh.mkdir("-p", "%s/opencl" % codegen_path) - sh.python("mace/python/tools/encrypt_opencl_codegen.py", - "--cl_kernel_dir=./mace/kernels/opencl/cl/", - "--output_path=%s/opencl/opencl_encrypt_program.cc" % codegen_path) + sh.mkdir("-p", "%s/opencl" % codegen_path) + sh.python( + "mace/python/tools/encrypt_opencl_codegen.py", + "--cl_kernel_dir=./mace/kernels/opencl/cl/", + "--output_path=%s/opencl/opencl_encrypt_program.cc" % codegen_path) + def gen_mace_version(codegen_path="mace/codegen"): - sh.mkdir("-p", "%s/version" % codegen_path) - sh.bash("mace/tools/git/gen_version_source.sh", - "%s/version/version.cc" % codegen_path) + sh.mkdir("-p", "%s/version" % codegen_path) + sh.bash("mace/tools/git/gen_version_source.sh", + "%s/version/version.cc" % codegen_path) + def gen_compiled_opencl_source(codegen_path="mace/codegen"): - sh.mkdir("-p", "%s/opencl" % codegen_path) - sh.python("mace/python/tools/opencl_codegen.py", - "--output_path=%s/opencl/opencl_compiled_program.cc" % codegen_path) + sh.mkdir("-p", "%s/opencl" % codegen_path) + sh.python( + "mace/python/tools/opencl_codegen.py", + "--output_path=%s/opencl/opencl_compiled_program.cc" % codegen_path) + ################################ # falcon ################################ def falcon_tags(tags_dict): - tags = "" - for k, v in tags_dict.iteritems(): - if tags == "": - tags = "%s=%s" % (k, v) - else: - tags = tags + ",%s=%s" % (k, v) - return tags + tags = "" + for k, v in tags_dict.iteritems(): + if tags == "": + tags = "%s=%s" % (k, v) + else: + tags = tags + ",%s=%s" % (k, v) + return tags -def falcon_push_metrics(metrics, endpoint="mace_dev", tags={}): - cli = falcon_cli.FalconCli.connect(server="transfer.falcon.miliao.srv", - port=8433, - debug=False) - ts = int(time.time()) - falcon_metrics = [{ - "endpoint": endpoint, - "metric": key, - "tags": falcon_tags(tags), - "timestamp": ts, - "value": value, - "step": 86400, - "counterType": "GAUGE" - } for key, value in metrics.iteritems()] - cli.update(falcon_metrics) +def falcon_push_metrics(metrics, endpoint="mace_dev", tags={}): + cli = falcon_cli.FalconCli.connect( + server="transfer.falcon.miliao.srv", port=8433, debug=False) + ts = int(time.time()) + falcon_metrics = [{ + "endpoint": endpoint, + "metric": key, + "tags": falcon_tags(tags), + "timestamp": ts, + "value": value, + "step": 86400, + "counterType": "GAUGE" + } for key, value in metrics.iteritems()] + cli.update(falcon_metrics) diff --git a/tools/validate.py b/tools/validate.py index 9cf27e31..bc93d709 100644 --- a/tools/validate.py +++ b/tools/validate.py @@ -20,175 +20,172 @@ from scipy import stats # --input_shape 1,64,64,3 \ # --output_shape 1,64,64,2 + def load_data(file): - if os.path.isfile(file): - return np.fromfile(file=file, dtype=np.float32) - else: - return np.empty([0]) + if os.path.isfile(file): + return np.fromfile(file=file, dtype=np.float32) + else: + return np.empty([0]) + def format_output_name(name): - return re.sub('[^0-9a-zA-Z]+', '_', name) + 
return re.sub('[^0-9a-zA-Z]+', '_', name) + def compare_output(output_name, mace_out_value, out_value): - if mace_out_value.size != 0: - out_value = out_value.reshape(-1) - mace_out_value = mace_out_value.reshape(-1) - assert len(out_value) == len(mace_out_value) - similarity = (1 - spatial.distance.cosine(out_value, mace_out_value)) - print output_name, 'MACE VS', FLAGS.platform.upper(), 'similarity: ', similarity - if (FLAGS.mace_runtime == "cpu" and similarity > 0.999) or \ - (FLAGS.mace_runtime == "neon" and similarity > 0.999) or \ - (FLAGS.mace_runtime == "gpu" and similarity > 0.995) or \ - (FLAGS.mace_runtime == "dsp" and similarity > 0.930): - print '=======================Similarity Test Passed======================' + if mace_out_value.size != 0: + out_value = out_value.reshape(-1) + mace_out_value = mace_out_value.reshape(-1) + assert len(out_value) == len(mace_out_value) + similarity = (1 - spatial.distance.cosine(out_value, mace_out_value)) + print output_name, 'MACE VS', FLAGS.platform.upper( + ), 'similarity: ', similarity + if (FLAGS.mace_runtime == "cpu" and similarity > 0.999) or \ + (FLAGS.mace_runtime == "neon" and similarity > 0.999) or \ + (FLAGS.mace_runtime == "gpu" and similarity > 0.995) or \ + (FLAGS.mace_runtime == "dsp" and similarity > 0.930): + print '===================Similarity Test Passed==================' + else: + print '===================Similarity Test Failed==================' + sys.exit(-1) else: - print '=======================Similarity Test Failed======================' - sys.exit(-1) - else: - print '=======================Skip empty node===================' - sys.exit(-1) + print '=======================Skip empty node===================' + sys.exit(-1) def validate_tf_model(input_names, input_shapes, output_names): - import tensorflow as tf - if not os.path.isfile(FLAGS.model_file): - print("Input graph file '" + FLAGS.model_file + "' does not exist!") - sys.exit(-1) - - input_graph_def = tf.GraphDef() - with open(FLAGS.model_file, "rb") as f: - data = f.read() - input_graph_def.ParseFromString(data) - tf.import_graph_def(input_graph_def, name="") - - with tf.Session() as session: - with session.graph.as_default() as graph: + import tensorflow as tf + if not os.path.isfile(FLAGS.model_file): + print("Input graph file '" + FLAGS.model_file + "' does not exist!") + sys.exit(-1) + + input_graph_def = tf.GraphDef() + with open(FLAGS.model_file, "rb") as f: + data = f.read() + input_graph_def.ParseFromString(data) tf.import_graph_def(input_graph_def, name="") - input_dict = {} - for i in range(len(input_names)): - input_value = load_data(FLAGS.input_file + "_" + input_names[i]) - input_value = input_value.reshape(input_shapes[i]) - input_node = graph.get_tensor_by_name(input_names[i] + ':0') - input_dict[input_node] = input_value - - output_nodes = [] - for name in output_names: - output_nodes.extend([graph.get_tensor_by_name(name + ':0')]) - output_values = session.run(output_nodes, feed_dict=input_dict) - for i in range(len(output_names)): - output_file_name = FLAGS.mace_out_file + "_" + format_output_name(output_names[i]) - mace_out_value = load_data(output_file_name) - compare_output(output_names[i], mace_out_value, output_values[i]) - -def validate_caffe_model(input_names, input_shapes, output_names, output_shapes): - os.environ['GLOG_minloglevel'] = '1' # suprress Caffe verbose prints - import caffe - if not os.path.isfile(FLAGS.model_file): - print("Input graph file '" + FLAGS.model_file + "' does not exist!") - sys.exit(-1) - if not 
os.path.isfile(FLAGS.weight_file): - print("Input weight file '" + FLAGS.weight_file + "' does not exist!") - sys.exit(-1) - - caffe.set_mode_cpu() - - net = caffe.Net(FLAGS.model_file, caffe.TEST, weights=FLAGS.weight_file) - - for i in range(len(input_names)): - input_value = load_data(FLAGS.input_file + "_" + input_names[i]) - input_value = input_value.reshape(input_shapes[i]).transpose((0, 3, 1, 2)) - input_blob_name = input_names[i] - try: - if input_names[i] in net.top_names: - input_blob_name = net.top_names[input_names[i]][0] - except ValueError: - pass - net.blobs[input_blob_name].data[0] = input_value - - net.forward() - - for i in range(len(output_names)): - value = net.blobs[net.top_names[output_names[i]][0]].data - out_shape = output_shapes[i] - out_shape[1], out_shape[2], out_shape[3] = out_shape[3], out_shape[1], out_shape[2] - value = value.reshape(out_shape).transpose((0, 2, 3, 1)) - output_file_name = FLAGS.mace_out_file + "_" + format_output_name(output_names[i]) - mace_out_value = load_data(output_file_name) - compare_output(output_names[i], mace_out_value, value) + + with tf.Session() as session: + with session.graph.as_default() as graph: + tf.import_graph_def(input_graph_def, name="") + input_dict = {} + for i in range(len(input_names)): + input_value = load_data( + FLAGS.input_file + "_" + input_names[i]) + input_value = input_value.reshape(input_shapes[i]) + input_node = graph.get_tensor_by_name( + input_names[i] + ':0') + input_dict[input_node] = input_value + + output_nodes = [] + for name in output_names: + output_nodes.extend( + [graph.get_tensor_by_name(name + ':0')]) + output_values = session.run(output_nodes, feed_dict=input_dict) + for i in range(len(output_names)): + output_file_name = FLAGS.mace_out_file + "_" + \ + format_output_name(output_names[i]) + mace_out_value = load_data(output_file_name) + compare_output(output_names[i], mace_out_value, + output_values[i]) + + +def validate_caffe_model(input_names, input_shapes, output_names, + output_shapes): + os.environ['GLOG_minloglevel'] = '1' # suprress Caffe verbose prints + import caffe + if not os.path.isfile(FLAGS.model_file): + print("Input graph file '" + FLAGS.model_file + "' does not exist!") + sys.exit(-1) + if not os.path.isfile(FLAGS.weight_file): + print("Input weight file '" + FLAGS.weight_file + "' does not exist!") + sys.exit(-1) + + caffe.set_mode_cpu() + + net = caffe.Net(FLAGS.model_file, caffe.TEST, weights=FLAGS.weight_file) + + for i in range(len(input_names)): + input_value = load_data(FLAGS.input_file + "_" + input_names[i]) + input_value = input_value.reshape(input_shapes[i]).transpose((0, 3, 1, + 2)) + input_blob_name = input_names[i] + try: + if input_names[i] in net.top_names: + input_blob_name = net.top_names[input_names[i]][0] + except ValueError: + pass + net.blobs[input_blob_name].data[0] = input_value + + net.forward() + + for i in range(len(output_names)): + value = net.blobs[net.top_names[output_names[i]][0]].data + out_shape = output_shapes[i] + out_shape[1], out_shape[2], out_shape[3] = out_shape[3], out_shape[ + 1], out_shape[2] + value = value.reshape(out_shape).transpose((0, 2, 3, 1)) + output_file_name = FLAGS.mace_out_file + "_" + format_output_name( + output_names[i]) + mace_out_value = load_data(output_file_name) + compare_output(output_names[i], mace_out_value, value) + def main(unused_args): - input_names = [name for name in FLAGS.input_node.split(',')] - input_shape_strs = [shape for shape in FLAGS.input_shape.split(':')] - input_shapes = [[int(x) for x in 
shape.split(',')] for shape in input_shape_strs] - output_names = [name for name in FLAGS.output_node.split(',')] - assert len(input_names) == len(input_shapes) - - if FLAGS.platform == 'tensorflow': - validate_tf_model(input_names, input_shapes, output_names) - elif FLAGS.platform == 'caffe': - output_shape_strs = [shape for shape in FLAGS.output_shape.split(':')] - output_shapes = [[int(x) for x in shape.split(',')] for shape in output_shape_strs] - validate_caffe_model(input_names, input_shapes, output_names, output_shapes) + input_names = [name for name in FLAGS.input_node.split(',')] + input_shape_strs = [shape for shape in FLAGS.input_shape.split(':')] + input_shapes = [[int(x) for x in shape.split(',')] + for shape in input_shape_strs] + output_names = [name for name in FLAGS.output_node.split(',')] + assert len(input_names) == len(input_shapes) + + if FLAGS.platform == 'tensorflow': + validate_tf_model(input_names, input_shapes, output_names) + elif FLAGS.platform == 'caffe': + output_shape_strs = [shape for shape in FLAGS.output_shape.split(':')] + output_shapes = [[int(x) for x in shape.split(',')] + for shape in output_shape_strs] + validate_caffe_model(input_names, input_shapes, output_names, + output_shapes) + def parse_args(): - """Parses command line arguments.""" - parser = argparse.ArgumentParser() - parser.register("type", "bool", lambda v: v.lower() == "true") - parser.add_argument( - "--platform", - type=str, - default="", - help="Tensorflow or Caffe.") - parser.add_argument( - "--model_file", - type=str, - default="", - help="TensorFlow or Caffe \'GraphDef\' file to load.") - parser.add_argument( - "--weight_file", - type=str, - default="", - help="caffe model file to load.") - parser.add_argument( - "--input_file", - type=str, - default="", - help="input file.") - parser.add_argument( - "--mace_out_file", - type=str, - default="", - help="mace output file to load.") - parser.add_argument( - "--mace_runtime", - type=str, - default="gpu", - help="mace runtime device.") - parser.add_argument( - "--input_shape", - type=str, - default="1,64,64,3", - help="input shape.") - parser.add_argument( - "--output_shape", - type=str, - default="1,64,64,2", - help="output shape.") - parser.add_argument( - "--input_node", - type=str, - default="input_node", - help="input node") - parser.add_argument( - "--output_node", - type=str, - default="output_node", - help="output node") - - return parser.parse_known_args() + """Parses command line arguments.""" + parser = argparse.ArgumentParser() + parser.register("type", "bool", lambda v: v.lower() == "true") + parser.add_argument( + "--platform", type=str, default="", help="Tensorflow or Caffe.") + parser.add_argument( + "--model_file", + type=str, + default="", + help="TensorFlow or Caffe \'GraphDef\' file to load.") + parser.add_argument( + "--weight_file", + type=str, + default="", + help="caffe model file to load.") + parser.add_argument( + "--input_file", type=str, default="", help="input file.") + parser.add_argument( + "--mace_out_file", + type=str, + default="", + help="mace output file to load.") + parser.add_argument( + "--mace_runtime", type=str, default="gpu", help="mace runtime device.") + parser.add_argument( + "--input_shape", type=str, default="1,64,64,3", help="input shape.") + parser.add_argument( + "--output_shape", type=str, default="1,64,64,2", help="output shape.") + parser.add_argument( + "--input_node", type=str, default="input_node", help="input node") + parser.add_argument( + "--output_node", type=str, 
default="output_node", help="output node") + + return parser.parse_known_args() if __name__ == '__main__': - FLAGS, unparsed = parse_args() - main(unused_args=[sys.argv[0]] + unparsed) - + FLAGS, unparsed = parse_args() + main(unused_args=[sys.argv[0]] + unparsed) diff --git a/tools/wino_conv.py b/tools/wino_conv.py index 0dc3f8d6..fc1c90d8 100644 --- a/tools/wino_conv.py +++ b/tools/wino_conv.py @@ -11,199 +11,195 @@ G_T = {} # f(2, 3) A_T[4] = np.array([[1, 1, 1, 0], [0, 1, -1, -1]]).astype(np.float32) A[4] = np.transpose(A_T[4]) -B_T[4] = np.array([ - [1, 0, -1, 0], - [0, 1, 1, 0], - [0, -1, 1, 0], - [0, 1, 0, -1] -]).astype(np.float32) +B_T[4] = np.array([[1, 0, -1, 0], [0, 1, 1, 0], [0, -1, 1, 0], + [0, 1, 0, -1]]).astype(np.float32) B[4] = np.transpose(B_T[4]) G[4] = np.array([ - [1, 0, 0], - [0.5, 0.5, 0.5], - [0.5, -0.5, 0.5], - [0, 0, 1], + [1, 0, 0], + [0.5, 0.5, 0.5], + [0.5, -0.5, 0.5], + [0, 0, 1], ]).astype(np.float32) G_T[4] = np.transpose(G[4]) # f(4, 3) A_T[6] = np.array([ - [1, 1, 1, 1, 1, 0], - [0, 1, -1, 2, -2, 0], - [0, 1, 1, 4, 4, 0], - [0, 1, -1, 8, -8, 1], + [1, 1, 1, 1, 1, 0], + [0, 1, -1, 2, -2, 0], + [0, 1, 1, 4, 4, 0], + [0, 1, -1, 8, -8, 1], ]).astype(np.float32) A[6] = np.transpose(A_T[6]) B_T[6] = np.array([ - [4, 0, -5, 0, 1, 0], - [0, -4, -4, 1, 1, 0], - [0, 4, -4, -1, 1, 0], - [0, -2, -1, 2, 1, 0], - [0, 2, -1, -2, 1, 0], - [0, 4, 0, -5, 0, 1], + [4, 0, -5, 0, 1, 0], + [0, -4, -4, 1, 1, 0], + [0, 4, -4, -1, 1, 0], + [0, -2, -1, 2, 1, 0], + [0, 2, -1, -2, 1, 0], + [0, 4, 0, -5, 0, 1], ]).astype(np.float32) B[6] = np.transpose(B_T[6]) G[6] = np.array([ - [1/4.0 , 0 , 0 ], - [-1/6.0, -1/6.0 , -1/6.0], - [-1/6.0, 1/6.0 , -1/6.0], - [1/24.0, 1/12.0 , 1/6.0 ], - [1/24.0, -1/12.0, 1/6.0 ], - [ 0 , 0 , 1 ], + [1 / 4.0, 0, 0], + [-1 / 6.0, -1 / 6.0, -1 / 6.0], + [-1 / 6.0, 1 / 6.0, -1 / 6.0], + [1 / 24.0, 1 / 12.0, 1 / 6.0], + [1 / 24.0, -1 / 12.0, 1 / 6.0], + [0, 0, 1], ]).astype(np.float32) G_T[6] = np.transpose(G[6]) # f(6, 3) A_T[8] = np.array([ - [1, 1, 1 , 1 , 1 , 1 , 1 , 0], - [0, 1, -1, 2 , -2 , 1/2. , -1/2. , 0], - [0, 1, 1 , 4 , 4 , 1/4. , 1/4. , 0], - [0, 1, -1, 8 , -8 , 1/8. , -1/8. , 0], - [0, 1, 1 , 16, 16 , 1/16., 1/16. , 0], - [0, 1, -1, 32, -32, 1/32., -1/32., 1], + [1, 1, 1, 1, 1, 1, 1, 0], + [0, 1, -1, 2, -2, 1 / 2., -1 / 2., 0], + [0, 1, 1, 4, 4, 1 / 4., 1 / 4., 0], + [0, 1, -1, 8, -8, 1 / 8., -1 / 8., 0], + [0, 1, 1, 16, 16, 1 / 16., 1 / 16., 0], + [0, 1, -1, 32, -32, 1 / 32., -1 / 32., 1], ]).astype(np.float32) A[8] = np.transpose(A_T[8]) B_T[8] = np.array([ - [1, 0 , -21/4., 0 , 21/4., 0 , -1, 0], - [0, 1 , 1 , -17/4., -17/4., 1 , 1 , 0], - [0, -1 , 1 , 17/4. , -17/4., -1 , 1 , 0], - [0, 1/2. , 1/4. , -5/2. , -5/4., 2 , 1 , 0], - [0, -1/2., 1/4. , 5/2. , -5/4., -2 , 1 , 0], - [0, 2 , 4 , -5/2. , -5 , 1/2. , 1 , 0], - [0, -2 , 4 , 5/2. , -5 , -1/2. , 1 , 0], - [0, -1 , 0 , 21/4. , 0 , -21/4., 0 , 1], + [1, 0, -21 / 4., 0, 21 / 4., 0, -1, 0], + [0, 1, 1, -17 / 4., -17 / 4., 1, 1, 0], + [0, -1, 1, 17 / 4., -17 / 4., -1, 1, 0], + [0, 1 / 2., 1 / 4., -5 / 2., -5 / 4., 2, 1, 0], + [0, -1 / 2., 1 / 4., 5 / 2., -5 / 4., -2, 1, 0], + [0, 2, 4, -5 / 2., -5, 1 / 2., 1, 0], + [0, -2, 4, 5 / 2., -5, -1 / 2., 1, 0], + [0, -1, 0, 21 / 4., 0, -21 / 4., 0, 1], ]).astype(np.float32) B[8] = np.transpose(B_T[8]) G[8] = np.array([ - [ 1 , 0 , 0 ], - [-2/9. , -2/9. , -2/9.], - [-2/9. , 2/9. , -2/9.], - [1/90. , 1/45. , 2/45.], - [1/90. , -1/45. , 2/45.], - [32/45., 16/45. 
, 8/45.], - [32/45., -16/45., 8/45.], - [ 0 , 0 , 1 ], + [1, 0, 0], + [-2 / 9., -2 / 9., -2 / 9.], + [-2 / 9., 2 / 9., -2 / 9.], + [1 / 90., 1 / 45., 2 / 45.], + [1 / 90., -1 / 45., 2 / 45.], + [32 / 45., 16 / 45., 8 / 45.], + [32 / 45., -16 / 45., 8 / 45.], + [0, 0, 1], ]).astype(np.float32) G_T[8] = np.transpose(G[8]) def output_shape(input_shape, filter_shape): - out_shape = np.zeros(4).astype(np.int32) - out_shape[0] = input_shape[0] - out_shape[1] = filter_shape[0] - out_shape[2] = input_shape[2] - 2 - out_shape[3] = input_shape[3] - 2 - return out_shape + out_shape = np.zeros(4).astype(np.int32) + out_shape[0] = input_shape[0] + out_shape[1] = filter_shape[0] + out_shape[2] = input_shape[2] - 2 + out_shape[3] = input_shape[3] - 2 + return out_shape def winograd_conv(m, r, input, filter): - alpha = m + r - 1 - print 'Winograd(m = %d, r = %d, tile size=%d' % (m, r, alpha) - alpha_square = alpha * alpha - input_shape = input.shape - filter_shape = filter.shape - out_shape = output_shape(input_shape, filter_shape) - - K = filter_shape[0] - C = input_shape[1] - U = np.zeros((K * alpha_square, C)) - - for k in range(K): - for c in range(C): - u = np.dot(np.dot(G[alpha], filter[k, c, :, :]), G_T[alpha]) - for i in range(alpha): - for j in range(alpha) : - U[(i * alpha + j) * K + k, c] = u[i, j] - - print 'filter out: ', U.shape - - rounded_h = int(math.ceil(out_shape[2] / (m * 1.0))) - rounded_w = int(math.ceil(out_shape[3] / (m * 1.0))) - P = input_shape[0] * rounded_h * rounded_w - V = np.zeros((C * alpha_square, P)) - for p in range(P): - for c in range(C): - n = p / (rounded_w * rounded_h) - t = p % (rounded_h * rounded_w) - h_idx = t / rounded_w - w_idx = t % rounded_w - h_start = h_idx * m - w_start = w_idx * m - h_end = min(h_start+alpha, input_shape[2]) - w_end = min(w_start+alpha, input_shape[3]) - d = np.zeros((alpha, alpha)) - d[0:h_end-h_start, 0:w_end-w_start] = \ - input[n, c, h_start:h_end, w_start:w_end] - v = np.dot(np.dot(B_T[alpha], d), B[alpha]) - for i in range(alpha): - for j in range(alpha): - V[(i*alpha+j)*C + c, p] = v[i, j] - - tmp = V.reshape(alpha_square, C, P, 1) - print 'input out: ', tmp.shape - tmp.astype(np.float32).tofile("C") - M = np.zeros((alpha_square * K, P)) - for i in range(alpha_square): - u = U[i * K : (i+1) * K, :] - v = V[i * C : (i+1) * C, :] - M[i * K : (i+1) * K, :] = np.dot(u, v) - - print 'M shape: ', M.shape - M.astype(np.float32).tofile("gemm") - res = np.zeros((out_shape[0], out_shape[2], out_shape[3], out_shape[1])) - for k in range(K): - for b in range(P): - tm = np.zeros((alpha, alpha)) - for i in range(alpha): - for j in range(alpha): - tm[i][j] = M[(i*alpha+j) * K + k, b] - y = np.dot(np.dot(A_T[alpha], tm), A[alpha]) - for i in range(m): - for j in range(m): - n = b / (rounded_h * rounded_w) - t = b % (rounded_h * rounded_w) - p = (t / rounded_w) * m + i - q = (t % rounded_w) * m + j - if p >= out_shape[2] or q >= out_shape[3]: - continue - res[n, p, q, k] = y[i, j] - - print 'Res shape: ', res.shape - res.astype(np.float32).tofile("res") - - return res + alpha = m + r - 1 + print 'Winograd(m = %d, r = %d, tile size=%d' % (m, r, alpha) + alpha_square = alpha * alpha + input_shape = input.shape + filter_shape = filter.shape + out_shape = output_shape(input_shape, filter_shape) + + K = filter_shape[0] + C = input_shape[1] + U = np.zeros((K * alpha_square, C)) + + for k in range(K): + for c in range(C): + u = np.dot(np.dot(G[alpha], filter[k, c, :, :]), G_T[alpha]) + for i in range(alpha): + for j in range(alpha): + U[(i * alpha + j) * 
K + k, c] = u[i, j] + + print 'filter out: ', U.shape + + rounded_h = int(math.ceil(out_shape[2] / (m * 1.0))) + rounded_w = int(math.ceil(out_shape[3] / (m * 1.0))) + P = input_shape[0] * rounded_h * rounded_w + V = np.zeros((C * alpha_square, P)) + for p in range(P): + for c in range(C): + n = p / (rounded_w * rounded_h) + t = p % (rounded_h * rounded_w) + h_idx = t / rounded_w + w_idx = t % rounded_w + h_start = h_idx * m + w_start = w_idx * m + h_end = min(h_start + alpha, input_shape[2]) + w_end = min(w_start + alpha, input_shape[3]) + d = np.zeros((alpha, alpha)) + d[0:h_end-h_start, 0:w_end-w_start] = \ + input[n, c, h_start:h_end, w_start:w_end] + v = np.dot(np.dot(B_T[alpha], d), B[alpha]) + for i in range(alpha): + for j in range(alpha): + V[(i * alpha + j) * C + c, p] = v[i, j] + + tmp = V.reshape(alpha_square, C, P, 1) + print 'input out: ', tmp.shape + tmp.astype(np.float32).tofile("C") + M = np.zeros((alpha_square * K, P)) + for i in range(alpha_square): + u = U[i * K:(i + 1) * K, :] + v = V[i * C:(i + 1) * C, :] + M[i * K:(i + 1) * K, :] = np.dot(u, v) + + print 'M shape: ', M.shape + M.astype(np.float32).tofile("gemm") + res = np.zeros((out_shape[0], out_shape[2], out_shape[3], out_shape[1])) + for k in range(K): + for b in range(P): + tm = np.zeros((alpha, alpha)) + for i in range(alpha): + for j in range(alpha): + tm[i][j] = M[(i * alpha + j) * K + k, b] + y = np.dot(np.dot(A_T[alpha], tm), A[alpha]) + for i in range(m): + for j in range(m): + n = b / (rounded_h * rounded_w) + t = b % (rounded_h * rounded_w) + p = (t / rounded_w) * m + i + q = (t % rounded_w) * m + j + if p >= out_shape[2] or q >= out_shape[3]: + continue + res[n, p, q, k] = y[i, j] + + print 'Res shape: ', res.shape + res.astype(np.float32).tofile("res") + + return res + def tf_conv(input, filter): - conv_op = tf.nn.conv2d(input, filter, [1, 1, 1, 1], 'VALID') - with tf.Session() as sess: - res = sess.run(conv_op) - return res + conv_op = tf.nn.conv2d(input, filter, [1, 1, 1, 1], 'VALID') + with tf.Session() as sess: + res = sess.run(conv_op) + return res def main(): - input = np.random.random([5, 23, 29, 15]).astype(np.float32) - # input = np.fromfile(file="A", dtype=np.float32) - # input = input.reshape(1, 3, 3, 5) - print 'input shape: ', input.shape - # input.tofile("A") - filter = np.random.random([3, 3, 15, 13]).astype(np.float32) - tf_out = tf_conv(input, filter) - input = input.transpose((0, 3, 1, 2)) - filter = filter.transpose((3, 2, 0, 1)) - print 'filter shape: ', filter.shape - # filter.tofile("filter_in") - for i in [2, 4, 6]: - print "==========f(%d,3)==========" % i - winograd_out = winograd_conv(i, 3, input, filter) - res = np.allclose(tf_out, winograd_out) - if res: - print "=========Pass=========" - else: - print "=========Failed=======" - print "TF: ", tf_out - print "Winograd: ", winograd_out + input = np.random.random([5, 23, 29, 15]).astype(np.float32) + # input = np.fromfile(file="A", dtype=np.float32) + # input = input.reshape(1, 3, 3, 5) + print 'input shape: ', input.shape + # input.tofile("A") + filter = np.random.random([3, 3, 15, 13]).astype(np.float32) + tf_out = tf_conv(input, filter) + input = input.transpose((0, 3, 1, 2)) + filter = filter.transpose((3, 2, 0, 1)) + print 'filter shape: ', filter.shape + # filter.tofile("filter_in") + for i in [2, 4, 6]: + print "==========f(%d,3)==========" % i + winograd_out = winograd_conv(i, 3, input, filter) + res = np.allclose(tf_out, winograd_out) + if res: + print "=========Pass=========" + else: + print "=========Failed=======" + 
print "TF: ", tf_out + print "Winograd: ", winograd_out if __name__ == '__main__': - main() - + main() -- GitLab