提交 6da30d22 编写于 作者: L Liangliang He

Enable python style check

上级 e54825c5
stages: stages:
- cpplint - cpplint
- pycodestyle
- ops_test - ops_test
- ops_benchmark - ops_benchmark
...@@ -7,7 +8,12 @@ cpplint: ...@@ -7,7 +8,12 @@ cpplint:
stage: cpplint stage: cpplint
script: script:
- curl -o cpplint.py https://raw.githubusercontent.com/google/styleguide/gh-pages/cpplint/cpplint.py - curl -o cpplint.py https://raw.githubusercontent.com/google/styleguide/gh-pages/cpplint/cpplint.py
- python cpplint.py --linelength=80 --counting=detailed $(find mace -name *.h -or -name *.cc) - python cpplint.py --linelength=80 --counting=detailed $(find mace -name "*.h" -or -name "*.cc")
pycodestyle:
stage: pycodestyle
script:
- pycodestyle $(find -name "*.py")
ops_test: ops_test:
stage: ops_test stage: ops_test
......
...@@ -113,7 +113,8 @@ RUN pip install -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com ...@@ -113,7 +113,8 @@ RUN pip install -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com
scipy \ scipy \
jinja2 \ jinja2 \
pyyaml \ pyyaml \
sh sh \
pycodestyle
# Download tensorflow tools # Download tensorflow tools
RUN wget http://cnbj1-inner-fds.api.xiaomi.net/mace/tool/transform_graph && \ RUN wget http://cnbj1-inner-fds.api.xiaomi.net/mace/tool/transform_graph && \
......
...@@ -16,74 +16,75 @@ FLAGS = None ...@@ -16,74 +16,75 @@ FLAGS = None
def generate_cpp_source(): def generate_cpp_source():
data_map = {} data_map = {}
for binary_dir in FLAGS.binary_dirs.split(","): for binary_dir in FLAGS.binary_dirs.split(","):
binary_path = os.path.join(binary_dir, FLAGS.binary_file_name) binary_path = os.path.join(binary_dir, FLAGS.binary_file_name)
if not os.path.exists(binary_path): if not os.path.exists(binary_path):
continue continue
with open(binary_path, "rb") as f: with open(binary_path, "rb") as f:
binary_array = np.fromfile(f, dtype=np.uint8) binary_array = np.fromfile(f, dtype=np.uint8)
print "Generate binary from", binary_path print "Generate binary from", binary_path
idx = 0 idx = 0
size, = struct.unpack("Q", binary_array[idx:idx+8]) size, = struct.unpack("Q", binary_array[idx:idx + 8])
idx += 8 idx += 8
for _ in xrange(size): for _ in xrange(size):
key_size, = struct.unpack("i", binary_array[idx:idx+4]) key_size, = struct.unpack("i", binary_array[idx:idx + 4])
idx += 4 idx += 4
key, = struct.unpack(str(key_size) + "s", binary_array[idx:idx+key_size]) key, = struct.unpack(
idx += key_size str(key_size) + "s", binary_array[idx:idx + key_size])
params_size, = struct.unpack("i", binary_array[idx:idx+4]) idx += key_size
idx += 4 params_size, = struct.unpack("i", binary_array[idx:idx + 4])
data_map[key] = [] idx += 4
count = params_size / 4 data_map[key] = []
params = struct.unpack(str(count) + "i", binary_array[idx:idx+params_size]) count = params_size / 4
for i in params: params = struct.unpack(
data_map[key].append(i) str(count) + "i", binary_array[idx:idx + params_size])
idx += params_size for i in params:
data_map[key].append(i)
idx += params_size
env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0]))
return env.get_template('str2vec_maps.cc.jinja2').render(
maps=data_map,
data_type='unsigned int',
variable_name=FLAGS.variable_name)
env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0]))
return env.get_template('str2vec_maps.cc.jinja2').render(
maps = data_map,
data_type = 'unsigned int',
variable_name = FLAGS.variable_name
)
def main(unused_args):
    """Generate the C++ source and write it to FLAGS.output_path."""
    cpp_binary_source = generate_cpp_source()
    if os.path.isfile(FLAGS.output_path):
        os.remove(FLAGS.output_path)
    # Use a context manager so the handle is closed even if write() raises
    # (the original open/write/close leaked the handle on failure).
    with open(FLAGS.output_path, "w") as w_file:
        w_file.write(cpp_binary_source)
def parse_args():
    """Parses command line arguments."""
    arg_parser = argparse.ArgumentParser()
    # (flag, default, help) — every option is a plain string.
    option_specs = [
        ("--binary_dirs", "", "The binaries file path."),
        ("--binary_file_name", "mace_run.config", "The binary file name."),
        ("--output_path", "",
         "The path of generated C++ source file which contains the binary."),
        ("--variable_name", "kTuningParamsData", "global variable name."),
    ]
    for flag, default_value, help_text in option_specs:
        arg_parser.add_argument(
            flag, type=str, default=default_value, help=help_text)
    return arg_parser.parse_known_args()
if __name__ == '__main__':
    # FLAGS is module-global: generate_cpp_source()/main() read it directly.
    FLAGS, unparsed = parse_args()
    main(unused_args=[sys.argv[0]] + unparsed)
...@@ -5,32 +5,26 @@ import google.protobuf.text_format ...@@ -5,32 +5,26 @@ import google.protobuf.text_format
import numpy as np import numpy as np
import math import math
# Caffe pooling method -> MACE pooling mode enum value.
pooling_type_mode = {
    'AvgPool': 1,
    'MaxPool': 2,
}

# OpenCL buffer-to-image conversion types, keyed by tensor role.
buffer_type_map = {
    'CONV2D_FILTER': 0,
    'IN_OUT_CHANNEL': 1,
    'ARGUMENT': 2,
    'IN_OUT_HEIGHT': 3,
    'IN_OUT_WIDTH': 4,
    'WINOGRAD_FILTER': 5,
    'DW_CONV2D_FILTER': 6,
    'WEIGHT_HEIGHT': 7,
    'WEIGHT_WIDTH': 8,
}

# String data-type names -> mace_pb2 enum values.
data_type_map = {
    'DT_HALF': mace_pb2.DT_HALF,
    'DT_FLOAT': mace_pb2.DT_FLOAT,
}

# Caffe activation layer type -> MACE activation argument value.
activation_name_map = {
    'ReLU': 'RELU',
    'Sigmoid': 'SIGMOID',
    'TanH': 'TANH',
}

MACE_INPUT_NODE_NAME = "mace_input_node"
MACE_OUTPUT_NODE_NAME = "mace_output_node"

OPENCL_IMAGE_MAX_SIZE = 16384
class Operator(object):
    """A node in the converter's operator graph.

    Tracks parent/child links, the raw caffe layer, loaded weight blobs
    (data) and the per-top output shapes.
    """

    def __init__(self, name, type, layer):
        self.name = name
        self.type = type
        self.layer = layer
        self.parents = []
        self.children = []
        self.data = []
        self.output_shape_map = {}

    def add_parent(self, parent_op):
        """Link this op under parent_op, updating both endpoints."""
        self.parents.append(parent_op)
        parent_op.children.append(self)

    def get_single_parent(self):
        """Return the unique parent; raise when there is not exactly one."""
        parent_count = len(self.parents)
        if parent_count != 1:
            raise Exception('Operation %s expected single parent, but got %s' %
                            (self.name, parent_count))
        return self.parents[0]
def BlobToNPArray(blob):
    """Convert a caffe blob proto into a float32 numpy array."""
    values = np.asarray(blob.data, dtype=np.float32)
    if blob.num != 0:
        # Legacy 4-D blobs carry explicit num/channels/height/width fields.
        return values.reshape((blob.num, blob.channels, blob.height,
                               blob.width))
    return values.reshape(blob.shape.dim)
class Shapes(object):
    """Static helpers that compute output shapes for common layers."""

    @staticmethod
    def conv_pool_shape(input_shape,
                        filter_shape,
                        paddings,
                        strides,
                        dilations,
                        round_func,
                        input_format='NHWC'):
        """Output shape of a conv/pool layer for NHWC or NCHW input."""

        def spatial_dim(in_dim, kernel, pad, stride, dilation):
            # Effective kernel extent grows with dilation.
            extent = kernel + (kernel - 1) * (dilation - 1)
            return int(round_func((in_dim + pad - extent) /
                                  float(stride))) + 1

        output_shape = np.zeros_like(input_shape)
        output_shape[0] = input_shape[0]
        if input_format == 'NHWC':
            # input format: NHWC, filter format: HWOI
            output_shape[1] = spatial_dim(input_shape[1], filter_shape[0],
                                          paddings[0], strides[0],
                                          dilations[0])
            output_shape[2] = spatial_dim(input_shape[2], filter_shape[1],
                                          paddings[1], strides[1],
                                          dilations[1])
            output_shape[3] = filter_shape[2]
        elif input_format == 'NCHW':
            # input format: NCHW, filter format: OIHW
            output_shape[1] = filter_shape[0]
            output_shape[2] = spatial_dim(input_shape[2], filter_shape[2],
                                          paddings[0], strides[0],
                                          dilations[0])
            output_shape[3] = spatial_dim(input_shape[3], filter_shape[3],
                                          paddings[1], strides[1],
                                          dilations[1])
        else:
            raise Exception("format %s is not supported" % input_format)

        return output_shape

    @staticmethod
    def fully_connected_shape(input_shape, weight_shape):
        """Shape after an inner-product layer: [N, 1, 1, out]."""
        return [input_shape[0], 1, 1, weight_shape[0]]

    @staticmethod
    def concat_shape(input_shapes, axis):
        """Sum the given axis across all inputs; other dims unchanged."""
        output_shape = None
        for shape in input_shapes:
            if output_shape is None:
                output_shape = list(shape)
            else:
                output_shape[axis] += shape[axis]
        return output_shape

    @staticmethod
    def slice_shape(input_shape, num_output, input_format='NHWC'):
        """Split the channel dimension into num_output equal parts."""
        if input_format == 'NHWC':
            return [
                input_shape[0], input_shape[1], input_shape[2],
                input_shape[3] / num_output
            ]
        elif input_format == 'NCHW':
            return [
                input_shape[0], input_shape[1] / num_output, input_shape[2],
                input_shape[3]
            ]
        else:
            raise Exception("format %s is not supported" % input_format)
# outputs' name is [op.name + '_' + #] # outputs' name is [op.name + '_' + #]
class CaffeConverter(object): class CaffeConverter(object):
def __init__(self, caffe_net, weights, net_def, dt, device, winograd): def __init__(self, caffe_net, weights, net_def, dt, device, winograd):
self.net_def = net_def self.net_def = net_def
self.caffe_net = caffe_net self.caffe_net = caffe_net
self.weights = weights self.weights = weights
self.dt = dt self.dt = dt
self.device = device self.device = device
self.winograd = winograd self.winograd = winograd
self.resolved_ops = set() self.resolved_ops = set()
self.ops = [] self.ops = []
self.inputs_map = {} # caffe op name -> mace inputs' name self.inputs_map = {} # caffe op name -> mace inputs' name
# Add Input operations # Add Input operations
top_name_map = {} top_name_map = {}
inputs = caffe_net.input inputs = caffe_net.input
for input in inputs: for input in inputs:
self.ops.extend([Operator(input, 'Input', None)]) self.ops.extend([Operator(input, 'Input', None)])
top_name_map[input] = input top_name_map[input] = input
layers = caffe_net.layer layers = caffe_net.layer
# remove train layers and dropout # remove train layers and dropout
layers = self.remove_unused_layers(layers) layers = self.remove_unused_layers(layers)
# Construct graph # Construct graph
# Only support single-output layer # Only support single-output layer
# layer with single output often use the same top name. # layer with single output often use the same top name.
self.ops.extend([Operator(layer.name, layer.type, layer) for layer in layers]) self.ops.extend(
[Operator(layer.name, layer.type, layer) for layer in layers])
self.ops_map = {op.name : op for op in self.ops}
output_op_map = {} self.ops_map = {op.name: op for op in self.ops}
for layer in layers: output_op_map = {}
op = self.ops_map[layer.name] for layer in layers:
for input_name in layer.bottom: op = self.ops_map[layer.name]
assert input_name != layer.name for input_name in layer.bottom:
parent_op = output_op_map.get(input_name) assert input_name != layer.name
if parent_op is None: parent_op = output_op_map.get(input_name)
parent_op = self.ops_map[input_name] if parent_op is None:
op.add_parent(parent_op) parent_op = self.ops_map[input_name]
if op.name not in self.inputs_map: op.add_parent(parent_op)
self.inputs_map[op.name] = [] if op.name not in self.inputs_map:
self.inputs_map[op.name].extend([top_name_map[input_name]]) self.inputs_map[op.name] = []
for i in range(len(layer.top)): self.inputs_map[op.name].extend([top_name_map[input_name]])
output_name = layer.top[i] for i in range(len(layer.top)):
if len(layer.top) == 1: output_name = layer.top[i]
top_name_map[output_name] = op.name if len(layer.top) == 1:
top_name_map[output_name] = op.name
else:
top_name_map[output_name] = op.name + '_' + str(i)
if output_name == layer.name:
continue
output_op_map[output_name] = op
# Load weights
weights_layers = weights.layer
for layer in weights_layers:
if not layer.blobs:
continue
if layer.name in self.ops_map:
op = self.ops_map[layer.name]
op.data = [BlobToNPArray(blob) for blob in layer.blobs]
# toposort ops
self.ops = self.toposort_ops()
def CommonConvert(self, op, mace_type):
op_def = mace_pb2.OperatorDef()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
if self.device == 'neon':
data_format_arg.s = 'NCHW'
else: else:
top_name_map[output_name] = op.name + '_' + str(i) data_format_arg.s = 'NHWC'
if output_name == layer.name: op_def.name = op.name
continue op_def.type = mace_type
output_op_map[output_name] = op op_def.input.extend([name + ':0' for name in self.inputs_map[op.name]])
return op_def
# Load weights def remove_unused_layers(self, layers):
weights_layers = weights.layer phase_map = {0: 'train', 1: 'test'}
for layer in weights_layers: test_layers_names = set()
if not layer.blobs: test_layers = []
continue for layer in layers:
if layer.name in self.ops_map: phase = 'test'
op = self.ops_map[layer.name] if len(layer.include):
op.data = [BlobToNPArray(blob) for blob in layer.blobs] phase = phase_map[layer.include[0].phase]
if len(layer.exclude):
# toposort ops phase = phase_map[layer.exclude[0].phase]
self.ops = self.toposort_ops() if phase == 'test' and layer.type != 'Dropout':
test_layers.append(layer)
def CommonConvert(self, op, mace_type): assert layer.name not in test_layers_names
op_def = mace_pb2.OperatorDef() test_layers_names.add(layer.name)
arg = op_def.arg.add() return test_layers
arg.name = 'T'
arg.i = self.dt def toposort_ops(self):
data_format_arg = op_def.arg.add() sorted_ops = []
data_format_arg.name = 'data_format' temp_visited = set()
if self.device == 'neon': visited = set()
data_format_arg.s = 'NCHW'
else: def search(op):
data_format_arg.s = 'NHWC' if op.name in temp_visited:
op_def.name = op.name raise Exception("The model is not DAG")
op_def.type = mace_type if op.name in visited:
op_def.input.extend([name+':0' for name in self.inputs_map[op.name]]) return
return op_def temp_visited.add(op.name)
for parent_op in op.parents:
def remove_unused_layers(self, layers): search(parent_op)
phase_map = {0: 'train', 1: 'test'} temp_visited.remove(op.name)
test_layers_names = set() sorted_ops.append(op)
test_layers = [] visited.add(op.name)
for layer in layers:
phase = 'test' for op in self.ops:
if len(layer.include): search(op)
phase = phase_map[layer.include[0].phase]
if len(layer.exclude): return sorted_ops
phase = phase_map[layer.exclude[0].phase]
if phase == 'test' and layer.type != 'Dropout': def add_buffer_to_image(self, input_name, input_type):
test_layers.append(layer) output_name = input_name[:-2] + "_b2i" + input_name[-2:]
assert layer.name not in test_layers_names op_def = self.net_def.op.add()
test_layers_names.add(layer.name) op_def.name = output_name[:-2]
return test_layers op_def.type = 'BufferToImage'
op_def.input.extend([input_name])
def toposort_ops(self): op_def.output.extend([output_name])
sorted_ops = []
temp_visited = set() arg = op_def.arg.add()
visited = set() arg.name = 'buffer_type'
arg.i = buffer_type_map[input_type]
def search(op): arg = op_def.arg.add()
if op.name in temp_visited: arg.name = 'mode'
raise Exception("The model is not DAG") arg.i = 0
if op.name in visited: arg = op_def.arg.add()
return arg.name = 'T'
temp_visited.add(op.name) arg.i = self.dt
for parent_op in op.parents: return output_name
search(parent_op)
temp_visited.remove(op.name) def add_image_to_buffer(self, input_name, input_type):
sorted_ops.append(op) output_name = input_name[:-2] + "_i2b" + input_name[-2:]
visited.add(op.name) op_def = self.net_def.op.add()
op_def.name = output_name[:-2]
for op in self.ops: op_def.type = 'ImageToBuffer'
search(op) op_def.input.extend([input_name])
op_def.output.extend([output_name])
return sorted_ops
arg = op_def.arg.add()
def add_buffer_to_image(self, input_name, input_type): arg.name = 'buffer_type'
output_name = input_name[:-2] + "_b2i" + input_name[-2:] arg.i = buffer_type_map[input_type]
op_def = self.net_def.op.add() arg = op_def.arg.add()
op_def.name = output_name[:-2] arg.name = 'T'
op_def.type = 'BufferToImage' arg.i = self.dt
op_def.input.extend([input_name]) return output_name
op_def.output.extend([output_name])
def add_input_transform(self, names):
arg = op_def.arg.add() for name in names:
arg.name = 'buffer_type' new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
arg.i = buffer_type_map[input_type] op_def = self.net_def.op.add()
arg = op_def.arg.add() op_def.name = name
arg.name = 'mode' op_def.type = 'BufferToImage'
arg.i = 0 op_def.input.extend([new_input_name])
arg = op_def.arg.add() op_def.output.extend([name + ':0'])
arg.name = 'T'
arg.i = self.dt epsilon_arg = op_def.arg.add()
return output_name epsilon_arg.name = 'buffer_type'
epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL']
def add_image_to_buffer(self, input_name, input_type):
output_name = input_name[:-2] + "_i2b" + input_name[-2:] arg = op_def.arg.add()
op_def = self.net_def.op.add() arg.name = 'T'
op_def.name = output_name[:-2] arg.i = self.dt
op_def.type = 'ImageToBuffer'
op_def.input.extend([input_name]) def add_output_transform(self, names):
op_def.output.extend([output_name]) for name in names:
output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
arg = op_def.arg.add() op_def = self.net_def.op.add()
arg.name = 'buffer_type' op_def.name = output_name[:-2]
arg.i = buffer_type_map[input_type] op_def.type = 'ImageToBuffer'
arg = op_def.arg.add() op_def.input.extend([name + ':0'])
arg.name = 'T' op_def.output.extend([output_name])
arg.i = self.dt
return output_name epsilon_arg = op_def.arg.add()
epsilon_arg.name = 'buffer_type'
def add_input_transform(self, names): epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL']
for name in names:
new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0" def add_tensor(self, name, value):
op_def = self.net_def.op.add() tensor = self.net_def.tensors.add()
op_def.name = name tensor.name = name
op_def.type = 'BufferToImage'
op_def.input.extend([new_input_name]) shape = list(value.shape)
op_def.output.extend([name+':0']) tensor.dims.extend(shape)
epsilon_arg = op_def.arg.add() tensor.data_type = mace_pb2.DT_FLOAT
epsilon_arg.name = 'buffer_type' tensor.float_data.extend(value.flat)
epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL']
@staticmethod
arg = op_def.arg.add() def add_output_shape(op_def, output_shape):
arg.name = 'T' mace_output_shape = mace_pb2.OutputShape()
arg.i = self.dt mace_output_shape.dims.extend(output_shape)
op_def.output_shape.extend([mace_output_shape])
def add_output_transform(self, names):
for name in names: def add_stride_pad_kernel_arg(self, param, op_def):
output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0" try:
op_def = self.net_def.op.add() if len(param.stride) > 1 or len(param.kernel_size) > 1 or len(
op_def.name = output_name[:-2] param.pad) > 1:
op_def.type = 'ImageToBuffer' raise Exception(
op_def.input.extend([name+':0']) 'Mace does not support multiple stride/kernel_size/pad')
op_def.output.extend([output_name]) stride = [param.stride[0],
param.stride[0]] if len(param.stride) else [1, 1]
epsilon_arg = op_def.arg.add() pad = [param.pad[0] * 2,
epsilon_arg.name = 'buffer_type' param.pad[0] * 2] if len(param.pad) else [0, 0]
epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL'] kernel = [param.kernel_size[0], param.kernel_size[0]] if len(
param.kernel_size) else [0, 0]
def add_tensor(self, name, value): except TypeError:
tensor = self.net_def.tensors.add() stride = [param.stride, param.stride]
tensor.name = name pad = [param.pad * 2, param.pad * 2]
kernel = [param.kernel_size, param.kernel_size]
shape = list(value.shape)
tensor.dims.extend(shape) if param.HasField("stride_h") or param.HasField("stride_w"):
stride = [param.stride_h, param.stride_w]
tensor.data_type = mace_pb2.DT_FLOAT # Pad
tensor.float_data.extend(value.flat) if param.HasField("pad_h") or param.HasField("pad_w"):
pad = [param.pad_h * 2, param.pad_w * 2]
@staticmethod
def add_output_shape(op_def, output_shape): if op_def is not None:
mace_output_shape = mace_pb2.OutputShape() strides_arg = op_def.arg.add()
mace_output_shape.dims.extend(output_shape) strides_arg.name = 'strides'
op_def.output_shape.extend([mace_output_shape]) strides_arg.ints.extend(stride)
def add_stride_pad_kernel_arg(self, param, op_def): padding_arg = op_def.arg.add()
try: padding_arg.name = 'padding_values'
if len(param.stride) > 1 or len(param.kernel_size) > 1 or len(param.pad) > 1: padding_arg.ints.extend(pad)
raise Exception('Mace does not support multiple stride/kernel_size/pad')
stride = [param.stride[0], param.stride[0]] if len(param.stride) else [1, 1] if op_def.type == 'Pooling':
pad = [param.pad[0] * 2, param.pad[0] * 2] if len(param.pad) else [0, 0] if param.HasField("kernel_h") or param.HasField("kernel_w"):
kernel = [param.kernel_size[0], param.kernel_size[0]] if len(param.kernel_size) else [0, 0] kernel = [param.kernel_h, param.kernel_w]
except TypeError:
stride = [param.stride, param.stride] return pad, stride, kernel
pad = [param.pad * 2, param.pad * 2]
kernel = [param.kernel_size, param.kernel_size] def convert_conv2d(self, op):
param = op.layer.convolution_param
if param.HasField("stride_h") or param.HasField("stride_w"): is_depthwise = False
stride = [param.stride_h, param.stride_w] if param.HasField('group'):
# Pad if param.group == op.data[0].shape[0] and op.data[0].shape[1] == 1:
if param.HasField("pad_h") or param.HasField("pad_w"): is_depthwise = True
pad = [param.pad_h * 2, param.pad_w * 2] else:
raise Exception("Mace do not support group convolution yet")
if op_def is not None:
strides_arg = op_def.arg.add() if is_depthwise:
strides_arg.name = 'strides' op_def = self.CommonConvert(op, 'DepthwiseConv2d')
strides_arg.ints.extend(stride) else:
op_def = self.CommonConvert(op, 'Conv2D')
padding_arg = op_def.arg.add()
padding_arg.name = 'padding_values'
padding_arg.ints.extend(pad)
if op_def.type == 'Pooling':
if param.HasField("kernel_h") or param.HasField("kernel_w"):
kernel = [param.kernel_h, param.kernel_w]
return pad, stride, kernel
def convert_conv2d(self, op):
param = op.layer.convolution_param
is_depthwise = False
if param.HasField('group'):
if param.group == op.data[0].shape[0] and op.data[0].shape[1] == 1:
is_depthwise = True
else:
raise Exception("Mace do not support group convolution yet")
if is_depthwise:
op_def = self.CommonConvert(op, 'DepthwiseConv2d')
else:
op_def = self.CommonConvert(op, 'Conv2D')
# Add filter # Add filter
weight_tensor_name = op.name + '_weight:0' weight_tensor_name = op.name + '_weight:0'
if self.device == 'neon': if self.device == 'neon':
weight_data = op.data[0] weight_data = op.data[0]
else: else:
# OIHW -> HWOI # OIHW -> HWOI
weight_data = op.data[0].transpose((2, 3, 0, 1)) weight_data = op.data[0].transpose((2, 3, 0, 1))
self.add_tensor(weight_tensor_name, weight_data) self.add_tensor(weight_tensor_name, weight_data)
if self.device == 'gpu': if self.device == 'gpu':
buffer_type = "DW_CONV2D_FILTER" if is_depthwise else "CONV2D_FILTER" buffer_type = "DW_CONV2D_FILTER" \
output_name = self.add_buffer_to_image(weight_tensor_name, buffer_type) if is_depthwise else "CONV2D_FILTER"
op_def.input.extend([output_name]) output_name = self.add_buffer_to_image(weight_tensor_name,
else: buffer_type)
op_def.input.extend([weight_tensor_name]) op_def.input.extend([output_name])
else:
# Add Bias op_def.input.extend([weight_tensor_name])
if len(op.data) == 2:
bias_tensor_name = op.name + '_bias:0' # Add Bias
bias_data = op.data[1].reshape(-1) if len(op.data) == 2:
self.add_tensor(bias_tensor_name, bias_data) bias_tensor_name = op.name + '_bias:0'
if self.device == 'gpu': bias_data = op.data[1].reshape(-1)
output_name = self.add_buffer_to_image(bias_tensor_name, "ARGUMENT") self.add_tensor(bias_tensor_name, bias_data)
op_def.input.extend([output_name]) if self.device == 'gpu':
else: output_name = self.add_buffer_to_image(bias_tensor_name,
op_def.input.extend([bias_tensor_name]) "ARGUMENT")
op_def.input.extend([output_name])
paddings, strides, _ = self.add_stride_pad_kernel_arg(param, op_def) else:
dilations = [1, 1] op_def.input.extend([bias_tensor_name])
if len(param.dilation) > 0:
dilation_arg = op_def.arg.add() paddings, strides, _ = self.add_stride_pad_kernel_arg(param, op_def)
dilation_arg.name = 'dilations' dilations = [1, 1]
if len(param.dilation) == 1: if len(param.dilation) > 0:
dilations = [param.dilation[0], param.dilation[0]] dilation_arg = op_def.arg.add()
elif len(param.dilation) == 2: dilation_arg.name = 'dilations'
dilations = [param.dilation[0], param.dilation[1]] if len(param.dilation) == 1:
dilation_arg.ints.extend(dilations) dilations = [param.dilation[0], param.dilation[0]]
final_op = op elif len(param.dilation) == 2:
self.resolved_ops.add(op.name) dilations = [param.dilation[0], param.dilation[1]]
dilation_arg.ints.extend(dilations)
input_format = 'NCHW' if self.device == 'neon' else 'NHWC' final_op = op
output_shape = Shapes.conv_pool_shape(op.get_single_parent().output_shape_map[op.layer.bottom[0]], self.resolved_ops.add(op.name)
weight_data.shape,
paddings, strides, dilations,
math.floor, input_format)
op.output_shape_map[op.layer.top[0]] = output_shape
if len(self.ops_map[final_op.name].children) == 1 \
and self.ops_map[final_op.name].children[0].type in activation_name_map:
activation_op = self.ops_map[final_op.name].children[0]
if not is_depthwise:
op_def.type = "FusedConv2D"
fused_act_arg = op_def.arg.add()
fused_act_arg.name = 'activation'
fused_act_arg.s = activation_name_map[activation_op.type]
final_op = activation_op
final_op.output_shape_map[final_op.layer.top[0]] = output_shape
self.resolved_ops.add(activation_op.name)
op_def.output.extend([final_op.name+':0'])
self.add_output_shape(op_def, output_shape)
self.net_def.op.extend([op_def])
def check_winograd_conv(self, op):
# TODO: support winograd conv on neon
if self.device == 'neon':
return False
param = op.layer.convolution_param
filter_shape = np.asarray(op.data[0].shape)
if self.device != 'neon':
filter_shape = filter_shape[[2, 3, 0, 1]] # OIHW -> HWOI
paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None)
dilations = [1, 1]
if len(param.dilation) > 0:
if len(param.dilation) == 1:
dilations = [param.dilation[0], param.dilation[0]]
elif len(param.dilation) == 2:
dilations = [param.dilation[0], param.dilation[1]]
input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
output_shape = Shapes.conv_pool_shape(
op.get_single_parent().output_shape_map[op.layer.bottom[0]],
filter_shape, paddings, strides, dilations, math.floor, input_format)
width = output_shape[0] * ((output_shape[1] + 1)/2) * ((output_shape[2]+1)/2)
if self.winograd and dilations[0] == 1 and (dilations[0] == dilations[1]) and \
(strides[0] == 1) and (strides[0] == strides[1]):
if self.device == 'gpu':
return filter_shape[0] == 3 and (filter_shape[0] == filter_shape[1]) and \
(16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \
(16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \
(width < OPENCL_IMAGE_MAX_SIZE)
elif self.device == 'neon':
return filter_shape[2] == 3 and (filter_shape[2] == filter_shape[3])
return False
def convert_winograd_conv(self, op):
# Add filter
weight_tensor_name = op.name + '_weight:0'
self.add_tensor(weight_tensor_name, op.data[0])
buffer_type = "WINOGRAD_FILTER"
filter_name = self.add_buffer_to_image(weight_tensor_name, buffer_type)
param = op.layer.convolution_param
paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None)
filter_shape = np.asarray(op.data[0].shape)
if self.device != 'neon':
filter_shape = filter_shape[[2, 3, 0, 1]] # OIHW -> HWOI
input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
output_shape = Shapes.conv_pool_shape(
op.get_single_parent().output_shape_map[op.layer.bottom[0]],
filter_shape, paddings, strides, [1, 1], math.floor, input_format)
# Input transform
wt_op = mace_pb2.OperatorDef()
arg = wt_op.arg.add()
arg.name = 'T'
arg.i = self.dt
padding_arg = wt_op.arg.add()
padding_arg.name = 'padding_values'
padding_arg.ints.extend(paddings)
wt_op.name = op.name + '_input_transform'
wt_op.type = 'WinogradTransform'
wt_op.input.extend([name+':0' for name in self.inputs_map[op.name]])
wt_output_name = wt_op.name + ":0"
wt_op.output.extend([wt_output_name])
wt_output_shape = mace_pb2.OutputShape()
if self.device != 'neon':
wt_output_width = output_shape[0] * ((output_shape[1] + 1)/2) * ((output_shape[2]+1)/2)
wt_output_shape.dims.extend([16, filter_shape[3], wt_output_width, 1])
else:
wt_output_width = output_shape[0] * ((output_shape[2] + 1)/2) * ((output_shape[3]+1)/2)
wt_output_shape.dims.extend([16, filter_shape[1], wt_output_width, 1])
wt_op.output_shape.extend([wt_output_shape])
# MatMul
matmul_op = mace_pb2.OperatorDef()
arg = matmul_op.arg.add()
arg.name = 'T'
arg.i = self.dt
matmul_op.name = op.name + '_matmul'
matmul_op.type = 'MatMul'
matmul_op.input.extend([filter_name, wt_output_name])
matmul_output_name = matmul_op.name + ":0"
matmul_op.output.extend([matmul_output_name])
matmul_output_shape = mace_pb2.OutputShape()
if self.device != 'neon':
matmul_output_shape.dims.extend([16, filter_shape[2], wt_output_width, 1])
else:
matmul_output_shape.dims.extend([16, filter_shape[0], wt_output_width, 1])
matmul_op.output_shape.extend([matmul_output_shape])
# Inverse transform
iwt_op = mace_pb2.OperatorDef()
arg = iwt_op.arg.add()
arg.name = 'T'
arg.i = self.dt
batch_arg = iwt_op.arg.add()
batch_arg.name = 'batch'
batch_arg.i = output_shape[0]
height_arg = iwt_op.arg.add()
height_arg.name = 'height'
height_arg.i = output_shape[1] if self.device != 'neon' else output_shape[2]
width_arg = iwt_op.arg.add()
width_arg.name = 'width'
width_arg.i = output_shape[2] if self.device != 'neon' else output_shape[3]
iwt_op.name = op.name + '_inverse_transform'
iwt_op.type = 'WinogradInverseTransform'
iwt_op.input.extend([matmul_output_name])
# Add Bias
if len(op.data) == 2:
bias_tensor_name = op.name + '_bias:0'
bias_data = op.data[1].reshape(-1)
self.add_tensor(bias_tensor_name, bias_data)
output_name = self.add_buffer_to_image(bias_tensor_name, "ARGUMENT")
iwt_op.input.extend([output_name])
final_op = op
final_op.output_shape_map[final_op.layer.top[0]] = output_shape
self.resolved_ops.add(op.name)
if len(self.ops_map[final_op.name].children) == 1 \
and self.ops_map[final_op.name].children[0].type in activation_name_map:
activation_op = self.ops_map[final_op.name].children[0]
fused_act_arg = iwt_op.arg.add()
fused_act_arg.name = 'activation'
fused_act_arg.s = activation_name_map[activation_op.type]
final_op = activation_op
final_op.output_shape_map[final_op.layer.top[0]] = output_shape
self.resolved_ops.add(activation_op.name)
iwt_op.output.extend([final_op.name+':0'])
self.add_output_shape(iwt_op, output_shape)
self.net_def.op.extend([wt_op, matmul_op, iwt_op])
  def convert_batchnorm(self, op):
    """Fold a caffe BatchNorm+Scale pair into one Mace FoldedBatchNorm op.

    The normalization statistics and the Scale layer's gamma/beta are
    pre-folded into a per-channel scale/offset tensor pair, so the
    runtime op is a single multiply-add.

    Raises:
      Exception: if the BatchNorm is not immediately followed by Scale.
      RuntimeError: if the stored moving-average factor is zero.
    """
    if len(op.children) != 1 or op.children[0].type != 'Scale':
      raise Exception('Now only support BatchNorm+Scale')
    op_def = self.CommonConvert(op, 'FoldedBatchNorm')
    scale_op = op.children[0]
    epsilon_value = op.layer.batch_norm_param.eps
    # op.data holds accumulated [mean, variance, moving-average factor];
    # divide by the factor to recover the actual mean/variance.
    if op.data[2][0] != 0:
      mean_value = (1. / op.data[2][0]) * op.data[0]
      var_value = (1. / op.data[2][0]) * op.data[1]
    else:
      raise RuntimeError('scalar is zero.')
    # Scale layer blobs: gamma, and optionally beta (defaults to zeros).
    gamma_value = scale_op.data[0]
    beta_value = np.zeros_like(mean_value)
    if len(scale_op.data) == 2:
      beta_value = scale_op.data[1]
    # Fold to y = scale * x + offset with
    # scale = gamma / sqrt(var + eps), offset = beta - mean * scale.
    scale_value = (
        (1.0 / np.vectorize(math.sqrt)(var_value + epsilon_value)) *
        gamma_value).reshape(-1)
    offset_value = ((-mean_value * scale_value) + beta_value).reshape(-1)
    input_names = [op.name+'_scale:0', op.name+'_offset:0']
    self.add_tensor(input_names[0], scale_value)
    self.add_tensor(input_names[1], offset_value)
    # GPU kernels read argument tensors through image buffers.
    if self.device == 'gpu':
      for name in input_names:
        output_name = self.add_buffer_to_image(name, "ARGUMENT")
        op_def.input.extend([output_name])
    else:
      op_def.input.extend([name for name in input_names])
    self.resolved_ops.add(op.name)
    self.resolved_ops.add(scale_op.name)
    final_op = scale_op
    output_shape = op.get_single_parent().output_shape_map[op.layer.bottom[0]]
    # Fuse a directly-following activation layer into this op.
    if len(self.ops_map[final_op.name].children) == 1 \
        and self.ops_map[final_op.name].children[0].type in activation_name_map:
      activation_op = self.ops_map[final_op.name].children[0]
      fused_act_arg = op_def.arg.add()
      fused_act_arg.name = 'activation'
      fused_act_arg.s = activation_name_map[activation_op.type]
      final_op = activation_op
      final_op.output_shape_map[final_op.layer.top[0]] = output_shape
      self.resolved_ops.add(activation_op.name)
    op_def.output.extend([final_op.name + ':0'])
    self.add_output_shape(op_def, output_shape)
    self.net_def.op.extend([op_def])
def convert_inner_product(self, op):
param = op.layer.inner_product_param
try:
if param.axis != 1 or param.transpose:
raise ValueError('Do not support non-default axis and transpose '
'case for innner product')
except AttributeError:
pass
op_def = self.CommonConvert(op, 'FC')
weight_tensor_name = op.name + '_weight:0'
if op.data[0].ndim not in [2, 4]:
raise ValueError('Unexpected weigth ndim.')
if op.data[0].ndim == 4 and list(op.data[0].shape[:2]) != [1, 1]:
raise ValueError('Do not support 4D weight with shape [1, 1, *, *]')
input_shape = op.get_single_parent().output_shape_map[op.layer.bottom[0]]
weight_data = op.data[0].reshape(-1, op.data[0].shape[-1])
assert weight_data.shape[1] == (input_shape[1] * input_shape[2] * input_shape[3])
if self.device != 'neon':
weight_data = weight_data.reshape(-1, input_shape[3], input_shape[1], input_shape[2])
weight_data = weight_data.transpose((0, 2, 3, 1)).reshape(weight_data.shape[0], -1)
self.add_tensor(weight_tensor_name, weight_data)
if self.device == 'gpu':
if (weight_data.shape[0] + 3) / 4 > OPENCL_IMAGE_MAX_SIZE \
and (weight_data.shape[1] + 3) / 4 > OPENCL_IMAGE_MAX_SIZE:
raise Exception('Mace gpu do not support FC with weight shape: '
+str(weight_data.shape))
if input_shape[3] % 4 == 0:
buffer_type = "WEIGHT_WIDTH"
else:
buffer_type = "WEIGHT_HEIGHT"
weight_type_arg = op_def.arg.add()
weight_type_arg.name = 'weight_type'
weight_type_arg.i = buffer_type_map['WEIGHT_HEIGHT']
if buffer_type == "WEIGHT_HEIGHT" and \
(weight_data.shape[0] + 3) / 4 > OPENCL_IMAGE_MAX_SIZE:
raise Exception('Mace gpu do not support FC with weight shape: '
+str(weight_data.shape))
output_name = self.add_buffer_to_image(weight_tensor_name, buffer_type)
op_def.input.extend([output_name])
else:
op_def.input.extend([weight_tensor_name])
# Add Bias
if len(op.data) == 2:
bias_tensor_name = op.name + '_bias:0'
bias_data = op.data[1].reshape(-1)
self.add_tensor(bias_tensor_name, bias_data)
if self.device == 'gpu':
output_name = self.add_buffer_to_image(bias_tensor_name, "ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([bias_tensor_name])
self.resolved_ops.add(op.name)
output_shape = Shapes.fully_connected_shape(input_shape, weight_data.shape)
op.output_shape_map[op.layer.top[0]] = output_shape
final_op = op
if len(self.ops_map[final_op.name].children) == 1 \
and self.ops_map[final_op.name].children[0].type in activation_name_map:
activation_op = self.ops_map[final_op.name].children[0]
fused_act_arg = op_def.arg.add()
fused_act_arg.name = 'activation'
fused_act_arg.s = activation_name_map[activation_op.type]
final_op = activation_op
final_op.output_shape_map[final_op.layer.top[0]] = output_shape
self.resolved_ops.add(activation_op.name)
op_def.output.extend([final_op.name + ':0'])
self.add_output_shape(op_def, output_shape)
self.net_def.op.extend([op_def])
  def convert_pooling(self, op):
    """Convert a caffe Pooling layer to a Mace Pooling op."""
    op_def = self.CommonConvert(op, 'Pooling')
    param = op.layer.pooling_param
    paddings, strides, kernels = self.add_stride_pad_kernel_arg(param, op_def)
    if param.pool == caffe_pb2.PoolingParameter.MAX:
      pooling_type = "MaxPool"
    elif param.pool == caffe_pb2.PoolingParameter.AVE:
      pooling_type = "AvgPool"
    pooling_type_arg = op_def.arg.add()
    pooling_type_arg.name = 'pooling_type'
    pooling_type_arg.i = pooling_type_mode[pooling_type]
    input_shape = op.get_single_parent().output_shape_map[op.layer.bottom[0]]
    # Global pooling: the kernel covers the whole spatial extent and the
    # effective kernel size is re-recorded in the op args.
    # NOTE(review): indices [1], [2] assume NHWC input; on neon the input
    # is NCHW, so this would pick channels/height -- confirm.
    if param.HasField('global_pooling') and param.global_pooling:
      kernels = [input_shape[1], input_shape[2]]
      kernel_arg = op_def.arg.add()
      kernel_arg.name = 'kernels'
      kernel_arg.ints.extend(kernels)
    filter_shape = [kernels[0], kernels[1], input_shape[3], input_shape[3]] \
        if self.device != 'neon' else \
        [input_shape[1], input_shape[1], kernels[0], kernels[1]]
    input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
    # Pooling rounds output spatial dims with ceil (vs floor for conv).
    output_shape = Shapes.conv_pool_shape(input_shape, filter_shape,
        paddings, strides, [1, 1], math.ceil, input_format)
    op.output_shape_map[op.layer.top[0]] = output_shape
    op_def.output.extend([op.name + ':0'])
    self.add_output_shape(op_def, output_shape)
    self.net_def.op.extend([op_def])
    self.resolved_ops.add(op.name)
def convert_activation(self, op):
op_def = self.CommonConvert(op, 'Activation')
activation_arg = op_def.arg.add()
activation_arg.name = 'activation'
activation_arg.s = activation_name_map[op.type]
op_def.output.extend([op.name + ':0'])
output_shape = op.get_single_parent().output_shape_map[op.layer.bottom[0]]
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_prelu(self, op):
op_def = self.CommonConvert(op, 'Activation')
activation_arg = op_def.arg.add()
activation_arg.name = 'activation'
activation_arg.s = 'PRELU'
alpha_tensor_name = op.name + '_alpha:0'
alpha_data = op.data[0].reshape(-1)
self.add_tensor(alpha_tensor_name, alpha_data)
if self.device == 'gpu':
output_name = self.add_buffer_to_image(alpha_tensor_name, "ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([alpha_tensor_name])
op_def.output.extend([op.name + ':0'])
output_shape = op.get_single_parent().output_shape_map[op.layer.bottom[0]]
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_add(self, op):
op_def = self.CommonConvert(op, 'AddN')
op_def.output.extend([op.name + ':0'])
output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_concat(self, op):
op_def = self.CommonConvert(op, 'Concat')
axis_arg = op_def.arg.add()
axis_arg.name = 'axis'
axis_arg.i = 3 if self.device != 'neon' else 1
try:
if op.layer.concat_param.HasFeild('axis'):
axis_arg.i = op.concat_param.axis
elif op.layer.concat_param.HasFeild('concat_dim'):
axis_arg.i = op.concat_param.concat_dim
except AttributeError:
pass
input_shapes = [] input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
for i in range(len(op.parents)): output_shape = Shapes.conv_pool_shape(
input_shapes.append(op.parents[i].output_shape_map[op.layer.bottom[i]]) op.get_single_parent().output_shape_map[op.layer.bottom[0]],
output_shape = Shapes.concat_shape(input_shapes, axis_arg.i) weight_data.shape, paddings, strides, dilations, math.floor,
op.output_shape_map[op.layer.top[0]] = output_shape input_format)
self.add_output_shape(op_def, output_shape) op.output_shape_map[op.layer.top[0]] = output_shape
op_def.output.extend([op.name + ':0'])
self.net_def.op.extend([op_def]) if len(self.ops_map[final_op.name].children) == 1 and \
self.resolved_ops.add(op.name) self.ops_map[final_op.name].children[0].type \
in activation_name_map:
def convert_eltwise(self, op): activation_op = self.ops_map[final_op.name].children[0]
op_def = self.CommonConvert(op, 'Eltwise') if not is_depthwise:
param = op.layer.eltwise_param op_def.type = "FusedConv2D"
type_arg = op_def.arg.add() fused_act_arg = op_def.arg.add()
type_arg.name = 'type' fused_act_arg.name = 'activation'
type_arg.i = param.operation fused_act_arg.s = activation_name_map[activation_op.type]
if len(param.coeff) > 0: final_op = activation_op
coeff_arg = op_def.arg.add() final_op.output_shape_map[final_op.layer.top[0]] = output_shape
coeff_arg.name = 'coeff' self.resolved_ops.add(activation_op.name)
coeff_arg.ints.extend(list(param.coeff))
op_def.output.extend([final_op.name + ':0'])
output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]] self.add_output_shape(op_def, output_shape)
op.output_shape_map[op.layer.top[0]] = output_shape self.net_def.op.extend([op_def])
self.add_output_shape(op_def, output_shape)
op_def.output.extend([op.name + ':0']) def check_winograd_conv(self, op):
self.net_def.op.extend([op_def]) # TODO: support winograd conv on neon
self.resolved_ops.add(op.name) if self.device == 'neon':
return False
def convert_slice(self, op): param = op.layer.convolution_param
op_def = self.CommonConvert(op, 'Slice') filter_shape = np.asarray(op.data[0].shape)
if op.layer.HasField('slice_param'): if self.device != 'neon':
param = op.layer.slice_param filter_shape = filter_shape[[2, 3, 0, 1]] # OIHW -> HWOI
if param.HasField('axis') and param.axis != 1: paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None)
raise Exception('Mace do not support slice with axis ' + str(param.axis))
if len(param.slice_point) > 0: dilations = [1, 1]
raise Exception('Mace do not support slice with slice_point') if len(param.dilation) > 0:
if len(param.dilation) == 1:
axis_arg = op_def.arg.add() dilations = [param.dilation[0], param.dilation[0]]
axis_arg.name = 'axis' elif len(param.dilation) == 2:
axis_arg.i = 3 if self.device != 'neon' else 1 dilations = [param.dilation[0], param.dilation[1]]
input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]] input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
num_outputs = len(op.layer.top) output_shape = Shapes.conv_pool_shape(
input_channels = input_shape[axis_arg.i] op.get_single_parent().output_shape_map[op.layer.bottom[0]],
if (input_channels % num_outputs) != 0 or \ filter_shape, paddings, strides, dilations, math.floor,
(self.device == 'gpu' and ((input_channels / num_outputs) % 4 != 0)): input_format)
raise Exception('Mace do not support slice with input shape ' width = output_shape[0] * ((output_shape[1] + 1) / 2) * ((
+ str(input_shape) + ' and number of output ' + str(num_outputs)) output_shape[2] + 1) / 2)
input_format = 'NCHW' if self.device == 'neon' else 'NHWC' if self.winograd and dilations[0] == 1 and \
output_shape = Shapes.slice_shape(input_shape, num_outputs, input_format) (dilations[0] == dilations[1]) and \
for i in range(len(op.layer.top)): (strides[0] == 1) and (strides[0] == strides[1]):
op.output_shape_map[op.layer.top[i]] = output_shape if self.device == 'gpu':
self.add_output_shape(op_def, output_shape) return filter_shape[0] == 3 and \
op_def.output.extend([op.name + '_' + str(i) + ':0']) (filter_shape[0] == filter_shape[1]) and \
self.net_def.op.extend([op_def]) (16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \
self.resolved_ops.add(op.name) (16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \
(width < OPENCL_IMAGE_MAX_SIZE)
def convert_normal_op(self, op): elif self.device == 'neon':
op_def = self.CommonConvert(op, op.type) return filter_shape[2] == 3 and (
output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]] filter_shape[2] == filter_shape[3])
op.output_shape_map[op.layer.top[0]] = output_shape return False
self.add_output_shape(op_def, output_shape)
op_def.output.extend([op.name + ':0']) def convert_winograd_conv(self, op):
self.net_def.op.extend([op_def]) # Add filter
self.resolved_ops.add(op.name) weight_tensor_name = op.name + '_weight:0'
self.add_tensor(weight_tensor_name, op.data[0])
def convert_reshape(self, op):
if self.device == 'neon': buffer_type = "WINOGRAD_FILTER"
op_def = self.CommonConvert(op, 'Reshape') filter_name = self.add_buffer_to_image(weight_tensor_name, buffer_type)
else:
op_def = self.CommonConvert(op, 'ReOrganize') param = op.layer.convolution_param
input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]] paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None)
output_shape = input_shape
shape_param = np.asarray(op.layer.reshape_param.shape.dim) filter_shape = np.asarray(op.data[0].shape)
if self.device != 'neon': if self.device != 'neon':
shape_param = shape_param[[0, 3, 1, 2]] filter_shape = filter_shape[[2, 3, 0, 1]] # OIHW -> HWOI
for i in range(len(shape_param)):
if shape_param[i] != 0: input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
output_shape[i] = shape_param[i] output_shape = Shapes.conv_pool_shape(
shape_arg = op_def.arg.add() op.get_single_parent().output_shape_map[op.layer.bottom[0]],
shape_arg.name = 'shape' filter_shape, paddings, strides, [1, 1], math.floor, input_format)
shape_arg.ints.extend(output_shape)
op.output_shape_map[op.layer.top[0]] = output_shape # Input transform
self.add_output_shape(op_def, output_shape) wt_op = mace_pb2.OperatorDef()
op_def.output.extend([op.name + ':0']) arg = wt_op.arg.add()
self.net_def.op.extend([op_def]) arg.name = 'T'
self.resolved_ops.add(op.name) arg.i = self.dt
padding_arg = wt_op.arg.add()
def convert_proposal_op(self, op): padding_arg.name = 'padding_values'
assert self.device == 'cpu' padding_arg.ints.extend(paddings)
op_def = self.CommonConvert(op, op.type) wt_op.name = op.name + '_input_transform'
if op.layer.HasField('proposal_param'): wt_op.type = 'WinogradTransform'
proposal_param = op.layer.proposal_param wt_op.input.extend([name + ':0' for name in self.inputs_map[op.name]])
feat_stride_arg = op_def.arg.add() wt_output_name = wt_op.name + ":0"
feat_stride_arg.name = 'feat_stride' wt_op.output.extend([wt_output_name])
feat_stride_arg.i = proposal_param.feat_stride wt_output_shape = mace_pb2.OutputShape()
scales_arg = op_def.arg.add() if self.device != 'neon':
scales_arg.name = 'scales' wt_output_width = output_shape[0] * ((
scales_arg.ints.extend(list(proposal_param.scales)) output_shape[1] + 1) / 2) * ((output_shape[2] + 1) / 2)
ratios_arg = op_def.arg.add() wt_output_shape.dims.extend(
ratios_arg.name = 'ratios' [16, filter_shape[3], wt_output_width, 1])
ratios_arg.floats.extend(list(proposal_param.ratios)) else:
output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]] wt_output_width = output_shape[0] * ((
op.output_shape_map[op.layer.top[0]] = output_shape output_shape[2] + 1) / 2) * ((output_shape[3] + 1) / 2)
self.add_output_shape(op_def, output_shape) wt_output_shape.dims.extend(
op_def.output.extend([op.name + ':0']) [16, filter_shape[1], wt_output_width, 1])
self.net_def.op.extend([op_def]) wt_op.output_shape.extend([wt_output_shape])
self.resolved_ops.add(op.name)
# MatMul
def convert_psroi_align(self, op): matmul_op = mace_pb2.OperatorDef()
assert self.device == 'cpu' arg = matmul_op.arg.add()
op_def = self.CommonConvert(op, op.type) arg.name = 'T'
if op.layer.HasField('psroi_align_param'): arg.i = self.dt
psroi_align_param = op.layer.psroi_align_param matmul_op.name = op.name + '_matmul'
spatial_scale_arg = op_def.arg.add() matmul_op.type = 'MatMul'
spatial_scale_arg.name = 'spatial_scale' matmul_op.input.extend([filter_name, wt_output_name])
spatial_scale_arg.f = psroi_align_param.spatial_scale matmul_output_name = matmul_op.name + ":0"
output_dim_arg = op_def.arg.add() matmul_op.output.extend([matmul_output_name])
output_dim_arg.name = 'output_dim' matmul_output_shape = mace_pb2.OutputShape()
output_dim_arg.i = psroi_align_param.output_dim if self.device != 'neon':
group_size_arg = op_def.arg.add() matmul_output_shape.dims.extend(
group_size_arg.name = 'group_size' [16, filter_shape[2], wt_output_width, 1])
group_size_arg.i = psroi_align_param.group_size else:
output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]] matmul_output_shape.dims.extend(
op.output_shape_map[op.layer.top[0]] = output_shape [16, filter_shape[0], wt_output_width, 1])
self.add_output_shape(op_def, output_shape) matmul_op.output_shape.extend([matmul_output_shape])
op_def.output.extend([op.name + ':0'])
self.net_def.op.extend([op_def]) # Inverse transform
self.resolved_ops.add(op.name) iwt_op = mace_pb2.OperatorDef()
arg = iwt_op.arg.add()
def replace_in_out_name(self, input_names, output_names): arg.name = 'T'
in_names = set([input_name + ":0" for input_name in input_names]) arg.i = self.dt
out_names = set([output_name + ":0" for output_name in output_names]) batch_arg = iwt_op.arg.add()
for op in self.net_def.op: batch_arg.name = 'batch'
for i in range(len(op.input)): batch_arg.i = output_shape[0]
if op.input[i] in in_names: height_arg = iwt_op.arg.add()
op.input[i] = MACE_INPUT_NODE_NAME + '_' + op.input[i] height_arg.name = 'height'
if op.input[i] in out_names: height_arg.i = output_shape[
op.input[i] = MACE_OUTPUT_NODE_NAME + '_' + op.input[i] 1] if self.device != 'neon' else output_shape[2]
for i in range(len(op.output)): width_arg = iwt_op.arg.add()
if op.output[i] in in_names: width_arg.name = 'width'
op.output[i] = MACE_INPUT_NODE_NAME + '_' + op.output[i] width_arg.i = output_shape[
if op.output[i] in out_names: 2] if self.device != 'neon' else output_shape[3]
op.output[i] = MACE_OUTPUT_NODE_NAME + '_' + op.output[i] iwt_op.name = op.name + '_inverse_transform'
iwt_op.type = 'WinogradInverseTransform'
def add_input_op_shape(self, input_nodes, input_shapes): iwt_op.input.extend([matmul_output_name])
assert len(input_nodes) == len(input_shapes)
for i in range(len(input_nodes)): # Add Bias
input_op = self.ops_map[input_nodes[i]] if len(op.data) == 2:
input_shape = input_shapes[i] if self.device != 'neon' else \ bias_tensor_name = op.name + '_bias:0'
[input_shapes[i][0], input_shapes[i][3], input_shapes[i][1], input_shapes[i][2]] bias_data = op.data[1].reshape(-1)
if input_op.layer is not None: self.add_tensor(bias_tensor_name, bias_data)
input_op.output_shape_map[input_op.layer.top[0]] = input_shape output_name = self.add_buffer_to_image(bias_tensor_name,
else: "ARGUMENT")
input_op.output_shape_map[input_op.name] = input_shape iwt_op.input.extend([output_name])
def add_neon_input_transform(self, names): final_op = op
for name in names: final_op.output_shape_map[final_op.layer.top[0]] = output_shape
new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
op_def.name = name
op_def.type = 'Transpose'
op_def.input.extend([new_input_name])
op_def.output.extend([name+':0'])
dims_arg = op_def.arg.add()
dims_arg.name = 'dims'
dims_arg.ints.extend([0, 3, 1, 2]) # NHWC -> NCHW
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
def add_neon_output_transform(self, names):
for name in names:
output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
op_def.name = output_name[:-2]
op_def.type = 'Transpose'
op_def.input.extend([name+':0'])
op_def.output.extend([output_name])
dims_arg = op_def.arg.add()
dims_arg.name = 'dims'
dims_arg.ints.extend([0, 2, 3, 1]) # NCHW -> NHWC
def convert(self, input_nodes, input_shapes, output_nodes):
if self.device == 'gpu':
self.add_input_transform(input_nodes)
if self.device == 'neon':
self.add_neon_input_transform(input_nodes)
assert self.ops[0].type == 'Input'
self.add_input_op_shape(input_nodes, input_shapes)
for op in self.ops:
if op.name in self.resolved_ops:
continue
if op.type == 'Input':
self.resolved_ops.add(op.name) self.resolved_ops.add(op.name)
elif op.type == 'Convolution':
if self.check_winograd_conv(op): if len(self.ops_map[final_op.name].children) == 1 and \
self.convert_winograd_conv(op) self.ops_map[final_op.name].children[0].type \
in activation_name_map:
activation_op = self.ops_map[final_op.name].children[0]
fused_act_arg = iwt_op.arg.add()
fused_act_arg.name = 'activation'
fused_act_arg.s = activation_name_map[activation_op.type]
final_op = activation_op
final_op.output_shape_map[final_op.layer.top[0]] = output_shape
self.resolved_ops.add(activation_op.name)
iwt_op.output.extend([final_op.name + ':0'])
self.add_output_shape(iwt_op, output_shape)
self.net_def.op.extend([wt_op, matmul_op, iwt_op])
def convert_batchnorm(self, op):
if len(op.children) != 1 or op.children[0].type != 'Scale':
raise Exception('Now only support BatchNorm+Scale')
op_def = self.CommonConvert(op, 'FoldedBatchNorm')
scale_op = op.children[0]
epsilon_value = op.layer.batch_norm_param.eps
if op.data[2][0] != 0:
mean_value = (1. / op.data[2][0]) * op.data[0]
var_value = (1. / op.data[2][0]) * op.data[1]
else: else:
self.convert_conv2d(op) raise RuntimeError('scalar is zero.')
elif op.type == 'BatchNorm':
self.convert_batchnorm(op) gamma_value = scale_op.data[0]
elif op.type == 'InnerProduct': beta_value = np.zeros_like(mean_value)
self.convert_inner_product(op) if len(scale_op.data) == 2:
elif op.type == 'Pooling': beta_value = scale_op.data[1]
self.convert_pooling(op)
elif op.type == 'PReLU': scale_value = ((
self.convert_prelu(op) 1.0 / np.vectorize(math.sqrt)(var_value + epsilon_value)) *
elif op.type in ['ReLU', 'Sigmoid', 'TanH']: gamma_value).reshape(-1)
self.convert_activation(op) offset_value = ((-mean_value * scale_value) + beta_value).reshape(-1)
elif op.type == 'Add': input_names = [op.name + '_scale:0', op.name + '_offset:0']
self.convert_add(op) self.add_tensor(input_names[0], scale_value)
elif op.type == 'Concat': self.add_tensor(input_names[1], offset_value)
self.convert_concat(op)
elif op.type == 'Eltwise': if self.device == 'gpu':
self.convert_eltwise(op) for name in input_names:
elif op.type == 'Slice': output_name = self.add_buffer_to_image(name, "ARGUMENT")
self.convert_slice(op) op_def.input.extend([output_name])
elif op.type == 'Reshape': else:
self.convert_reshape(op) op_def.input.extend([name for name in input_names])
elif op.type == 'Proposal':
self.convert_proposal_op(op) self.resolved_ops.add(op.name)
elif op.type == 'PSROIAlign': self.resolved_ops.add(scale_op.name)
self.convert_psroi_align(op) final_op = scale_op
elif op.type in ['Softmax']:
self.convert_normal_op(op) output_shape = op.get_single_parent().output_shape_map[op.layer.bottom[
else: 0]]
raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type))
if len(self.ops_map[final_op.name].children) == 1 \
if self.device == 'gpu': and self.ops_map[final_op.name].children[0].type \
self.add_output_transform(output_nodes) in activation_name_map:
activation_op = self.ops_map[final_op.name].children[0]
if self.device == 'cpu': fused_act_arg = op_def.arg.add()
self.replace_in_out_name(input_nodes, output_nodes) fused_act_arg.name = 'activation'
fused_act_arg.s = activation_name_map[activation_op.type]
if self.device == 'neon': final_op = activation_op
self.add_neon_output_transform(output_nodes) final_op.output_shape_map[final_op.layer.top[0]] = output_shape
self.resolved_ops.add(activation_op.name)
for op in self.ops:
if op.name not in self.resolved_ops: op_def.output.extend([final_op.name + ':0'])
print 'Unresolve Op: %s with type %s' % (op.name, op.type) self.add_output_shape(op_def, output_shape)
self.net_def.op.extend([op_def])
def convert_to_mace_pb(model_file, weight_file, input_node_str, input_shape_str, def convert_inner_product(self, op):
output_node_str, data_type, device, winograd): param = op.layer.inner_product_param
net_def = mace_pb2.NetDef() try:
dt = data_type_map[data_type] if param.axis != 1 or param.transpose:
raise ValueError(
caffe_net = caffe_pb2.NetParameter() 'Do not support non-default axis and transpose '
with open(model_file, "r") as f: 'case for innner product')
google.protobuf.text_format.Merge(str(f.read()), caffe_net) except AttributeError:
pass
weights = caffe_pb2.NetParameter()
with open(weight_file, "rb") as f: op_def = self.CommonConvert(op, 'FC')
weights.MergeFromString(f.read()) weight_tensor_name = op.name + '_weight:0'
if op.data[0].ndim not in [2, 4]:
input_nodes = [x for x in input_node_str.split(',')] raise ValueError('Unexpected weigth ndim.')
input_shapes = [] if op.data[0].ndim == 4 and list(op.data[0].shape[:2]) != [1, 1]:
if input_shape_str != "": raise ValueError(
input_shape_strs = [x for x in input_shape_str.split(':')] 'Do not support 4D weight with shape [1, 1, *, *]')
for shape_str in input_shape_strs: input_shape = op.get_single_parent().output_shape_map[op.layer.bottom[
input_shapes.extend([[int(x) for x in shape_str.split(',')]]) 0]]
output_nodes = [x for x in output_node_str.split(',')]
assert len(input_nodes) == len(input_shapes) weight_data = op.data[0].reshape(-1, op.data[0].shape[-1])
assert weight_data.shape[1] == (
converter = CaffeConverter(caffe_net, weights, net_def, dt, device, winograd) input_shape[1] * input_shape[2] * input_shape[3])
converter.convert(input_nodes, input_shapes, output_nodes) if self.device != 'neon':
print "PB Converted." weight_data = weight_data.reshape(-1, input_shape[3],
if device == 'gpu': input_shape[1], input_shape[2])
print "start optimize memory." weight_data = weight_data.transpose((0, 2, 3, 1)).reshape(
mem_optimizer = memory_optimizer.MemoryOptimizer(net_def) weight_data.shape[0], -1)
mem_optimizer.optimize() self.add_tensor(weight_tensor_name, weight_data)
print "Memory optimization done." if self.device == 'gpu':
if (weight_data.shape[0] + 3) / 4 > OPENCL_IMAGE_MAX_SIZE and \
return net_def (weight_data.shape[1] + 3) / 4 > OPENCL_IMAGE_MAX_SIZE:
raise Exception(
'Mace gpu do not support FC with weight shape: ' +
str(weight_data.shape))
if input_shape[3] % 4 == 0:
buffer_type = "WEIGHT_WIDTH"
else:
buffer_type = "WEIGHT_HEIGHT"
weight_type_arg = op_def.arg.add()
weight_type_arg.name = 'weight_type'
weight_type_arg.i = buffer_type_map['WEIGHT_HEIGHT']
if buffer_type == "WEIGHT_HEIGHT" and \
(weight_data.shape[0] + 3) / 4 > OPENCL_IMAGE_MAX_SIZE:
raise Exception(
'Mace gpu do not support FC with weight shape: ' +
str(weight_data.shape))
output_name = self.add_buffer_to_image(weight_tensor_name,
buffer_type)
op_def.input.extend([output_name])
else:
op_def.input.extend([weight_tensor_name])
# Add Bias
if len(op.data) == 2:
bias_tensor_name = op.name + '_bias:0'
bias_data = op.data[1].reshape(-1)
self.add_tensor(bias_tensor_name, bias_data)
if self.device == 'gpu':
output_name = self.add_buffer_to_image(bias_tensor_name,
"ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([bias_tensor_name])
self.resolved_ops.add(op.name)
output_shape = Shapes.fully_connected_shape(input_shape,
weight_data.shape)
op.output_shape_map[op.layer.top[0]] = output_shape
final_op = op
if len(self.ops_map[final_op.name].children) == 1 \
and self.ops_map[final_op.name].children[0].type \
in activation_name_map:
activation_op = self.ops_map[final_op.name].children[0]
fused_act_arg = op_def.arg.add()
fused_act_arg.name = 'activation'
fused_act_arg.s = activation_name_map[activation_op.type]
final_op = activation_op
final_op.output_shape_map[final_op.layer.top[0]] = output_shape
self.resolved_ops.add(activation_op.name)
op_def.output.extend([final_op.name + ':0'])
self.add_output_shape(op_def, output_shape)
self.net_def.op.extend([op_def])
    def convert_pooling(self, op):
        """Convert a caffe Pooling layer to a Mace Pooling op."""
        op_def = self.CommonConvert(op, 'Pooling')
        param = op.layer.pooling_param
        paddings, strides, kernels = self.add_stride_pad_kernel_arg(
            param, op_def)
        if param.pool == caffe_pb2.PoolingParameter.MAX:
            pooling_type = "MaxPool"
        elif param.pool == caffe_pb2.PoolingParameter.AVE:
            pooling_type = "AvgPool"
        pooling_type_arg = op_def.arg.add()
        pooling_type_arg.name = 'pooling_type'
        pooling_type_arg.i = pooling_type_mode[pooling_type]
        input_shape = op.get_single_parent().output_shape_map[op.layer.bottom[
            0]]
        # Global pooling: the kernel covers the whole spatial extent.
        # NOTE(review): indices [1], [2] assume NHWC input; on neon the
        # input is NCHW, so this would pick channels/height -- confirm.
        if param.HasField('global_pooling') and param.global_pooling:
            kernels = [input_shape[1], input_shape[2]]
            kernel_arg = op_def.arg.add()
            kernel_arg.name = 'kernels'
            kernel_arg.ints.extend(kernels)
        if self.device != 'neon':
            filter_shape = [
                kernels[0], kernels[1], input_shape[3], input_shape[3]
            ]
        else:
            filter_shape = [
                input_shape[1], input_shape[1], kernels[0], kernels[1]
            ]
        input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
        # Pooling rounds output spatial dims with ceil (vs floor for conv).
        output_shape = Shapes.conv_pool_shape(input_shape, filter_shape,
                                              paddings, strides, [1, 1],
                                              math.ceil, input_format)
        op.output_shape_map[op.layer.top[0]] = output_shape
        op_def.output.extend([op.name + ':0'])
        self.add_output_shape(op_def, output_shape)
        self.net_def.op.extend([op_def])
        self.resolved_ops.add(op.name)
def convert_activation(self, op):
op_def = self.CommonConvert(op, 'Activation')
activation_arg = op_def.arg.add()
activation_arg.name = 'activation'
activation_arg.s = activation_name_map[op.type]
op_def.output.extend([op.name + ':0'])
output_shape = op.get_single_parent().output_shape_map[op.layer.bottom[
0]]
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
    def convert_prelu(self, op):
        """Convert a PReLU layer to a Mace Activation op whose learned
        alpha tensor becomes an extra input."""
        op_def = self.CommonConvert(op, 'Activation')
        activation_arg = op_def.arg.add()
        activation_arg.name = 'activation'
        activation_arg.s = 'PRELU'
        # Flatten the per-channel slope parameters into a 1-D alpha tensor.
        alpha_tensor_name = op.name + '_alpha:0'
        alpha_data = op.data[0].reshape(-1)
        self.add_tensor(alpha_tensor_name, alpha_data)
        if self.device == 'gpu':
            # GPU kernels read argument tensors through image buffers.
            output_name = self.add_buffer_to_image(alpha_tensor_name,
                                                   "ARGUMENT")
            op_def.input.extend([output_name])
        else:
            op_def.input.extend([alpha_tensor_name])
        op_def.output.extend([op.name + ':0'])
        # Elementwise op: output shape mirrors the parent's output.
        output_shape = op.get_single_parent().output_shape_map[op.layer.bottom[
            0]]
        op.output_shape_map[op.layer.top[0]] = output_shape
        self.add_output_shape(op_def, output_shape)
        self.net_def.op.extend([op_def])
        self.resolved_ops.add(op.name)
    def convert_add(self, op):
        """Convert an elementwise Add layer to a Mace AddN op."""
        op_def = self.CommonConvert(op, 'AddN')
        op_def.output.extend([op.name + ':0'])
        # All addends share one shape; take it from the first parent.
        output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
        op.output_shape_map[op.layer.top[0]] = output_shape
        self.add_output_shape(op_def, output_shape)
        self.net_def.op.extend([op_def])
        self.resolved_ops.add(op.name)
def convert_concat(self, op):
op_def = self.CommonConvert(op, 'Concat')
axis_arg = op_def.arg.add()
axis_arg.name = 'axis'
axis_arg.i = 3 if self.device != 'neon' else 1
try:
if op.layer.concat_param.HasFeild('axis'):
axis_arg.i = op.concat_param.axis
elif op.layer.concat_param.HasFeild('concat_dim'):
axis_arg.i = op.concat_param.concat_dim
except AttributeError:
pass
input_shapes = []
for i in range(len(op.parents)):
input_shapes.append(
op.parents[i].output_shape_map[op.layer.bottom[i]])
output_shape = Shapes.concat_shape(input_shapes, axis_arg.i)
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
op_def.output.extend([op.name + ':0'])
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_eltwise(self, op):
op_def = self.CommonConvert(op, 'Eltwise')
param = op.layer.eltwise_param
type_arg = op_def.arg.add()
type_arg.name = 'type'
type_arg.i = param.operation
if len(param.coeff) > 0:
coeff_arg = op_def.arg.add()
coeff_arg.name = 'coeff'
coeff_arg.ints.extend(list(param.coeff))
output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
op_def.output.extend([op.name + ':0'])
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_slice(self, op):
op_def = self.CommonConvert(op, 'Slice')
if op.layer.HasField('slice_param'):
param = op.layer.slice_param
if param.HasField('axis') and param.axis != 1:
raise Exception(
'Mace do not support slice with axis ' + str(param.axis))
if len(param.slice_point) > 0:
raise Exception('Mace do not support slice with slice_point')
axis_arg = op_def.arg.add()
axis_arg.name = 'axis'
axis_arg.i = 3 if self.device != 'neon' else 1
input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
num_outputs = len(op.layer.top)
input_channels = input_shape[axis_arg.i]
if (input_channels % num_outputs) != 0 or \
(self.device == 'gpu' and
((input_channels / num_outputs) % 4 != 0)):
raise Exception(
'Mace do not support slice with input shape ' +
str(input_shape) + ' and number of output ' + str(num_outputs))
input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
output_shape = Shapes.slice_shape(input_shape, num_outputs,
input_format)
for i in range(len(op.layer.top)):
op.output_shape_map[op.layer.top[i]] = output_shape
self.add_output_shape(op_def, output_shape)
op_def.output.extend([op.name + '_' + str(i) + ':0'])
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_normal_op(self, op):
op_def = self.CommonConvert(op, op.type)
output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
op_def.output.extend([op.name + ':0'])
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_reshape(self, op):
if self.device == 'neon':
op_def = self.CommonConvert(op, 'Reshape')
else:
op_def = self.CommonConvert(op, 'ReOrganize')
input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
output_shape = input_shape
shape_param = np.asarray(op.layer.reshape_param.shape.dim)
if self.device != 'neon':
shape_param = shape_param[[0, 3, 1, 2]]
for i in range(len(shape_param)):
if shape_param[i] != 0:
output_shape[i] = shape_param[i]
shape_arg = op_def.arg.add()
shape_arg.name = 'shape'
shape_arg.ints.extend(output_shape)
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
op_def.output.extend([op.name + ':0'])
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_proposal_op(self, op):
assert self.device == 'cpu'
op_def = self.CommonConvert(op, op.type)
if op.layer.HasField('proposal_param'):
proposal_param = op.layer.proposal_param
feat_stride_arg = op_def.arg.add()
feat_stride_arg.name = 'feat_stride'
feat_stride_arg.i = proposal_param.feat_stride
scales_arg = op_def.arg.add()
scales_arg.name = 'scales'
scales_arg.ints.extend(list(proposal_param.scales))
ratios_arg = op_def.arg.add()
ratios_arg.name = 'ratios'
ratios_arg.floats.extend(list(proposal_param.ratios))
output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
op_def.output.extend([op.name + ':0'])
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_psroi_align(self, op):
assert self.device == 'cpu'
op_def = self.CommonConvert(op, op.type)
if op.layer.HasField('psroi_align_param'):
psroi_align_param = op.layer.psroi_align_param
spatial_scale_arg = op_def.arg.add()
spatial_scale_arg.name = 'spatial_scale'
spatial_scale_arg.f = psroi_align_param.spatial_scale
output_dim_arg = op_def.arg.add()
output_dim_arg.name = 'output_dim'
output_dim_arg.i = psroi_align_param.output_dim
group_size_arg = op_def.arg.add()
group_size_arg.name = 'group_size'
group_size_arg.i = psroi_align_param.group_size
output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
op_def.output.extend([op.name + ':0'])
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def replace_in_out_name(self, input_names, output_names):
in_names = set([input_name + ":0" for input_name in input_names])
out_names = set([output_name + ":0" for output_name in output_names])
for op in self.net_def.op:
for i in range(len(op.input)):
if op.input[i] in in_names:
op.input[i] = MACE_INPUT_NODE_NAME + '_' + op.input[i]
if op.input[i] in out_names:
op.input[i] = MACE_OUTPUT_NODE_NAME + '_' + op.input[i]
for i in range(len(op.output)):
if op.output[i] in in_names:
op.output[i] = MACE_INPUT_NODE_NAME + '_' + op.output[i]
if op.output[i] in out_names:
op.output[i] = MACE_OUTPUT_NODE_NAME + '_' + op.output[i]
def add_input_op_shape(self, input_nodes, input_shapes):
assert len(input_nodes) == len(input_shapes)
for i in range(len(input_nodes)):
input_op = self.ops_map[input_nodes[i]]
input_shape = input_shapes[i] if self.device != 'neon' else \
[input_shapes[i][0], input_shapes[i][3],
input_shapes[i][1], input_shapes[i][2]]
if input_op.layer is not None:
input_op.output_shape_map[input_op.layer.top[0]] = input_shape
else:
input_op.output_shape_map[input_op.name] = input_shape
def add_neon_input_transform(self, names):
for name in names:
new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
op_def.name = name
op_def.type = 'Transpose'
op_def.input.extend([new_input_name])
op_def.output.extend([name + ':0'])
dims_arg = op_def.arg.add()
dims_arg.name = 'dims'
dims_arg.ints.extend([0, 3, 1, 2]) # NHWC -> NCHW
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
def add_neon_output_transform(self, names):
for name in names:
output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
op_def.name = output_name[:-2]
op_def.type = 'Transpose'
op_def.input.extend([name + ':0'])
op_def.output.extend([output_name])
dims_arg = op_def.arg.add()
dims_arg.name = 'dims'
dims_arg.ints.extend([0, 2, 3, 1]) # NCHW -> NHWC
def convert(self, input_nodes, input_shapes, output_nodes):
if self.device == 'gpu':
self.add_input_transform(input_nodes)
if self.device == 'neon':
self.add_neon_input_transform(input_nodes)
assert self.ops[0].type == 'Input'
self.add_input_op_shape(input_nodes, input_shapes)
for op in self.ops:
if op.name in self.resolved_ops:
continue
if op.type == 'Input':
self.resolved_ops.add(op.name)
elif op.type == 'Convolution':
if self.check_winograd_conv(op):
self.convert_winograd_conv(op)
else:
self.convert_conv2d(op)
elif op.type == 'BatchNorm':
self.convert_batchnorm(op)
elif op.type == 'InnerProduct':
self.convert_inner_product(op)
elif op.type == 'Pooling':
self.convert_pooling(op)
elif op.type == 'PReLU':
self.convert_prelu(op)
elif op.type in ['ReLU', 'Sigmoid', 'TanH']:
self.convert_activation(op)
elif op.type == 'Add':
self.convert_add(op)
elif op.type == 'Concat':
self.convert_concat(op)
elif op.type == 'Eltwise':
self.convert_eltwise(op)
elif op.type == 'Slice':
self.convert_slice(op)
elif op.type == 'Reshape':
self.convert_reshape(op)
elif op.type == 'Proposal':
self.convert_proposal_op(op)
elif op.type == 'PSROIAlign':
self.convert_psroi_align(op)
elif op.type in ['Softmax']:
self.convert_normal_op(op)
else:
raise Exception('Unknown Op: %s, type: %s' % (op.name,
op.type))
if self.device == 'gpu':
self.add_output_transform(output_nodes)
if self.device == 'cpu':
self.replace_in_out_name(input_nodes, output_nodes)
if self.device == 'neon':
self.add_neon_output_transform(output_nodes)
for op in self.ops:
if op.name not in self.resolved_ops:
print 'Unresolve Op: %s with type %s' % (op.name, op.type)
def convert_to_mace_pb(model_file, weight_file, input_node_str,
input_shape_str, output_node_str, data_type, device,
winograd):
net_def = mace_pb2.NetDef()
dt = data_type_map[data_type]
caffe_net = caffe_pb2.NetParameter()
with open(model_file, "r") as f:
google.protobuf.text_format.Merge(str(f.read()), caffe_net)
weights = caffe_pb2.NetParameter()
with open(weight_file, "rb") as f:
weights.MergeFromString(f.read())
input_nodes = [x for x in input_node_str.split(',')]
input_shapes = []
if input_shape_str != "":
input_shape_strs = [x for x in input_shape_str.split(':')]
for shape_str in input_shape_strs:
input_shapes.extend([[int(x) for x in shape_str.split(',')]])
output_nodes = [x for x in output_node_str.split(',')]
assert len(input_nodes) == len(input_shapes)
converter = CaffeConverter(caffe_net, weights, net_def, dt, device,
winograd)
converter.convert(input_nodes, input_shapes, output_nodes)
print "PB Converted."
if device == 'gpu':
print "start optimize memory."
mem_optimizer = memory_optimizer.MemoryOptimizer(net_def)
mem_optimizer.optimize()
print "Memory optimization done."
return net_def
...@@ -26,4 +26,3 @@ def tf_dtype_2_mace_dtype(tf_dtype): ...@@ -26,4 +26,3 @@ def tf_dtype_2_mace_dtype(tf_dtype):
if not mace_dtype: if not mace_dtype:
raise Exception("Not supported tensorflow dtype: " + tf_dtype) raise Exception("Not supported tensorflow dtype: " + tf_dtype)
return mace_dtype return mace_dtype
...@@ -4,176 +4,166 @@ import hashlib ...@@ -4,176 +4,166 @@ import hashlib
import os.path import os.path
from mace.python.tools import source_converter_lib from mace.python.tools import source_converter_lib
# ./bazel-bin/mace/python/tools/tf_converter --model_file quantized_test.pb --output quantized_test_dsp.pb --runtime dsp --input_dim input_node,1,28,28,3 # ./bazel-bin/mace/python/tools/tf_converter --model_file quantized_test.pb \
# --output quantized_test_dsp.pb \
# --runtime dsp \
# --input_dim input_node,1,28,28,3
FLAGS = None FLAGS = None
def file_checksum(fname): def file_checksum(fname):
hash_func = hashlib.sha256() hash_func = hashlib.sha256()
with open(fname, "rb") as f: with open(fname, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""): for chunk in iter(lambda: f.read(4096), b""):
hash_func.update(chunk) hash_func.update(chunk)
return hash_func.hexdigest() return hash_func.hexdigest()
def main(unused_args): def main(unused_args):
if not os.path.isfile(FLAGS.model_file): if not os.path.isfile(FLAGS.model_file):
print("Input graph file '" + FLAGS.model_file + "' does not exist!") print("Input graph file '" + FLAGS.model_file + "' does not exist!")
sys.exit(-1) sys.exit(-1)
model_checksum = file_checksum(FLAGS.model_file) model_checksum = file_checksum(FLAGS.model_file)
if FLAGS.model_checksum != "" and FLAGS.model_checksum != model_checksum: if FLAGS.model_checksum != "" and FLAGS.model_checksum != model_checksum:
print("Model checksum mismatch: %s != %s" % (model_checksum, FLAGS.model_checksum)) print("Model checksum mismatch: %s != %s" % (model_checksum,
sys.exit(-1) FLAGS.model_checksum))
sys.exit(-1)
if FLAGS.platform == 'caffe':
if not os.path.isfile(FLAGS.weight_file): if FLAGS.platform == 'caffe':
print("Input weight file '" + FLAGS.weight_file + "' does not exist!") if not os.path.isfile(FLAGS.weight_file):
sys.exit(-1) print("Input weight file '" + FLAGS.weight_file +
"' does not exist!")
weight_checksum = file_checksum(FLAGS.weight_file) sys.exit(-1)
if FLAGS.weight_checksum != "" and FLAGS.weight_checksum != weight_checksum:
print("Weight checksum mismatch: %s != %s" % (weight_checksum, FLAGS.weight_checksum)) weight_checksum = file_checksum(FLAGS.weight_file)
sys.exit(-1) if FLAGS.weight_checksum != "" and \
FLAGS.weight_checksum != weight_checksum:
if FLAGS.runtime == 'dsp': print("Weight checksum mismatch: %s != %s" %
print("DSP not support caffe model yet.") (weight_checksum, FLAGS.weight_checksum))
sys.exit(-1) sys.exit(-1)
from mace.python.tools import caffe_converter_lib if FLAGS.runtime == 'dsp':
output_graph_def = caffe_converter_lib.convert_to_mace_pb( print("DSP not support caffe model yet.")
FLAGS.model_file, FLAGS.weight_file, FLAGS.input_node, FLAGS.input_shape, FLAGS.output_node, sys.exit(-1)
FLAGS.data_type, FLAGS.runtime, FLAGS.winograd)
elif FLAGS.platform == 'tensorflow': from mace.python.tools import caffe_converter_lib
if FLAGS.runtime == 'dsp': output_graph_def = caffe_converter_lib.convert_to_mace_pb(
from mace.python.tools import tf_dsp_converter_lib FLAGS.model_file, FLAGS.weight_file, FLAGS.input_node,
output_graph_def = tf_dsp_converter_lib.convert_to_mace_pb( FLAGS.input_shape, FLAGS.output_node, FLAGS.data_type,
FLAGS.model_file, FLAGS.input_node, FLAGS.output_node, FLAGS.dsp_mode) FLAGS.runtime, FLAGS.winograd)
elif FLAGS.platform == 'tensorflow':
if FLAGS.runtime == 'dsp':
from mace.python.tools import tf_dsp_converter_lib
output_graph_def = tf_dsp_converter_lib.convert_to_mace_pb(
FLAGS.model_file, FLAGS.input_node, FLAGS.output_node,
FLAGS.dsp_mode)
else:
from mace.python.tools import tf_converter_lib
output_graph_def = tf_converter_lib.convert_to_mace_pb(
FLAGS.model_file, FLAGS.input_node, FLAGS.input_shape,
FLAGS.output_node, FLAGS.data_type, FLAGS.runtime,
FLAGS.winograd)
if FLAGS.output_type == 'source':
source_converter_lib.convert_to_source(
output_graph_def, model_checksum, FLAGS.template, FLAGS.obfuscate,
FLAGS.model_tag, FLAGS.output, FLAGS.runtime,
FLAGS.embed_model_data)
else: else:
from mace.python.tools import tf_converter_lib with open(FLAGS.output, "wb") as f:
output_graph_def = tf_converter_lib.convert_to_mace_pb( f.write(output_graph_def.SerializeToString())
FLAGS.model_file, FLAGS.input_node, FLAGS.input_shape, FLAGS.output_node, with open(FLAGS.output + '_txt', "wb") as f:
FLAGS.data_type, FLAGS.runtime, FLAGS.winograd) # output_graph_def.ClearField('tensors')
f.write(str(output_graph_def))
if FLAGS.output_type == 'source': print("Model conversion is completed.")
source_converter_lib.convert_to_source(output_graph_def, model_checksum, FLAGS.template, FLAGS.obfuscate,
FLAGS.model_tag, FLAGS.output, FLAGS.runtime, FLAGS.embed_model_data)
else:
with open(FLAGS.output, "wb") as f:
f.write(output_graph_def.SerializeToString())
with open(FLAGS.output + '_txt', "wb") as f:
# output_graph_def.ClearField('tensors')
f.write(str(output_graph_def))
print("Model conversion is completed.")
def str2bool(v): def str2bool(v):
if v.lower() in ('yes', 'true', 't', 'y', '1'): if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'): elif v.lower() in ('no', 'false', 'f', 'n', '0'):
return False return False
else: else:
raise argparse.ArgumentTypeError('Boolean value expected.') raise argparse.ArgumentTypeError('Boolean value expected.')
def parse_args(): def parse_args():
"""Parses command line arguments.""" """Parses command line arguments."""
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.register("type", "bool", lambda v: v.lower() == "true") parser.register("type", "bool", lambda v: v.lower() == "true")
parser.add_argument( parser.add_argument(
"--model_file", "--model_file",
type=str, type=str,
default="", default="",
help="TensorFlow \'GraphDef\' file to load, Caffe prototxt file to load.") help="TensorFlow \'GraphDef\' file to load, "
parser.add_argument( "Caffe prototxt file to load.")
"--weight_file", parser.add_argument(
type=str, "--weight_file", type=str, default="", help="Caffe data file to load.")
default="", parser.add_argument(
help="Caffe data file to load.") "--model_checksum",
parser.add_argument( type=str,
"--model_checksum", default="",
type=str, help="Model file sha256 checksum")
default="", parser.add_argument(
help="Model file sha256 checksum") "--weight_checksum",
parser.add_argument( type=str,
"--weight_checksum", default="",
type=str, help="Weight file sha256 checksum")
default="", parser.add_argument(
help="Weight file sha256 checksum") "--output",
parser.add_argument( type=str,
"--output", default="",
type=str, help="File to save the output graph to.")
default="", parser.add_argument(
help="File to save the output graph to.") "--runtime", type=str, default="cpu", help="Runtime: cpu/gpu/dsp")
parser.add_argument( parser.add_argument(
"--runtime", "--input_node",
type=str, type=str,
default="cpu", default="input_node",
help="Runtime: cpu/gpu/dsp") help="e.g., input_node")
parser.add_argument( parser.add_argument(
"--input_node", "--output_node", type=str, default="softmax", help="e.g., softmax")
type=str, parser.add_argument(
default="input_node", "--data_type",
help="e.g., input_node") type=str,
parser.add_argument( default='DT_FLOAT',
"--output_node", help="e.g., DT_HALF/DT_FLOAT")
type=str, parser.add_argument(
default="softmax", "--output_type", type=str, default="pb", help="output type: source/pb")
help="e.g., softmax") parser.add_argument(
parser.add_argument( "--template", type=str, default="", help="template path")
"--data_type", parser.add_argument(
type=str, "--obfuscate",
default='DT_FLOAT', type=str2bool,
help="e.g., DT_HALF/DT_FLOAT") nargs='?',
parser.add_argument( const=False,
"--output_type", default=False,
type=str, help="obfuscate model names")
default="pb", parser.add_argument(
help="output type: source/pb") "--model_tag",
parser.add_argument( type=str,
"--template", default="",
type=str, help="model tag for generated function and namespace")
default="", parser.add_argument(
help="template path") "--winograd",
parser.add_argument( type=str2bool,
"--obfuscate", nargs='?',
type=str2bool, const=False,
nargs='?', default=False,
const=False, help="open winograd convolution or not")
default=False, parser.add_argument(
help="obfuscate model names") "--dsp_mode", type=int, default=0, help="dsp run mode, defalut=0")
parser.add_argument( parser.add_argument(
"--model_tag", "--input_shape", type=str, default="", help="input shape.")
type=str, parser.add_argument(
default="", "--platform", type=str, default="tensorflow", help="tensorflow/caffe")
help="model tag for generated function and namespace") parser.add_argument(
parser.add_argument( "--embed_model_data", type=str2bool, default=True, help="input shape.")
"--winograd", return parser.parse_known_args()
type=str2bool,
nargs='?',
const=False,
default=False,
help="open winograd convolution or not")
parser.add_argument(
"--dsp_mode",
type=int,
default=0,
help="dsp run mode, defalut=0")
parser.add_argument(
"--input_shape",
type=str,
default="",
help="input shape.")
parser.add_argument(
"--platform",
type=str,
default="tensorflow",
help="tensorflow/caffe")
parser.add_argument(
"--embed_model_data",
type=str2bool,
default=True,
help="input shape.")
return parser.parse_known_args()
if __name__ == '__main__': if __name__ == '__main__':
FLAGS, unparsed = parse_args() FLAGS, unparsed = parse_args()
main(unused_args=[sys.argv[0]] + unparsed) main(unused_args=[sys.argv[0]] + unparsed)
class DspOps(object): class DspOps(object):
def __init__(self): def __init__(self):
self.dsp_ops = { self.dsp_ops = {
'INPUT': 'INPUT"', 'INPUT': 'INPUT"',
'OUTPUT': 'OUTPUT', 'OUTPUT': 'OUTPUT',
'NoOp': 'Nop', 'NoOp': 'Nop',
'FLATTEN': 'Flatten', 'FLATTEN': 'Flatten',
'Identity': 'Nop', 'Identity': 'Nop',
'Placeholder': 'INPUT', 'Placeholder': 'INPUT',
'Const': 'Const', 'Const': 'Const',
'QuantizedConv2D': 'QuantizedConv2d_8x8to32', 'QuantizedConv2D': 'QuantizedConv2d_8x8to32',
'QuantizedMatMul': 'QuantizedMatMul_8x8to32', 'QuantizedMatMul': 'QuantizedMatMul_8x8to32',
'QuantizeDownAndShrinkRange': 'QuantizeDownAndShrinkRange_32to8', 'QuantizeDownAndShrinkRange': 'QuantizeDownAndShrinkRange_32to8',
'QuantizedRelu': 'QuantizedRelu_8', 'QuantizedRelu': 'QuantizedRelu_8',
'QuantizedReluX': 'QuantizedReluX_8', 'QuantizedReluX': 'QuantizedReluX_8',
'QuantizedMaxPool': 'QuantizedMaxPool_8', 'QuantizedMaxPool': 'QuantizedMaxPool_8',
'QuantizedAvgPool': 'QuantizedAvgPool_8', 'QuantizedAvgPool': 'QuantizedAvgPool_8',
'QuantizedConcat': 'QuantizedConcat_8', 'QuantizedConcat': 'QuantizedConcat_8',
'QuantizedBiasAdd': 'QuantizedBiasAdd_8p8to32', 'QuantizedBiasAdd': 'QuantizedBiasAdd_8p8to32',
'QuantizedResizeBilinear' : 'QuantizedResizeBilinear_8', 'QuantizedResizeBilinear': 'QuantizedResizeBilinear_8',
'QuantizedSpaceToBatchND': 'QuantizedSpaceToBatchND_8', 'QuantizedSpaceToBatchND': 'QuantizedSpaceToBatchND_8',
'QuantizedBatchToSpaceND': 'QuantizedBatchToSpaceND_8', 'QuantizedBatchToSpaceND': 'QuantizedBatchToSpaceND_8',
'QuantizedSoftmax': 'QuantizedSoftmax_8', 'QuantizedSoftmax': 'QuantizedSoftmax_8',
'QuantizedTanh': 'QuantizedTanh_8', 'QuantizedTanh': 'QuantizedTanh_8',
'Min': 'Min_f', 'Min': 'Min_f',
'Max': 'Max_f', 'Max': 'Max_f',
'QuantizeV2': 'Quantize', 'QuantizeV2': 'Quantize',
'Dequantize': 'Dequantize', 'Dequantize': 'Dequantize',
'Softmax': 'Softmax_f', 'Softmax': 'Softmax_f',
'Reshape': 'Reshape', 'Reshape': 'Reshape',
'QuantizedReshape': 'QuantizedReshape', 'QuantizedReshape': 'QuantizedReshape',
'Sigmoid': 'Sigmoid_f', 'Sigmoid': 'Sigmoid_f',
'Slice': 'Slice_f', 'Slice': 'Slice_f',
'Add': 'Add_f', 'Add': 'Add_f',
'Mul': 'Mul_f', 'Mul': 'Mul_f',
'Requantize': 'Requantize_32to8', 'Requantize': 'Requantize_32to8',
'RequantizationRange': 'RequantizationRange_32', 'RequantizationRange': 'RequantizationRange_32',
'Sub': 'Sub_f', 'Sub': 'Sub_f',
'Pack': 'Pack_int32', 'Pack': 'Pack_int32',
'StridedSlice': 'StridedSlice_f', 'StridedSlice': 'StridedSlice_f',
'ExpandDims': 'ExpandDims_f', 'ExpandDims': 'ExpandDims_f',
'QuantizedMul': 'QuantizedMul_8x8to32', 'QuantizedMul': 'QuantizedMul_8x8to32',
'QuantizedAdd': 'QuantizedAdd_8p8to32', 'QuantizedAdd': 'QuantizedAdd_8p8to32',
'Pad': 'Pad_f', 'Pad': 'Pad_f',
'SpaceToBatchND': 'SpaceToBatchND_f', 'SpaceToBatchND': 'SpaceToBatchND_f',
'BatchToSpaceND': 'BatchToSpaceND_f', 'BatchToSpaceND': 'BatchToSpaceND_f',
'ResizeBilinear': 'ResizeBilinear_f', 'ResizeBilinear': 'ResizeBilinear_f',
'ConcatV2': 'ConcatV2_f', 'ConcatV2': 'ConcatV2_f',
'Conv2DBackpropInput': 'Deconv_f', 'Conv2DBackpropInput': 'Deconv_f',
'Tanh': 'Tanh_f', 'Tanh': 'Tanh_f',
'Split': 'Split_f', 'Split': 'Split_f',
'Transpose': 'Transpose_f', 'Transpose': 'Transpose_f',
'Concat': 'Concat_f', 'Concat': 'Concat_f',
'AddN': 'AddN_f', 'AddN': 'AddN_f',
} }
def has_op(self, tf_op):
return tf_op in self.dsp_ops
def map_nn_op(self, tf_op):
if tf_op not in self.dsp_ops:
raise Exception('Could not map nn op for: ', tf_op)
return self.dsp_ops[tf_op]
def has_op(self, tf_op):
return tf_op in self.dsp_ops
def map_nn_op(self, tf_op):
if tf_op not in self.dsp_ops:
raise Exception('Could not map nn op for: ', tf_op)
return self.dsp_ops[tf_op]
...@@ -4,77 +4,81 @@ import sys ...@@ -4,77 +4,81 @@ import sys
import jinja2 import jinja2
# python encrypt_opencl_codegen.py --cl_kernel_dir=./mace/kernels/opencl/cl/ \ # python encrypt_opencl_codegen.py --cl_kernel_dir=./mace/kernels/opencl/cl/ \
# --output_path=./mace/codegen/opencl_encrypt/opencl_encrypted_program.cc # --output_path=./mace/codegen/opencl_encrypt/opencl_encrypted_program.cc
FLAGS = None FLAGS = None
encrypt_lookup_table = "Xiaomi-AI-Platform-Mace" encrypt_lookup_table = "Xiaomi-AI-Platform-Mace"
def encrypt_code(code_str): def encrypt_code(code_str):
encrypted_arr = [] encrypted_arr = []
for i in range(len(code_str)): for i in range(len(code_str)):
encrypted_char = hex(ord(code_str[i]) ^ ord(encrypt_lookup_table[i % len(encrypt_lookup_table)])) encrypted_char = hex(
encrypted_arr.append(encrypted_char) ord(code_str[i]) ^ ord(
return encrypted_arr encrypt_lookup_table[i % len(encrypt_lookup_table)]))
encrypted_arr.append(encrypted_char)
return encrypted_arr
def main(unused_args): def main(unused_args):
if not os.path.exists(FLAGS.cl_kernel_dir): if not os.path.exists(FLAGS.cl_kernel_dir):
print("Input cl_kernel_dir " + FLAGS.cl_kernel_dir + " doesn't exist!") print("Input cl_kernel_dir " + FLAGS.cl_kernel_dir + " doesn't exist!")
header_code = "" header_code = ""
for file_name in os.listdir(FLAGS.cl_kernel_dir): for file_name in os.listdir(FLAGS.cl_kernel_dir):
file_path = os.path.join(FLAGS.cl_kernel_dir, file_name) file_path = os.path.join(FLAGS.cl_kernel_dir, file_name)
if file_path[-2:] == ".h": if file_path[-2:] == ".h":
f = open(file_path, "r") f = open(file_path, "r")
header_code += f.read() header_code += f.read()
encrypted_code_maps = {} encrypted_code_maps = {}
for file_name in os.listdir(FLAGS.cl_kernel_dir): for file_name in os.listdir(FLAGS.cl_kernel_dir):
file_path = os.path.join(FLAGS.cl_kernel_dir, file_name) file_path = os.path.join(FLAGS.cl_kernel_dir, file_name)
if file_path[-3:] == ".cl": if file_path[-3:] == ".cl":
f = open(file_path, "r") f = open(file_path, "r")
code_str = "" code_str = ""
for line in f.readlines(): for line in f.readlines():
if "#include <common.h>" in line: if "#include <common.h>" in line:
code_str += header_code code_str += header_code
else: else:
code_str += line code_str += line
encrypted_code_arr = encrypt_code(code_str) encrypted_code_arr = encrypt_code(code_str)
encrypted_code_maps[file_name[:-3]] = encrypted_code_arr encrypted_code_maps[file_name[:-3]] = encrypted_code_arr
env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0])) env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0]))
cpp_cl_encrypted_kernel = env.get_template('str2vec_maps.cc.jinja2').render( cpp_cl_encrypted_kernel = env.get_template(
maps=encrypted_code_maps, 'str2vec_maps.cc.jinja2').render(
data_type='unsigned char', maps=encrypted_code_maps,
variable_name='kEncryptedProgramMap') data_type='unsigned char',
variable_name='kEncryptedProgramMap')
if os.path.isfile(FLAGS.output_path):
os.remove(FLAGS.output_path) if os.path.isfile(FLAGS.output_path):
w_file = open(FLAGS.output_path, "w") os.remove(FLAGS.output_path)
w_file.write(cpp_cl_encrypted_kernel) w_file = open(FLAGS.output_path, "w")
w_file.close() w_file.write(cpp_cl_encrypted_kernel)
w_file.close()
print("Generate encrypted opencl source done!")
print("Generate encrypted opencl source done!")
def parse_args(): def parse_args():
"""Parses command line arguments.""" """Parses command line arguments."""
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument( parser.add_argument(
"--cl_kernel_dir", "--cl_kernel_dir",
type=str, type=str,
default="./mace/kernels/opencl/cl/", default="./mace/kernels/opencl/cl/",
help="The cl kernels directory.") help="The cl kernels directory.")
parser.add_argument( parser.add_argument(
"--output_path", "--output_path",
type=str, type=str,
default="./mace/examples/codegen/opencl/opencl_encrypted_program.cc", default="./mace/examples/codegen/opencl/opencl_encrypted_program.cc",
help="The path of encrypted opencl kernels.") help="The path of encrypted opencl kernels.")
return parser.parse_known_args() return parser.parse_known_args()
if __name__ == '__main__': if __name__ == '__main__':
FLAGS, unparsed = parse_args() FLAGS, unparsed = parse_args()
main(unused_args=[sys.argv[0]] + unparsed) main(unused_args=[sys.argv[0]] + unparsed)
...@@ -2,18 +2,21 @@ import tensorflow as tf ...@@ -2,18 +2,21 @@ import tensorflow as tf
from mace.proto import mace_pb2 from mace.proto import mace_pb2
from collections import OrderedDict from collections import OrderedDict
def sort_tf_node(node, nodes_map, ordered_nodes_map): def sort_tf_node(node, nodes_map, ordered_nodes_map):
if node.name not in ordered_nodes_map: if node.name not in ordered_nodes_map:
for input_tensor_name in node.input: for input_tensor_name in node.input:
input_node_name = input_tensor_name.split(':')[ input_node_name = input_tensor_name.split(':')[
0] if ':' in input_tensor_name else input_tensor_name 0] if ':' in input_tensor_name else input_tensor_name
if input_node_name not in nodes_map or input_node_name in ordered_nodes_map: if input_node_name not in nodes_map or \
input_node_name in ordered_nodes_map:
continue continue
input_node = nodes_map[input_node_name] input_node = nodes_map[input_node_name]
sort_tf_node(input_node, nodes_map, ordered_nodes_map) sort_tf_node(input_node, nodes_map, ordered_nodes_map)
ordered_nodes_map[node.name] = node ordered_nodes_map[node.name] = node
def sort_tf_graph(graph_def): def sort_tf_graph(graph_def):
nodes_map = {} nodes_map = {}
ordered_nodes_map = OrderedDict() ordered_nodes_map = OrderedDict()
...@@ -31,13 +34,15 @@ def sort_mace_node(node, nodes_map, ordered_nodes_map): ...@@ -31,13 +34,15 @@ def sort_mace_node(node, nodes_map, ordered_nodes_map):
for input_tensor_name in node.input: for input_tensor_name in node.input:
input_node_name = input_tensor_name.split(':')[ input_node_name = input_tensor_name.split(':')[
0] if ':' in input_tensor_name else input_tensor_name 0] if ':' in input_tensor_name else input_tensor_name
if input_node_name not in nodes_map or input_node_name in ordered_nodes_map: if input_node_name not in nodes_map or \
input_node_name in ordered_nodes_map:
continue continue
input_node = nodes_map[input_node_name] input_node = nodes_map[input_node_name]
sort_mace_node(input_node, nodes_map, ordered_nodes_map) sort_mace_node(input_node, nodes_map, ordered_nodes_map)
ordered_nodes_map[node.name] = node ordered_nodes_map[node.name] = node
def sort_mace_graph(graph_def, output_name): def sort_mace_graph(graph_def, output_name):
nodes_map = {} nodes_map = {}
ordered_nodes_map = OrderedDict() ordered_nodes_map = OrderedDict()
......
...@@ -2,120 +2,131 @@ import sys ...@@ -2,120 +2,131 @@ import sys
import operator import operator
from mace.proto import mace_pb2 from mace.proto import mace_pb2
class MemoryOptimizer(object): class MemoryOptimizer(object):
def __init__(self, net_def): def __init__(self, net_def):
self.net_def = net_def self.net_def = net_def
self.idle_mem = set() self.idle_mem = set()
self.op_mem = {} # op_name->mem_id self.op_mem = {} # op_name->mem_id
self.mem_block = {} # mem_id->[x, y] self.mem_block = {} # mem_id->[x, y]
self.total_mem_count = 0 self.total_mem_count = 0
self.ref_counter = {} self.ref_counter = {}
consumers = {} consumers = {}
for op in net_def.op: for op in net_def.op:
if self.is_buffer_image_op(op): if self.is_buffer_image_op(op):
continue continue
for ipt in op.input: for ipt in op.input:
if ipt not in consumers: if ipt not in consumers:
consumers[ipt] = [] consumers[ipt] = []
consumers[ipt].append(op) consumers[ipt].append(op)
# only ref op's output tensor # only ref op's output tensor
for op in net_def.op: for op in net_def.op:
if self.is_buffer_image_op(op): if self.is_buffer_image_op(op):
continue continue
for output in op.output: for output in op.output:
tensor_name = output tensor_name = output
if tensor_name in consumers: if tensor_name in consumers:
self.ref_counter[tensor_name] = len(consumers[tensor_name]) self.ref_counter[tensor_name] = len(consumers[tensor_name])
else:
self.ref_counter[tensor_name] = 0
def is_buffer_image_op(self, op):
return op.type == 'BufferToImage' or op.type == 'ImageToBuffer'
def get_mem_size(self, op_type, output_shape):
mem_size = [0, 0]
if op_type == 'WinogradTransform' or op_type == 'MatMul':
mem_size[0] = output_shape[2] * output_shape[3]
mem_size[1] = output_shape[0] * int((output_shape[1] + 3) / 4)
else: else:
self.ref_counter[tensor_name] = 0 mem_size[0] = output_shape[2] * int((output_shape[3] + 3) / 4)
mem_size[1] = output_shape[0] * output_shape[1]
def is_buffer_image_op(self, op): return mem_size
return op.type == 'BufferToImage' or op.type == 'ImageToBuffer'
def mem_area(self, memory_size):
def get_mem_size(self, op_type, output_shape): return memory_size[0] * memory_size[1]
mem_size = [0, 0]
if op_type == 'WinogradTransform' or op_type == 'MatMul': def optimize(self):
mem_size[0] = output_shape[2] * output_shape[3] for op in self.net_def.op:
mem_size[1] = output_shape[0] * int((output_shape[1]+3)/4) if self.is_buffer_image_op(op):
else: continue
mem_size[0] = output_shape[2] * int((output_shape[3]+3)/4) if not op.output_shape:
mem_size[1] = output_shape[0] * output_shape[1] print('WARNING: There is no output shape information to '
return mem_size 'do memory optimization.')
return
def mem_area(self, memory_size): if len(op.output_shape) != len(op.output):
return memory_size[0] * memory_size[1] print('WARNING: the number of output shape is not equal to '
'the number of output.')
def optimize(self): return
for op in self.net_def.op: for i in range(len(op.output)):
if self.is_buffer_image_op(op): op_mem_size = self.get_mem_size(op.type,
continue op.output_shape[i].dims)
if not op.output_shape: mem_id = -1
print('WARNING: There is no output shape information to do memory optimization.') if len(self.idle_mem) > 0:
return best_mem_candidate_id = -1
if len(op.output_shape) != len(op.output): best_mem_candidate_delta_area = sys.maxint
print('WARNING: the number of output shape is not equal to the number of output.') best_mem_candidate_shape = []
return for mid in self.idle_mem:
for i in range(len(op.output)): reuse_mem_size = self.mem_block[mid]
op_mem_size = self.get_mem_size(op.type, op.output_shape[i].dims) resize_mem_size = [
mem_id = -1 max(reuse_mem_size[0], op_mem_size[0]),
if len(self.idle_mem) > 0: max(reuse_mem_size[1], op_mem_size[1])
best_mem_candidate_id = -1 ]
best_mem_candidate_delta_area = sys.maxint delta_mem_area = self.mem_area(
best_mem_candidate_shape = [] resize_mem_size) - self.mem_area(reuse_mem_size)
for mid in self.idle_mem: if delta_mem_area < best_mem_candidate_delta_area:
reuse_mem_size = self.mem_block[mid] best_mem_candidate_id = mid
resize_mem_size = [max(reuse_mem_size[0], op_mem_size[0]), max(reuse_mem_size[1], op_mem_size[1])] best_mem_candidate_delta_area = delta_mem_area
delta_mem_area = self.mem_area(resize_mem_size) - self.mem_area(reuse_mem_size) best_mem_candidate_shape = resize_mem_size
if delta_mem_area < best_mem_candidate_delta_area:
best_mem_candidate_id = mid if best_mem_candidate_delta_area <= self.mem_area(
best_mem_candidate_delta_area = delta_mem_area op_mem_size):
best_mem_candidate_shape = resize_mem_size # reuse
self.mem_block[
if best_mem_candidate_delta_area <= self.mem_area(op_mem_size): best_mem_candidate_id] = best_mem_candidate_shape
# reuse mem_id = best_mem_candidate_id
self.mem_block[best_mem_candidate_id] = best_mem_candidate_shape self.idle_mem.remove(mem_id)
mem_id = best_mem_candidate_id
self.idle_mem.remove(mem_id) if mem_id == -1:
mem_id = self.total_mem_count
if mem_id == -1: self.total_mem_count += 1
mem_id = self.total_mem_count self.mem_block[mem_id] = op_mem_size
self.total_mem_count += 1
self.mem_block[mem_id] = op_mem_size op.mem_id.extend([mem_id])
self.op_mem[op.output[i]] = mem_id
op.mem_id.extend([mem_id])
self.op_mem[op.output[i]] = mem_id # de-ref input tensor mem
for ipt in op.input:
# de-ref input tensor mem if ipt in self.ref_counter:
for ipt in op.input: self.ref_counter[ipt] -= 1
if ipt in self.ref_counter: if self.ref_counter[ipt] == 0:
self.ref_counter[ipt] -= 1 self.idle_mem.add(self.op_mem[ipt])
if self.ref_counter[ipt] == 0: elif self.ref_counter[ipt] < 0:
self.idle_mem.add(self.op_mem[ipt]) raise Exception('ref count is less than 0')
elif self.ref_counter[ipt] < 0:
raise Exception('ref count is less than 0') for mem in self.mem_block:
arena = self.net_def.mem_arena
for mem in self.mem_block: block = arena.mem_block.add()
arena = self.net_def.mem_arena block.mem_id = mem
block = arena.mem_block.add() block.x = self.mem_block[mem][0]
block.mem_id = mem block.y = self.mem_block[mem][1]
block.x = self.mem_block[mem][0]
block.y = self.mem_block[mem][1] print('total op: %d', len(self.net_def.op))
origin_mem_size = 0
print('total op: %d', len(self.net_def.op)) optimized_mem_size = 0
origin_mem_size = 0 for op in self.net_def.op:
optimized_mem_size = 0 if self.is_buffer_image_op(op):
for op in self.net_def.op: continue
if self.is_buffer_image_op(op): origin_mem_size += reduce(operator.mul, op.output_shape[0].dims, 1)
continue for mem in self.mem_block:
origin_mem_size += reduce(operator.mul, op.output_shape[0].dims, 1) print mem, self.mem_block[mem]
for mem in self.mem_block: optimized_mem_size += reduce(operator.mul, self.mem_block[mem], 4)
print mem, self.mem_block[mem]
optimized_mem_size += reduce(operator.mul, self.mem_block[mem], 4) print('origin mem: %d, optimized mem: %d', origin_mem_size,
optimized_mem_size)
print('origin mem: %d, optimized mem: %d', origin_mem_size, optimized_mem_size)
def optimize_memory(net_def): def optimize_memory(net_def):
mem_optimizer = MemoryOptimizer(net_def) mem_optimizer = MemoryOptimizer(net_def)
mem_optimizer.optimize() mem_optimizer.optimize()
...@@ -14,86 +14,89 @@ FLAGS = None ...@@ -14,86 +14,89 @@ FLAGS = None
def generate_cpp_source(): def generate_cpp_source():
maps = {} maps = {}
platform_info = '' platform_info = ''
binary_dirs = FLAGS.cl_binary_dirs.strip().split(",") binary_dirs = FLAGS.cl_binary_dirs.strip().split(",")
for binary_dir in binary_dirs: for binary_dir in binary_dirs:
binary_path = os.path.join(binary_dir, FLAGS.built_kernel_file_name) binary_path = os.path.join(binary_dir, FLAGS.built_kernel_file_name)
if not os.path.exists(binary_path): if not os.path.exists(binary_path):
continue continue
print 'generate opencl code from', binary_path print 'generate opencl code from', binary_path
with open(binary_path, "rb") as f: with open(binary_path, "rb") as f:
binary_array = np.fromfile(f, dtype=np.uint8) binary_array = np.fromfile(f, dtype=np.uint8)
idx = 0 idx = 0
size, = struct.unpack("Q", binary_array[idx:idx+8]) size, = struct.unpack("Q", binary_array[idx:idx + 8])
idx += 8 idx += 8
for _ in xrange(size): for _ in xrange(size):
key_size, = struct.unpack("i", binary_array[idx:idx+4]) key_size, = struct.unpack("i", binary_array[idx:idx + 4])
idx += 4 idx += 4
key, = struct.unpack(str(key_size) + "s", binary_array[idx:idx+key_size]) key, = struct.unpack(
idx += key_size str(key_size) + "s", binary_array[idx:idx + key_size])
value_size, = struct.unpack("i", binary_array[idx:idx+4]) idx += key_size
idx += 4 value_size, = struct.unpack("i", binary_array[idx:idx + 4])
maps[key] = [] idx += 4
value = struct.unpack(str(value_size) + "B", maps[key] = []
binary_array[idx:idx+value_size]) value = struct.unpack(
idx += value_size str(value_size) + "B", binary_array[idx:idx + value_size])
for ele in value: idx += value_size
maps[key].append(hex(ele)) for ele in value:
maps[key].append(hex(ele))
cl_platform_info_path = os.path.join(binary_dir, FLAGS.platform_info_file_name)
with open(cl_platform_info_path, 'r') as f: cl_platform_info_path = os.path.join(binary_dir,
curr_platform_info = f.read() FLAGS.platform_info_file_name)
if platform_info != "": with open(cl_platform_info_path, 'r') as f:
assert(curr_platform_info == platform_info) curr_platform_info = f.read()
platform_info = curr_platform_info if platform_info != "":
assert (curr_platform_info == platform_info)
env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0])) platform_info = curr_platform_info
return env.get_template('opencl_compiled_kernel.cc.jinja2').render(
maps = maps, env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0]))
data_type = 'unsigned char', return env.get_template('opencl_compiled_kernel.cc.jinja2').render(
variable_name = 'kCompiledProgramMap', maps=maps,
platform_info = platform_info, data_type='unsigned char',
) variable_name='kCompiledProgramMap',
platform_info=platform_info,
)
def main(unused_args): def main(unused_args):
cpp_cl_binary_source = generate_cpp_source() cpp_cl_binary_source = generate_cpp_source()
if os.path.isfile(FLAGS.output_path): if os.path.isfile(FLAGS.output_path):
os.remove(FLAGS.output_path) os.remove(FLAGS.output_path)
w_file = open(FLAGS.output_path, "w") w_file = open(FLAGS.output_path, "w")
w_file.write(cpp_cl_binary_source) w_file.write(cpp_cl_binary_source)
w_file.close() w_file.close()
def parse_args(): def parse_args():
"""Parses command line arguments.""" """Parses command line arguments."""
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument( parser.add_argument(
"--cl_binary_dirs", "--cl_binary_dirs",
type=str, type=str,
default="", default="",
help="The cl binaries directories.") help="The cl binaries directories.")
parser.add_argument( parser.add_argument(
"--built_kernel_file_name", "--built_kernel_file_name",
type=str, type=str,
default="", default="",
help="The cl binaries directories.") help="The cl binaries directories.")
parser.add_argument( parser.add_argument(
"--platform_info_file_name", "--platform_info_file_name",
type=str, type=str,
default="", default="",
help="The cl binaries directories.") help="The cl binaries directories.")
parser.add_argument( parser.add_argument(
"--output_path", "--output_path",
type=str, type=str,
default="./mace/examples/codegen/opencl/opencl_compiled_program.cc", default="./mace/examples/codegen/opencl/opencl_compiled_program.cc",
help="The path of generated C++ header file which contains cl binaries.") help="The path of generated C++ header file for cl binaries.")
return parser.parse_known_args() return parser.parse_known_args()
if __name__ == '__main__': if __name__ == '__main__':
FLAGS, unparsed = parse_args() FLAGS, unparsed = parse_args()
main(unused_args=[sys.argv[0]] + unparsed) main(unused_args=[sys.argv[0]] + unparsed)
...@@ -6,182 +6,196 @@ import hashlib ...@@ -6,182 +6,196 @@ import hashlib
from mace.proto import mace_pb2 from mace.proto import mace_pb2
from jinja2 import Environment, FileSystemLoader from jinja2 import Environment, FileSystemLoader
GENERATED_NAME = set() GENERATED_NAME = set()
def generate_obfuscated_name(namespace, name): def generate_obfuscated_name(namespace, name):
md5 = hashlib.md5() md5 = hashlib.md5()
md5.update(namespace) md5.update(namespace)
md5.update(name) md5.update(name)
md5_digest = md5.hexdigest() md5_digest = md5.hexdigest()
name = md5_digest[:8] name = md5_digest[:8]
while name in GENERATED_NAME: while name in GENERATED_NAME:
name = md5_digest name = md5_digest
assert name not in GENERATED_NAME assert name not in GENERATED_NAME
GENERATED_NAME.add(name) GENERATED_NAME.add(name)
return name return name
def generate_tensor_map(tensors): def generate_tensor_map(tensors):
tensor_map = {} tensor_map = {}
for t in tensors: for t in tensors:
if not tensor_map.has_key(t.name): if t.name not in tensor_map:
tensor_map[t.name] = generate_obfuscated_name("tensor", t.name) tensor_map[t.name] = generate_obfuscated_name("tensor", t.name)
return tensor_map return tensor_map
def generate_in_out_map(ops, tensor_map): def generate_in_out_map(ops, tensor_map):
in_out_map = {} in_out_map = {}
for op in ops: for op in ops:
op.name = generate_obfuscated_name("op", op.name) op.name = generate_obfuscated_name("op", op.name)
for input_name in op.input: for input_name in op.input:
if not in_out_map.has_key(input_name): if input_name not in in_out_map:
if tensor_map.has_key(input_name): if input_name in tensor_map:
in_out_map[input_name] = tensor_map[input_name] in_out_map[input_name] = tensor_map[input_name]
else: else:
in_out_map[input_name] = generate_obfuscated_name("in", input_name) in_out_map[input_name] = generate_obfuscated_name(
for output_name in op.output: "in", input_name)
if not in_out_map.has_key(output_name): for output_name in op.output:
if tensor_map.has_key(output_name): if output_name not in in_out_map:
in_out_map[output_name] = tensor_map[output_name] if output_name in tensor_map:
else: in_out_map[output_name] = tensor_map[output_name]
in_out_map[output_name] = generate_obfuscated_name("out", output_name) else:
return in_out_map in_out_map[output_name] = generate_obfuscated_name(
"out", output_name)
return in_out_map
def obfuscate_name(net_def): def obfuscate_name(net_def):
input_node = "mace_input_node" input_node = "mace_input_node"
output_node = "mace_output_node" output_node = "mace_output_node"
tensor_map = generate_tensor_map(net_def.tensors) tensor_map = generate_tensor_map(net_def.tensors)
in_out_map = generate_in_out_map(net_def.op, tensor_map) in_out_map = generate_in_out_map(net_def.op, tensor_map)
for t in net_def.tensors: for t in net_def.tensors:
if input_node not in t.name and output_node not in t.name: if input_node not in t.name and output_node not in t.name:
t.name = tensor_map[t.name] t.name = tensor_map[t.name]
for op in net_def.op: for op in net_def.op:
for i in range(len(op.input)): for i in range(len(op.input)):
if input_node not in op.input[i]: if input_node not in op.input[i]:
op.input[i] = in_out_map[op.input[i]] op.input[i] = in_out_map[op.input[i]]
for i in range(len(op.output)): for i in range(len(op.output)):
if output_node not in op.output[i]: if output_node not in op.output[i]:
op.output[i] = in_out_map[op.output[i]] op.output[i] = in_out_map[op.output[i]]
def rename_tensor(net_def): def rename_tensor(net_def):
tensor_map = {} tensor_map = {}
for t in net_def.tensors: for t in net_def.tensors:
if not tensor_map.has_key(t.name): if t.name not in tensor_map:
tensor_map[t.name] = "_" + t.name[:-2].replace("/", "_") tensor_map[t.name] = "_" + t.name[:-2].replace("/", "_")
t.name = tensor_map[t.name] t.name = tensor_map[t.name]
for op in net_def.op: for op in net_def.op:
for i in range(len(op.input)): for i in range(len(op.input)):
if tensor_map.has_key(op.input[i]): if op.input[i] in tensor_map:
op.input[i] = tensor_map[op.input[i]] op.input[i] = tensor_map[op.input[i]]
for i in range(len(op.output)): for i in range(len(op.output)):
if tensor_map.has_key(op.output[i]): if op.output[i] in tensor_map:
op.output[i] = tensor_map[op.output[i]] op.output[i] = tensor_map[op.output[i]]
class TensorInfo: class TensorInfo:
def __init__(self, id, t, runtime): def __init__(self, id, t, runtime):
self.id = id self.id = id
self.data_type = mace_pb2.DataType.Name(t.data_type) self.data_type = mace_pb2.DataType.Name(t.data_type)
if t.data_type == mace_pb2.DT_FLOAT: if t.data_type == mace_pb2.DT_FLOAT:
if runtime == 'gpu': if runtime == 'gpu':
self.data_type = mace_pb2.DT_HALF self.data_type = mace_pb2.DT_HALF
self.data = bytearray(np.array(t.float_data).astype(np.float16).tobytes()) self.data = bytearray(
else: np.array(t.float_data).astype(np.float16).tobytes())
self.data_type = mace_pb2.DT_FLOAT else:
self.data = bytearray(np.array(t.float_data).astype(np.float32).tobytes()) self.data_type = mace_pb2.DT_FLOAT
elif t.data_type == mace_pb2.DT_INT32: self.data = bytearray(
self.data = bytearray(np.array(t.int32_data).astype(np.int32).tobytes()) np.array(t.float_data).astype(np.float32).tobytes())
elif t.data_type == mace_pb2.DT_UINT8: elif t.data_type == mace_pb2.DT_INT32:
self.data = bytearray(np.array(t.int32_data).astype(np.uint8).tolist()) self.data = bytearray(
np.array(t.int32_data).astype(np.int32).tobytes())
elif t.data_type == mace_pb2.DT_UINT8:
self.data = bytearray(
np.array(t.int32_data).astype(np.uint8).tolist())
def stringfy(value): def stringfy(value):
return ', '.join('"{0}"'.format(w) for w in value) return ', '.join('"{0}"'.format(w) for w in value)
def convert_to_source(net_def, mode_pb_checksum, template_dir, obfuscate, model_tag, output, runtime, embed_model_data):
if obfuscate: def convert_to_source(net_def, mode_pb_checksum, template_dir, obfuscate,
obfuscate_name(net_def) model_tag, output, runtime, embed_model_data):
else: if obfuscate:
rename_tensor(net_def) obfuscate_name(net_def)
else:
# Capture our current directory rename_tensor(net_def)
print template_dir
# Capture our current directory
# Create the jinja2 environment. print template_dir
j2_env = Environment(loader=FileSystemLoader(template_dir), trim_blocks=True)
j2_env.filters['stringfy'] = stringfy # Create the jinja2 environment.
output_dir = os.path.dirname(output) + '/' j2_env = Environment(
# generate tensor source files loader=FileSystemLoader(template_dir), trim_blocks=True)
template_name = 'tensor_source.jinja2' j2_env.filters['stringfy'] = stringfy
model_data = [] output_dir = os.path.dirname(output) + '/'
offset = 0 # generate tensor source files
counter = 0 template_name = 'tensor_source.jinja2'
for t in net_def.tensors: model_data = []
tensor_info = TensorInfo(counter, t, runtime) offset = 0
# align counter = 0
if tensor_info.data_type != 'DT_UINT8' and offset % 4 != 0: for t in net_def.tensors:
padding = 4 - offset % 4 tensor_info = TensorInfo(counter, t, runtime)
model_data.extend(bytearray([0] * padding)) # align
offset += padding if tensor_info.data_type != 'DT_UINT8' and offset % 4 != 0:
padding = 4 - offset % 4
model_data.extend(bytearray([0] * padding))
offset += padding
source = j2_env.get_template(template_name).render(
tensor_info=tensor_info,
tensor=t,
tag=model_tag,
runtime=runtime,
offset=offset,
)
model_data.extend(tensor_info.data)
offset += len(tensor_info.data)
with open(output_dir + 'tensor' + str(counter) + '.cc', "wb") as f:
f.write(source)
counter += 1
# generate tensor data
template_name = 'tensor_data.jinja2'
source = j2_env.get_template(template_name).render( source = j2_env.get_template(template_name).render(
tensor_info = tensor_info, tag=model_tag,
tensor = t, embed_model_data=embed_model_data,
tag = model_tag, model_data_size=offset,
runtime = runtime, model_data=model_data)
offset = offset, with open(output_dir + 'tensor_data' + '.cc', "wb") as f:
) f.write(source)
model_data.extend(tensor_info.data) if not embed_model_data:
offset += len(tensor_info.data) f = open(output_dir + model_tag + '.data', "wb")
with open(output_dir + 'tensor' + str(counter) + '.cc', "wb") as f: f.write(bytearray(model_data))
f.write(source) f.close()
counter += 1
# generate op source files
# generate tensor data template_name = 'operator.jinja2'
template_name = 'tensor_data.jinja2' counter = 0
source = j2_env.get_template(template_name).render( op_size = len(net_def.op)
tag = model_tag, for start in range(0, op_size, 10):
embed_model_data = embed_model_data, source = j2_env.get_template(template_name).render(
model_data_size = offset, start=start,
model_data = model_data end=min(start + 10, op_size),
) net=net_def,
with open(output_dir + 'tensor_data' + '.cc', "wb") as f: tag=model_tag,
f.write(source) runtime=runtime,
if not embed_model_data: )
f = open(output_dir + model_tag + '.data', "wb") with open(output_dir + 'op' + str(counter) + '.cc', "wb") as f:
f.write(bytearray(model_data)) f.write(source)
f.close() counter += 1
# generate op source files # generate model source files
template_name = 'operator.jinja2' template_name = 'model.jinja2'
counter = 0 tensors = [
op_size = len(net_def.op) TensorInfo(i, net_def.tensors[i], runtime)
for start in range(0, op_size, 10): for i in range(len(net_def.tensors))
]
source = j2_env.get_template(template_name).render( source = j2_env.get_template(template_name).render(
start = start, tensors=tensors,
end = min(start+10, op_size), net=net_def,
net = net_def, tag=model_tag,
tag = model_tag, runtime=runtime,
runtime = runtime, model_pb_checksum=mode_pb_checksum)
) with open(output, "wb") as f:
with open(output_dir + 'op' + str(counter) + '.cc', "wb") as f: f.write(source)
f.write(source)
counter += 1 # generate model header file
template_name = 'model_header.jinja2'
# generate model source files source = j2_env.get_template(template_name).render(tag=model_tag, )
template_name = 'model.jinja2' with open(output_dir + model_tag + '.h', "wb") as f:
tensors = [TensorInfo(i, net_def.tensors[i], runtime) for i in range(len(net_def.tensors))] f.write(source)
source = j2_env.get_template(template_name).render(
tensors = tensors,
net = net_def,
tag = model_tag,
runtime = runtime,
model_pb_checksum = mode_pb_checksum
)
with open(output, "wb") as f:
f.write(source)
# generate model header file
template_name = 'model_header.jinja2'
source = j2_env.get_template(template_name).render(
tag = model_tag,
)
with open(output_dir + model_tag + '.h', "wb") as f:
f.write(source)
...@@ -8,51 +8,41 @@ from mace.python.tools import memory_optimizer ...@@ -8,51 +8,41 @@ from mace.python.tools import memory_optimizer
from tensorflow.core.framework import graph_pb2 from tensorflow.core.framework import graph_pb2
from tensorflow.core.framework import tensor_shape_pb2 from tensorflow.core.framework import tensor_shape_pb2
padding_mode = { padding_mode = {'VALID': 0, 'SAME': 1, 'FULL': 2}
'VALID': 0, pooling_type_mode = {'AvgPool': 1, 'MaxPool': 2}
'SAME': 1,
'FULL': 2
}
pooling_type_mode = {
'AvgPool': 1,
'MaxPool': 2
}
# the order should be the same as # the order should be the same as
# eltwise type's in mace/kernels/eltwise.h # eltwise type's in mace/kernels/eltwise.h
# and also cwise type's in mace/kernels/cwise.h # and also cwise type's in mace/kernels/cwise.h
# cuz these math ops should have compatible with "EltWise" and "CWise" # cuz these math ops should have compatible with "EltWise" and "CWise"
math_type_mode = { math_type_mode = {
'MUL': 0, 'MUL': 0,
'ADD': 1, 'ADD': 1,
'MAX': 2, 'MAX': 2,
'MIN': 3, 'MIN': 3,
'SUB': 4, 'SUB': 4,
'DIV': 5, 'DIV': 5,
'NEG': 6, 'NEG': 6,
'ABS': 7 'ABS': 7
} }
buffer_type_map = { buffer_type_map = {
'CONV2D_FILTER' : 0, 'CONV2D_FILTER': 0,
'IN_OUT_CHANNEL' : 1, 'IN_OUT_CHANNEL': 1,
'ARGUMENT' : 2, 'ARGUMENT': 2,
'IN_OUT_HEIGHT' : 3, 'IN_OUT_HEIGHT': 3,
'IN_OUT_WIDTH' : 4, 'IN_OUT_WIDTH': 4,
'WINOGRAD_FILTER' : 5, 'WINOGRAD_FILTER': 5,
'DW_CONV2D_FILTER' : 6, 'DW_CONV2D_FILTER': 6,
} }
data_type_map = { data_type_map = {'DT_HALF': mace_pb2.DT_HALF, 'DT_FLOAT': mace_pb2.DT_FLOAT}
'DT_HALF' : mace_pb2.DT_HALF,
'DT_FLOAT': mace_pb2.DT_FLOAT
}
activation_name_map = { activation_name_map = {
'Relu' : 'RELU', 'Relu': 'RELU',
'Sigmoid' : 'SIGMOID', 'Sigmoid': 'SIGMOID',
'Tanh' : 'TANH', 'Tanh': 'TANH',
'Relu6' : 'RELUX' 'Relu6': 'RELUX'
} }
BATCH_NORM_ORDER = ["Add", "Rsqrt", "Mul", "Mul", "Mul", "Sub", "Add"] BATCH_NORM_ORDER = ["Add", "Rsqrt", "Mul", "Mul", "Mul", "Sub", "Add"]
...@@ -62,1123 +52,1170 @@ MACE_OUTPUT_NODE_NAME = "mace_output_node" ...@@ -62,1123 +52,1170 @@ MACE_OUTPUT_NODE_NAME = "mace_output_node"
OPENCL_IMAGE_MAX_SIZE = 16384 OPENCL_IMAGE_MAX_SIZE = 16384
def get_input_tensor(op, index): def get_input_tensor(op, index):
input_tensor = op.inputs[index] input_tensor = op.inputs[index]
if input_tensor.op.type == 'Reshape': if input_tensor.op.type == 'Reshape':
input_tensor = get_input_tensor(input_tensor.op, 0) input_tensor = get_input_tensor(input_tensor.op, 0)
return input_tensor return input_tensor
class TFConverter(object): class TFConverter(object):
def __init__(self, tf_ops, net_def, dt, device, winograd): def __init__(self, tf_ops, net_def, dt, device, winograd):
self.net_def = net_def self.net_def = net_def
self.tf_ops = tf_ops self.tf_ops = tf_ops
self.dt = dt self.dt = dt
self.device = device self.device = device
self.winograd = winograd self.winograd = winograd
self.tf_graph = {} self.tf_graph = {}
self.tf_parents = {} self.tf_parents = {}
self.resolved_ops = {} self.resolved_ops = {}
self.unused_tensor = set() self.unused_tensor = set()
self.transpose_filter_tensor = {} self.transpose_filter_tensor = {}
self.reshape_tensor = {} self.reshape_tensor = {}
self.ops = {} self.ops = {}
for op in tf_ops: for op in tf_ops:
self.ops[op.name] = op self.ops[op.name] = op
for op in tf_ops: for op in tf_ops:
self.resolved_ops[op.name] = 0 self.resolved_ops[op.name] = 0
for input in op.inputs: for input in op.inputs:
input_name = input.name[:-2] input_name = input.name[:-2]
if input_name not in self.tf_graph: if input_name not in self.tf_graph:
self.tf_graph[input_name] = [] self.tf_graph[input_name] = []
self.tf_graph[input_name].append(op) self.tf_graph[input_name].append(op)
if op.name not in self.tf_parents: if op.name not in self.tf_parents:
self.tf_parents[op.name] = [] self.tf_parents[op.name] = []
self.tf_parents[op.name].append(self.ops[input_name]) self.tf_parents[op.name].append(self.ops[input_name])
def add_buffer_to_image(self, input_name, input_type): def add_buffer_to_image(self, input_name, input_type):
output_name = input_name[:-2] + "_b2i" + input_name[-2:] output_name = input_name[:-2] + "_b2i" + input_name[-2:]
op_def = self.net_def.op.add() op_def = self.net_def.op.add()
op_def.name = output_name[:-2] op_def.name = output_name[:-2]
op_def.type = 'BufferToImage' op_def.type = 'BufferToImage'
op_def.input.extend([input_name]) op_def.input.extend([input_name])
op_def.output.extend([output_name]) op_def.output.extend([output_name])
arg = op_def.arg.add() arg = op_def.arg.add()
arg.name = 'buffer_type' arg.name = 'buffer_type'
arg.i = buffer_type_map[input_type] arg.i = buffer_type_map[input_type]
arg = op_def.arg.add() arg = op_def.arg.add()
arg.name = 'mode' arg.name = 'mode'
arg.i = 0 arg.i = 0
arg = op_def.arg.add() arg = op_def.arg.add()
arg.name = 'T' arg.name = 'T'
arg.i = self.dt arg.i = self.dt
return output_name return output_name
def add_image_to_buffer(self, input_name, input_type): def add_image_to_buffer(self, input_name, input_type):
output_name = input_name[:-2] + "_i2b" + input_name[-2:] output_name = input_name[:-2] + "_i2b" + input_name[-2:]
op_def = self.net_def.op.add() op_def = self.net_def.op.add()
op_def.name = output_name[:-2] op_def.name = output_name[:-2]
op_def.type = 'ImageToBuffer' op_def.type = 'ImageToBuffer'
op_def.input.extend([input_name]) op_def.input.extend([input_name])
op_def.output.extend([output_name]) op_def.output.extend([output_name])
arg = op_def.arg.add() arg = op_def.arg.add()
arg.name = 'buffer_type' arg.name = 'buffer_type'
arg.i = buffer_type_map[input_type] arg.i = buffer_type_map[input_type]
arg = op_def.arg.add() arg = op_def.arg.add()
arg.name = 'T' arg.name = 'T'
arg.i = self.dt arg.i = self.dt
return output_name return output_name
def add_gpu_input_transform(self, names): def add_gpu_input_transform(self, names):
for name in names: for name in names:
new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0" new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add() op_def = self.net_def.op.add()
op_def.name = name op_def.name = name
op_def.type = 'BufferToImage' op_def.type = 'BufferToImage'
op_def.input.extend([new_input_name]) op_def.input.extend([new_input_name])
op_def.output.extend([name+':0']) op_def.output.extend([name + ':0'])
epsilon_arg = op_def.arg.add() epsilon_arg = op_def.arg.add()
epsilon_arg.name = 'buffer_type' epsilon_arg.name = 'buffer_type'
epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL'] epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL']
arg = op_def.arg.add() arg = op_def.arg.add()
arg.name = 'T' arg.name = 'T'
arg.i = self.dt arg.i = self.dt
def add_neon_input_transform(self, names): def add_neon_input_transform(self, names):
for name in names: for name in names:
new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0" new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add() op_def = self.net_def.op.add()
op_def.name = name op_def.name = name
op_def.type = 'Transpose' op_def.type = 'Transpose'
op_def.input.extend([new_input_name]) op_def.input.extend([new_input_name])
op_def.output.extend([name+':0']) op_def.output.extend([name + ':0'])
dims_arg = op_def.arg.add() dims_arg = op_def.arg.add()
dims_arg.name = 'dims' dims_arg.name = 'dims'
dims_arg.ints.extend([0, 3, 1, 2]) dims_arg.ints.extend([0, 3, 1, 2])
arg = op_def.arg.add() arg = op_def.arg.add()
arg.name = 'T' arg.name = 'T'
arg.i = self.dt arg.i = self.dt
def add_gpu_output_transform(self, names): def add_gpu_output_transform(self, names):
for name in names: for name in names:
output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0" output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add() op_def = self.net_def.op.add()
op_def.name = output_name[:-2] op_def.name = output_name[:-2]
op_def.type = 'ImageToBuffer' op_def.type = 'ImageToBuffer'
op_def.input.extend([name+':0']) op_def.input.extend([name + ':0'])
op_def.output.extend([output_name]) op_def.output.extend([output_name])
epsilon_arg = op_def.arg.add() epsilon_arg = op_def.arg.add()
epsilon_arg.name = 'buffer_type' epsilon_arg.name = 'buffer_type'
epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL'] epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL']
def add_neon_output_transform(self, names): def add_neon_output_transform(self, names):
for name in names: for name in names:
output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0" output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add() op_def = self.net_def.op.add()
op_def.name = output_name[:-2] op_def.name = output_name[:-2]
op_def.type = 'Transpose' op_def.type = 'Transpose'
op_def.input.extend([name+':0']) op_def.input.extend([name + ':0'])
op_def.output.extend([output_name]) op_def.output.extend([output_name])
dims_arg = op_def.arg.add() dims_arg = op_def.arg.add()
dims_arg.name = 'dims' dims_arg.name = 'dims'
dims_arg.ints.extend([0, 2, 3, 1]) dims_arg.ints.extend([0, 2, 3, 1])
@staticmethod @staticmethod
def add_output_shape(outputs, op): def add_output_shape(outputs, op):
output_shapes = [] output_shapes = []
for output in outputs: for output in outputs:
output_shape = mace_pb2.OutputShape() output_shape = mace_pb2.OutputShape()
if isinstance(output, list): if isinstance(output, list):
output_shape.dims.extend(output) output_shape.dims.extend(output)
elif isinstance(output, tf.Tensor): elif isinstance(output, tf.Tensor):
if output.shape.num_elements() is not None: if output.shape.num_elements() is not None:
output_shape.dims.extend(output.shape.as_list()) output_shape.dims.extend(output.shape.as_list())
else: else:
raise ValueError('output type not supported: ', type(output)) raise ValueError('output type not supported: ', type(output))
output_shapes.append(output_shape) output_shapes.append(output_shape)
op.output_shape.extend(output_shapes) op.output_shape.extend(output_shapes)
def add_tensor(self, name, shape, tf_dt, value): def add_tensor(self, name, shape, tf_dt, value):
tensor = self.net_def.tensors.add() tensor = self.net_def.tensors.add()
tensor.name = name tensor.name = name
shape = list(shape) shape = list(shape)
tensor.dims.extend(shape) tensor.dims.extend(shape)
if tf_dt == tf.float32: if tf_dt == tf.float32:
tensor.data_type = mace_pb2.DT_FLOAT tensor.data_type = mace_pb2.DT_FLOAT
tensor.float_data.extend(value.flat) tensor.float_data.extend(value.flat)
elif tf_dt == tf.int32: elif tf_dt == tf.int32:
tensor.data_type = mace_pb2.DT_INT32 tensor.data_type = mace_pb2.DT_INT32
tensor.int32_data.extend(value.flat) tensor.int32_data.extend(value.flat)
else: else:
raise Exception("Not supported tensor type: " + tf_dt.name) raise Exception("Not supported tensor type: " + tf_dt.name)
def convert_reshape(self, op): def convert_reshape(self, op):
input_tensor = get_input_tensor(op, 0) input_tensor = get_input_tensor(op, 0)
shape_tensor = get_input_tensor(op, 1) shape_tensor = get_input_tensor(op, 1)
shape_value = shape_tensor.eval().astype(np.int32) shape_value = shape_tensor.eval().astype(np.int32)
self.unused_tensor.add(shape_tensor.name) self.unused_tensor.add(shape_tensor.name)
self.reshape_tensor[input_tensor.name] = shape_value self.reshape_tensor[input_tensor.name] = shape_value
self.resolved_ops[op.name] = 1 self.resolved_ops[op.name] = 1
def convert_tensor(self, op): def convert_tensor(self, op):
output_name = op.outputs[0].name output_name = op.outputs[0].name
if output_name not in self.unused_tensor: if output_name not in self.unused_tensor:
tensor = self.net_def.tensors.add() tensor = self.net_def.tensors.add()
tf_tensor = op.outputs[0].eval() tf_tensor = op.outputs[0].eval()
if output_name in self.transpose_filter_tensor: if output_name in self.transpose_filter_tensor:
tf_tensor = tf_tensor.transpose(self.transpose_filter_tensor[output_name]) tf_tensor = tf_tensor.transpose(
if output_name in self.reshape_tensor: self.transpose_filter_tensor[output_name])
tf_tensor = tf_tensor.reshape(self.reshape_tensor[output_name]) if output_name in self.reshape_tensor:
tensor.name = op.outputs[0].name tf_tensor = tf_tensor.reshape(self.reshape_tensor[output_name])
tensor.name = op.outputs[0].name
shape = list(tf_tensor.shape)
tensor.dims.extend(shape) shape = list(tf_tensor.shape)
tensor.dims.extend(shape)
tf_dt = op.get_attr('dtype')
if tf_dt == tf.float32: tf_dt = op.get_attr('dtype')
tensor.data_type = mace_pb2.DT_FLOAT if tf_dt == tf.float32:
tensor.float_data.extend(tf_tensor.astype(np.float32).flat) tensor.data_type = mace_pb2.DT_FLOAT
elif tf_dt == tf.int32: tensor.float_data.extend(tf_tensor.astype(np.float32).flat)
tensor.data_type = mace_pb2.DT_INT32 elif tf_dt == tf.int32:
tensor.int32_data.extend(tf_tensor.astype(np.int32).flat) tensor.data_type = mace_pb2.DT_INT32
else: tensor.int32_data.extend(tf_tensor.astype(np.int32).flat)
raise Exception("Not supported tensor type: " + tf_dt.name) else:
self.resolved_ops[op.name] = 1 raise Exception("Not supported tensor type: " + tf_dt.name)
self.resolved_ops[op.name] = 1
def check_winograd_conv(self, op):
filter_shape = get_input_tensor(op, 1).shape.as_list() def check_winograd_conv(self, op):
strides = op.get_attr('strides')[1:3] filter_shape = get_input_tensor(op, 1).shape.as_list()
output_shape = op.outputs[0].shape.as_list() strides = op.get_attr('strides')[1:3]
if len(output_shape) == 0 or output_shape[0] is None: output_shape = op.outputs[0].shape.as_list()
return False if len(output_shape) == 0 or output_shape[0] is None:
width = output_shape[0] * ((output_shape[1] + 1)/2) * ((output_shape[2]+1)/2) return False
return self.winograd and op.type != 'DepthwiseConv2dNative' and self.device == 'gpu' and \ width = output_shape[0] * ((output_shape[1] + 1) / 2) * ((
filter_shape[0] == 3 and (filter_shape[0] == filter_shape[1]) and \ output_shape[2] + 1) / 2)
return self.winograd and op.type != 'DepthwiseConv2dNative' and \
self.device == 'gpu' and filter_shape[0] == 3 and \
(filter_shape[0] == filter_shape[1]) and \
(strides[0] == 1) and (strides[0] == strides[1]) and \ (strides[0] == 1) and (strides[0] == strides[1]) and \
(16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \ (16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \
(16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \ (16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \
(width < OPENCL_IMAGE_MAX_SIZE) (width < OPENCL_IMAGE_MAX_SIZE)
def convert_winograd_conv(self, op): def convert_winograd_conv(self, op):
filter_tensor = get_input_tensor(op, 1) filter_tensor = get_input_tensor(op, 1)
filter_shape = filter_tensor.shape.as_list() filter_shape = filter_tensor.shape.as_list()
output_shape = op.outputs[0].shape.as_list() output_shape = op.outputs[0].shape.as_list()
self.transpose_filter_tensor[filter_tensor.name] = (3, 2, 0, 1) self.transpose_filter_tensor[filter_tensor.name] = (3, 2, 0, 1)
filter_name = self.add_buffer_to_image(op.inputs[1].name, "WINOGRAD_FILTER") filter_name = self.add_buffer_to_image(op.inputs[1].name,
"WINOGRAD_FILTER")
# Input transform
wt_op = mace_pb2.OperatorDef() # Input transform
arg = wt_op.arg.add() wt_op = mace_pb2.OperatorDef()
arg.name = 'T' arg = wt_op.arg.add()
arg.i = self.dt arg.name = 'T'
padding_arg = wt_op.arg.add() arg.i = self.dt
padding_arg.name = 'padding' padding_arg = wt_op.arg.add()
padding_arg.i = padding_mode[op.get_attr('padding')] padding_arg.name = 'padding'
wt_op.name = op.name + '_input_transform' padding_arg.i = padding_mode[op.get_attr('padding')]
wt_op.type = 'WinogradTransform' wt_op.name = op.name + '_input_transform'
wt_op.input.extend([op.inputs[0].name]) wt_op.type = 'WinogradTransform'
wt_output_name = wt_op.name + ":0" wt_op.input.extend([op.inputs[0].name])
wt_op.output.extend([wt_output_name]) wt_output_name = wt_op.name + ":0"
wt_output_shape = mace_pb2.OutputShape() wt_op.output.extend([wt_output_name])
wt_output_width = output_shape[0] * ((output_shape[1] + 1)/2) * ((output_shape[2]+1)/2) wt_output_shape = mace_pb2.OutputShape()
wt_output_shape.dims.extend([16, filter_shape[2], wt_output_width, 1]) wt_output_width = output_shape[0] * ((output_shape[1] + 1) / 2) * ((
wt_op.output_shape.extend([wt_output_shape]) output_shape[2] + 1) / 2)
wt_output_shape.dims.extend([16, filter_shape[2], wt_output_width, 1])
# MatMul wt_op.output_shape.extend([wt_output_shape])
matmul_op = mace_pb2.OperatorDef()
arg = matmul_op.arg.add() # MatMul
arg.name = 'T' matmul_op = mace_pb2.OperatorDef()
arg.i = self.dt arg = matmul_op.arg.add()
matmul_op.name = op.name + '_matmul' arg.name = 'T'
matmul_op.type = 'MatMul' arg.i = self.dt
matmul_op.input.extend([filter_name, wt_output_name]) matmul_op.name = op.name + '_matmul'
matmul_output_name = matmul_op.name + ":0" matmul_op.type = 'MatMul'
matmul_op.output.extend([matmul_output_name]) matmul_op.input.extend([filter_name, wt_output_name])
matmul_output_shape = mace_pb2.OutputShape() matmul_output_name = matmul_op.name + ":0"
matmul_output_shape.dims.extend([16, filter_shape[3], wt_output_width, 1]) matmul_op.output.extend([matmul_output_name])
matmul_op.output_shape.extend([matmul_output_shape]) matmul_output_shape = mace_pb2.OutputShape()
matmul_output_shape.dims.extend(
# Inverse transform [16, filter_shape[3], wt_output_width, 1])
iwt_op = mace_pb2.OperatorDef() matmul_op.output_shape.extend([matmul_output_shape])
arg = iwt_op.arg.add()
arg.name = 'T' # Inverse transform
arg.i = self.dt iwt_op = mace_pb2.OperatorDef()
batch_arg = iwt_op.arg.add() arg = iwt_op.arg.add()
batch_arg.name = 'batch' arg.name = 'T'
batch_arg.i = output_shape[0] arg.i = self.dt
height_arg = iwt_op.arg.add() batch_arg = iwt_op.arg.add()
height_arg.name = 'height' batch_arg.name = 'batch'
height_arg.i = output_shape[1] batch_arg.i = output_shape[0]
width_arg = iwt_op.arg.add() height_arg = iwt_op.arg.add()
width_arg.name = 'width' height_arg.name = 'height'
width_arg.i = output_shape[2] height_arg.i = output_shape[1]
iwt_op.name = op.name + '_inverse_transform' width_arg = iwt_op.arg.add()
iwt_op.type = 'WinogradInverseTransform' width_arg.name = 'width'
iwt_op.input.extend([matmul_output_name]) width_arg.i = output_shape[2]
iwt_op.name = op.name + '_inverse_transform'
final_op = op iwt_op.type = 'WinogradInverseTransform'
self.resolved_ops[op.name] = 1 iwt_op.input.extend([matmul_output_name])
if len(self.tf_graph[op.name]) == 1 and self.tf_graph[op.name][0].type == 'BiasAdd' : final_op = op
bias_add_op = self.tf_graph[op.name][0] self.resolved_ops[op.name] = 1
output_name = self.add_buffer_to_image(get_input_tensor(bias_add_op, 1).name, "ARGUMENT")
iwt_op.input.extend([output_name]) if len(self.tf_graph[op.name]
final_op = bias_add_op ) == 1 and self.tf_graph[op.name][0].type == 'BiasAdd':
self.resolved_ops[bias_add_op.name] = 1 bias_add_op = self.tf_graph[op.name][0]
output_name = self.add_buffer_to_image(
if len(self.tf_graph[final_op.name]) == 1 \ get_input_tensor(bias_add_op, 1).name, "ARGUMENT")
and self.tf_graph[final_op.name][0].type in activation_name_map: iwt_op.input.extend([output_name])
activation_op = self.tf_graph[final_op.name][0] final_op = bias_add_op
fused_act_arg = iwt_op.arg.add() self.resolved_ops[bias_add_op.name] = 1
fused_act_arg.name = 'activation'
fused_act_arg.s = activation_name_map[activation_op.type] if len(self.tf_graph[final_op.name]) == 1 and \
if activation_op.type == 'Relu6': self.tf_graph[final_op.name][0].type in activation_name_map:
max_limit_arg = iwt_op.arg.add() activation_op = self.tf_graph[final_op.name][0]
max_limit_arg.name = 'max_limit' fused_act_arg = iwt_op.arg.add()
max_limit_arg.f = 6 fused_act_arg.name = 'activation'
final_op = activation_op fused_act_arg.s = activation_name_map[activation_op.type]
self.resolved_ops[activation_op.name] = 1 if activation_op.type == 'Relu6':
max_limit_arg = iwt_op.arg.add()
iwt_op.output.extend([output.name for output in final_op.outputs]) max_limit_arg.name = 'max_limit'
self.add_output_shape(final_op.outputs, iwt_op) max_limit_arg.f = 6
self.net_def.op.extend([wt_op, matmul_op, iwt_op]) final_op = activation_op
self.resolved_ops[activation_op.name] = 1
def convert_conv2d(self, op): iwt_op.output.extend([output.name for output in final_op.outputs])
op_def = mace_pb2.OperatorDef() self.add_output_shape(final_op.outputs, iwt_op)
arg = op_def.arg.add() self.net_def.op.extend([wt_op, matmul_op, iwt_op])
arg.name = 'T'
arg.i = self.dt def convert_conv2d(self, op):
op_def.name = op.name op_def = mace_pb2.OperatorDef()
if op.type == 'DepthwiseConv2dNative': arg = op_def.arg.add()
op_def.type = 'DepthwiseConv2d' arg.name = 'T'
if self.device == 'neon': arg.i = self.dt
self.transpose_filter_tensor[get_input_tensor(op, 1).name] = (3, 2, 0, 1) op_def.name = op.name
else: if op.type == 'DepthwiseConv2dNative':
op_def.type = op.type op_def.type = 'DepthwiseConv2d'
if self.device == 'neon': if self.device == 'neon':
self.transpose_filter_tensor[get_input_tensor(op, 1).name] = (3, 2, 0, 1) self.transpose_filter_tensor[get_input_tensor(
else: op, 1).name] = (3, 2, 0, 1)
self.transpose_filter_tensor[get_input_tensor(op, 1).name] = (0, 1, 3, 2) else:
if self.device == 'gpu': op_def.type = op.type
op_def.input.extend([op.inputs[0].name]) if self.device == 'neon':
buffer_type = "DW_CONV2D_FILTER" if op_def.type == 'DepthwiseConv2d' else "CONV2D_FILTER" self.transpose_filter_tensor[get_input_tensor(
output_name = self.add_buffer_to_image(get_input_tensor(op, 1).name, buffer_type) op, 1).name] = (3, 2, 0, 1)
op_def.input.extend([output_name]) else:
else: self.transpose_filter_tensor[get_input_tensor(
op_def.input.extend([get_input_tensor(op, i).name for i in range(len(op.inputs))]) op, 1).name] = (0, 1, 3, 2)
if self.device == 'gpu':
padding_arg = op_def.arg.add() op_def.input.extend([op.inputs[0].name])
padding_arg.name = 'padding' if op_def.type == 'DepthwiseConv2d':
padding_arg.i = padding_mode[op.get_attr('padding')] buffer_type = "DW_CONV2D_FILTER"
strides_arg = op_def.arg.add() else:
strides_arg.name = 'strides' buffer_type = "CONV2D_FILTER"
strides_arg.ints.extend(op.get_attr('strides')[1:3]) output_name = self.add_buffer_to_image(
data_format_arg = op_def.arg.add() get_input_tensor(op, 1).name, buffer_type)
data_format_arg.name = 'data_format' op_def.input.extend([output_name])
if self.device == 'neon': else:
data_format_arg.s = 'NCHW' op_def.input.extend(
else: [get_input_tensor(op, i).name for i in range(len(op.inputs))])
data_format_arg.s = 'NHWC'
final_op = op padding_arg = op_def.arg.add()
self.resolved_ops[op.name] = 1 padding_arg.name = 'padding'
padding_arg.i = padding_mode[op.get_attr('padding')]
if len(self.tf_graph.get(op.name, [])) == 1 and self.tf_graph[op.name][0].type == 'BiasAdd': strides_arg = op_def.arg.add()
bias_add_op = self.tf_graph[op.name][0] strides_arg.name = 'strides'
if self.device == 'gpu': strides_arg.ints.extend(op.get_attr('strides')[1:3])
output_name = self.add_buffer_to_image(get_input_tensor(bias_add_op, 1).name, "ARGUMENT") data_format_arg = op_def.arg.add()
op_def.input.extend([output_name]) data_format_arg.name = 'data_format'
else: if self.device == 'neon':
op_def.input.extend([get_input_tensor(bias_add_op, 1).name]) data_format_arg.s = 'NCHW'
final_op = bias_add_op else:
self.resolved_ops[bias_add_op.name] = 1 data_format_arg.s = 'NHWC'
final_op = op
if len(self.tf_graph.get(final_op.name, [])) == 1 \ self.resolved_ops[op.name] = 1
and self.tf_graph[final_op.name][0].type in activation_name_map:
activation_op = self.tf_graph[final_op.name][0] if len(self.tf_graph.get(op.name, [])) == 1 and \
if op_def.type == "Conv2D": self.tf_graph[op.name][0].type == 'BiasAdd':
op_def.type = "FusedConv2D" bias_add_op = self.tf_graph[op.name][0]
fused_act_arg = op_def.arg.add() if self.device == 'gpu':
fused_act_arg.name = 'activation' output_name = self.add_buffer_to_image(
fused_act_arg.s = activation_name_map[activation_op.type] get_input_tensor(bias_add_op, 1).name, "ARGUMENT")
if activation_op.type == 'Relu6': op_def.input.extend([output_name])
max_limit_arg = op_def.arg.add() else:
max_limit_arg.name = 'max_limit' op_def.input.extend([get_input_tensor(bias_add_op, 1).name])
max_limit_arg.f = 6 final_op = bias_add_op
final_op = activation_op self.resolved_ops[bias_add_op.name] = 1
self.resolved_ops[activation_op.name] = 1
if len(self.tf_graph.get(final_op.name, [])) == 1 and \
op_def.output.extend([output.name for output in final_op.outputs]) self.tf_graph[final_op.name][0].type in activation_name_map:
self.add_output_shape(final_op.outputs, op_def) activation_op = self.tf_graph[final_op.name][0]
self.net_def.op.extend([op_def]) if op_def.type == "Conv2D":
op_def.type = "FusedConv2D"
def convert_fused_batchnorm(self, op): fused_act_arg = op_def.arg.add()
op_def = mace_pb2.OperatorDef() fused_act_arg.name = 'activation'
arg = op_def.arg.add() fused_act_arg.s = activation_name_map[activation_op.type]
arg.name = 'T' if activation_op.type == 'Relu6':
arg.i = self.dt max_limit_arg = op_def.arg.add()
data_format_arg = op_def.arg.add() max_limit_arg.name = 'max_limit'
data_format_arg.name = 'data_format' max_limit_arg.f = 6
if self.device == 'neon': final_op = activation_op
data_format_arg.s = 'NCHW' self.resolved_ops[activation_op.name] = 1
else:
data_format_arg.s = 'NHWC' op_def.output.extend([output.name for output in final_op.outputs])
op_def.name = op.name self.add_output_shape(final_op.outputs, op_def)
op_def.type = 'FoldedBatchNorm' self.net_def.op.extend([op_def])
gamma_tensor = get_input_tensor(op, 1) def convert_fused_batchnorm(self, op):
for i in range(1, 5): op_def = mace_pb2.OperatorDef()
input_tensor = get_input_tensor(op, i) arg = op_def.arg.add()
assert input_tensor.shape == gamma_tensor.shape arg.name = 'T'
self.unused_tensor.add(input_tensor.name) arg.i = self.dt
data_format_arg = op_def.arg.add()
gamma_value = get_input_tensor(op, 1).eval().astype(np.float32) data_format_arg.name = 'data_format'
beta_value = get_input_tensor(op, 2).eval().astype(np.float32) if self.device == 'neon':
mean_value = get_input_tensor(op, 3).eval().astype(np.float32) data_format_arg.s = 'NCHW'
var_value = get_input_tensor(op, 4).eval().astype(np.float32) else:
epsilon_value = op.get_attr('epsilon') data_format_arg.s = 'NHWC'
op_def.name = op.name
scale_value = ( op_def.type = 'FoldedBatchNorm'
(1.0 / np.vectorize(math.sqrt)(var_value + epsilon_value)) *
gamma_value) gamma_tensor = get_input_tensor(op, 1)
offset_value = (-mean_value * scale_value) + beta_value for i in range(1, 5):
idx = gamma_tensor.name.rfind('/') input_tensor = get_input_tensor(op, i)
name_prefix = gamma_tensor.name[:idx] + '/' assert input_tensor.shape == gamma_tensor.shape
input_names = [name_prefix+'scale:0', name_prefix+'offset:0'] self.unused_tensor.add(input_tensor.name)
self.add_tensor(input_names[0], gamma_value.shape,
gamma_tensor.dtype, scale_value) gamma_value = get_input_tensor(op, 1).eval().astype(np.float32)
self.add_tensor(input_names[1], gamma_value.shape, beta_value = get_input_tensor(op, 2).eval().astype(np.float32)
gamma_tensor.dtype, offset_value) mean_value = get_input_tensor(op, 3).eval().astype(np.float32)
var_value = get_input_tensor(op, 4).eval().astype(np.float32)
op_def.input.extend([op.inputs[0].name]) epsilon_value = op.get_attr('epsilon')
if self.device == 'gpu':
for name in input_names: scale_value = ((1.0 / np.vectorize(math.sqrt)
output_name = self.add_buffer_to_image(name, "ARGUMENT") (var_value + epsilon_value)) * gamma_value)
op_def.input.extend([output_name]) offset_value = (-mean_value * scale_value) + beta_value
else: idx = gamma_tensor.name.rfind('/')
op_def.input.extend([name for name in input_names]) name_prefix = gamma_tensor.name[:idx] + '/'
input_names = [name_prefix + 'scale:0', name_prefix + 'offset:0']
self.resolved_ops[op.name] = 1 self.add_tensor(input_names[0], gamma_value.shape, gamma_tensor.dtype,
final_op = op scale_value)
self.add_tensor(input_names[1], gamma_value.shape, gamma_tensor.dtype,
if len(self.tf_graph[op.name]) == 1 \ offset_value)
and self.tf_graph[op.name][0].type in activation_name_map:
activation_op = self.tf_graph[op.name][0] op_def.input.extend([op.inputs[0].name])
fused_act_arg = op_def.arg.add() if self.device == 'gpu':
fused_act_arg.name = 'activation' for name in input_names:
fused_act_arg.s = activation_name_map[activation_op.type] output_name = self.add_buffer_to_image(name, "ARGUMENT")
if activation_op.type == 'Relu6': op_def.input.extend([output_name])
else:
op_def.input.extend([name for name in input_names])
self.resolved_ops[op.name] = 1
final_op = op
if len(self.tf_graph[op.name]) == 1 \
and self.tf_graph[op.name][0].type in activation_name_map:
activation_op = self.tf_graph[op.name][0]
fused_act_arg = op_def.arg.add()
fused_act_arg.name = 'activation'
fused_act_arg.s = activation_name_map[activation_op.type]
if activation_op.type == 'Relu6':
max_limit_arg = op_def.arg.add()
max_limit_arg.name = 'max_limit'
max_limit_arg.f = 6
final_op = activation_op
self.resolved_ops[activation_op.name] = 1
op_def.output.extend([final_op.outputs[0].name])
self.add_output_shape([final_op.outputs[0]], op_def)
self.net_def.op.extend([op_def])
def convert_batchnorm(self, op):
bn_ops = []
bn_ops.append(op)
for i in range(1, 3):
if len(self.tf_graph[bn_ops[i-1].name]) == 1 and \
self.tf_graph[bn_ops[i-1].name][0].type == BATCH_NORM_ORDER[i]:
bn_ops.append(self.tf_graph[bn_ops[i - 1].name][0])
else:
raise Exception('Invalid BatchNorm Op')
if len(self.tf_graph[bn_ops[2].name]) == 2 and \
self.tf_graph[bn_ops[2].name][0].type == \
BATCH_NORM_ORDER[3] and \
self.tf_graph[bn_ops[2].name][1].type == BATCH_NORM_ORDER[4]:
bn_ops.append(self.tf_graph[bn_ops[2].name][0])
bn_ops.append(self.tf_graph[bn_ops[2].name][1])
else:
raise Exception('Invalid BatchNorm Op')
bn_ops.append(self.tf_graph[bn_ops[4].name][0])
bn_ops.append(self.tf_graph[bn_ops[3].name][0])
op_def = mace_pb2.OperatorDef()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
input_name = get_input_tensor(bn_ops[3], 0).name
gamma = get_input_tensor(bn_ops[2], 1).name
beta = get_input_tensor(bn_ops[5], 0).name
mean = get_input_tensor(bn_ops[4], 0).name
variance = get_input_tensor(bn_ops[0], 0).name
op_def.name = op.name[:-4] # remove /add
op_def.type = 'BatchNorm'
if self.device == 'gpu':
op_def.input.extend([input_name])
for tensor_name in [gamma, beta, mean, variance]:
output_name = self.add_buffer_to_image(tensor_name, "ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([input_name, gamma, beta, mean, variance])
op_def.output.extend([output.name for output in bn_ops[6].outputs])
self.add_output_shape(bn_ops[6].outputs, op_def)
epsilon_arg = op_def.arg.add()
epsilon_arg.name = 'epsilon'
epsilon_arg.f = get_input_tensor(op, 1).eval().astype(np.float)
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
if self.device == 'neon':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
self.unused_tensor.add(get_input_tensor(op, 1).name)
self.net_def.op.extend([op_def])
for i in range(0, 7):
self.resolved_ops[bn_ops[i].name] = 1
def convert_pooling(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = 'Pooling'
op_def.input.extend([input.name for input in op.inputs])
op_def.output.extend([output.name for output in op.outputs])
self.add_output_shape(op.outputs, op_def)
pooling_type_arg = op_def.arg.add()
pooling_type_arg.name = 'pooling_type'
pooling_type_arg.i = pooling_type_mode[op.type]
padding_arg = op_def.arg.add()
padding_arg.name = 'padding'
padding_arg.i = padding_mode[op.get_attr('padding')]
strides_arg = op_def.arg.add()
strides_arg.name = 'strides'
strides_arg.ints.extend(op.get_attr('strides')[1:3])
kernels_arg = op_def.arg.add()
kernels_arg.name = 'kernels'
kernels_arg.ints.extend(op.get_attr('ksize')[1:3])
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
if self.device == 'neon':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
self.resolved_ops[op.name] = 1
def convert_global_avg_pooling(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = 'Pooling'
op_def.input.extend([op.inputs[0].name])
op_def.output.extend([output.name for output in op.outputs])
self.add_output_shape(op.outputs, op_def)
pooling_type_arg = op_def.arg.add()
pooling_type_arg.name = 'pooling_type'
pooling_type_arg.i = pooling_type_mode['AvgPool']
padding_arg = op_def.arg.add()
padding_arg.name = 'padding'
padding_arg.i = padding_mode['VALID']
strides_arg = op_def.arg.add()
strides_arg.name = 'strides'
strides_arg.ints.extend([1, 1])
kernels_arg = op_def.arg.add()
kernels_arg.name = 'kernels'
kernels_arg.ints.extend(op.inputs[0].shape.as_list()[1:3])
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
if self.device == 'neon':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
self.resolved_ops[op.name] = 1
def convert_activation(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = 'Activation'
activation_arg = op_def.arg.add()
activation_arg.name = 'activation'
activation_arg.s = activation_name_map[op.type]
op_def.input.extend([input.name for input in op.inputs])
op_def.output.extend([output.name for output in op.outputs])
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
def convert_relu6(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = 'Activation'
op_def.input.extend([input.name for input in op.inputs])
op_def.output.extend([output.name for output in op.outputs])
self.add_output_shape(op.outputs, op_def)
activation_arg = op_def.arg.add()
activation_arg.name = 'activation'
activation_arg.s = "RELUX"
max_limit_arg = op_def.arg.add() max_limit_arg = op_def.arg.add()
max_limit_arg.name = 'max_limit' max_limit_arg.name = 'max_limit'
max_limit_arg.f = 6 max_limit_arg.f = 6
final_op = activation_op self.resolved_ops[op.name] = 1
self.resolved_ops[activation_op.name] = 1
def convert_add(self, op):
op_def.output.extend([final_op.outputs[0].name]) op_def = self.net_def.op.add()
self.add_output_shape([final_op.outputs[0]], op_def) arg = op_def.arg.add()
arg.name = 'T'
self.net_def.op.extend([op_def]) arg.i = self.dt
op_def.name = op.name
def convert_batchnorm(self, op): op_def.type = "AddN"
bn_ops = []
bn_ops.append(op)
for i in range(1, 3):
if len(self.tf_graph[bn_ops[i-1].name]) == 1 \
and self.tf_graph[bn_ops[i-1].name][0].type == BATCH_NORM_ORDER[i]:
bn_ops.append(self.tf_graph[bn_ops[i-1].name][0])
else:
raise Exception('Invalid BatchNorm Op')
if len(self.tf_graph[bn_ops[2].name]) == 2 \
and self.tf_graph[bn_ops[2].name][0].type == BATCH_NORM_ORDER[3] \
and self.tf_graph[bn_ops[2].name][1].type == BATCH_NORM_ORDER[4]:
bn_ops.append(self.tf_graph[bn_ops[2].name][0])
bn_ops.append(self.tf_graph[bn_ops[2].name][1])
else:
raise Exception('Invalid BatchNorm Op')
bn_ops.append(self.tf_graph[bn_ops[4].name][0])
bn_ops.append(self.tf_graph[bn_ops[3].name][0])
op_def = mace_pb2.OperatorDef()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
input_name = get_input_tensor(bn_ops[3], 0).name
gamma = get_input_tensor(bn_ops[2], 1).name
beta = get_input_tensor(bn_ops[5], 0).name
mean = get_input_tensor(bn_ops[4], 0).name
variance = get_input_tensor(bn_ops[0], 0).name
op_def.name = op.name[:-4] # remove /add
op_def.type = 'BatchNorm'
if self.device == 'gpu':
op_def.input.extend([input_name])
for tensor_name in [gamma, beta, mean, variance]:
output_name = self.add_buffer_to_image(tensor_name, "ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([input_name, gamma, beta, mean, variance])
op_def.output.extend([output.name for output in bn_ops[6].outputs])
self.add_output_shape(bn_ops[6].outputs, op_def)
epsilon_arg = op_def.arg.add()
epsilon_arg.name = 'epsilon'
epsilon_arg.f = get_input_tensor(op, 1).eval().astype(np.float)
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
if self.device == 'neon':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
self.unused_tensor.add(get_input_tensor(op, 1).name)
self.net_def.op.extend([op_def])
for i in range(0, 7):
self.resolved_ops[bn_ops[i].name] = 1
def convert_pooling(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = 'Pooling'
op_def.input.extend([input.name for input in op.inputs])
op_def.output.extend([output.name for output in op.outputs])
self.add_output_shape(op.outputs, op_def)
pooling_type_arg = op_def.arg.add()
pooling_type_arg.name = 'pooling_type'
pooling_type_arg.i = pooling_type_mode[op.type]
padding_arg = op_def.arg.add()
padding_arg.name = 'padding'
padding_arg.i = padding_mode[op.get_attr('padding')]
strides_arg = op_def.arg.add()
strides_arg.name = 'strides'
strides_arg.ints.extend(op.get_attr('strides')[1:3])
kernels_arg = op_def.arg.add()
kernels_arg.name = 'kernels'
kernels_arg.ints.extend(op.get_attr('ksize')[1:3])
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
if self.device == 'neon':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
self.resolved_ops[op.name] = 1
def convert_global_avg_pooling(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = 'Pooling'
op_def.input.extend([op.inputs[0].name])
op_def.output.extend([output.name for output in op.outputs])
self.add_output_shape(op.outputs, op_def)
pooling_type_arg = op_def.arg.add()
pooling_type_arg.name = 'pooling_type'
pooling_type_arg.i = pooling_type_mode['AvgPool']
padding_arg = op_def.arg.add()
padding_arg.name = 'padding'
padding_arg.i = padding_mode['VALID']
strides_arg = op_def.arg.add()
strides_arg.name = 'strides'
strides_arg.ints.extend([1, 1])
kernels_arg = op_def.arg.add()
kernels_arg.name = 'kernels'
kernels_arg.ints.extend(op.inputs[0].shape.as_list()[1:3])
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
if self.device == 'neon':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
self.resolved_ops[op.name] = 1
def convert_activation(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = 'Activation'
activation_arg = op_def.arg.add()
activation_arg.name = 'activation'
activation_arg.s = activation_name_map[op.type]
op_def.input.extend([input.name for input in op.inputs])
op_def.output.extend([output.name for output in op.outputs])
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
def convert_relu6(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = 'Activation'
op_def.input.extend([input.name for input in op.inputs])
op_def.output.extend([output.name for output in op.outputs])
self.add_output_shape(op.outputs, op_def)
activation_arg = op_def.arg.add()
activation_arg.name = 'activation'
activation_arg.s = "RELUX"
max_limit_arg = op_def.arg.add()
max_limit_arg.name = 'max_limit'
max_limit_arg.f = 6
self.resolved_ops[op.name] = 1
def convert_add(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = "AddN"
op_def.input.extend([input.name for input in op.inputs])
op_def.output.extend([output.name for output in op.outputs])
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
def convert_concat(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = "Concat"
op_def.input.extend([input.name for input in op.inputs[:-1]])
op_def.output.extend([output.name for output in op.outputs])
axis_arg = op_def.arg.add()
axis_arg.name = 'axis'
axis = get_input_tensor(op, len(op.inputs) - 1).eval().astype(np.int32)
if self.device == 'neon' and axis == 3:
axis = 1
axis_arg.i = axis
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
self.unused_tensor.add(get_input_tensor(op, len(op.inputs) - 1).name)
def convert_resize_bilinear(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = "ResizeBilinear"
op_def.input.extend([op.inputs[0].name])
op_def.output.extend([output.name for output in op.outputs])
size_arg = op_def.arg.add()
size_arg.name = 'size'
size_arg.ints.extend(get_input_tensor(op, 1).eval().astype(np.int32).flat)
size_arg = op_def.arg.add()
size_arg.name = 'align_corners'
size_arg.i = op.get_attr('align_corners')
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
self.unused_tensor.add(get_input_tensor(op, 1).name)
def convert_math(self, op, math_type):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
if len(op.inputs) == 1:
op_def.type = "CWise"
op_def.input.extend([input.name for input in op.inputs])
x_arg = op_def.arg.add()
x_arg.name = 'x'
x_arg.f = 0
elif len(op.inputs) >= 2:
input_tensor0 = get_input_tensor(op, 0)
input_tensor1 = get_input_tensor(op, 1)
if input_tensor0.shape == input_tensor1.shape:
op_def.type = "Eltwise"
op_def.input.extend([input.name for input in op.inputs]) op_def.input.extend([input.name for input in op.inputs])
else: op_def.output.extend([output.name for output in op.outputs])
op_def.type = "CWise" self.add_output_shape(op.outputs, op_def)
x_value = 0 self.resolved_ops[op.name] = 1
if len(input_tensor1.shape)==4:
op_def.input.extend([op.inputs[1].name]) def convert_concat(self, op):
x_value = get_input_tensor(op, 0).eval().astype(np.float32) op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = "Concat"
op_def.input.extend([input.name for input in op.inputs[:-1]])
op_def.output.extend([output.name for output in op.outputs])
axis_arg = op_def.arg.add()
axis_arg.name = 'axis'
axis = get_input_tensor(op, len(op.inputs) - 1).eval().astype(np.int32)
if self.device == 'neon' and axis == 3:
axis = 1
axis_arg.i = axis
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
self.unused_tensor.add(get_input_tensor(op, len(op.inputs) - 1).name)
def convert_resize_bilinear(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = "ResizeBilinear"
op_def.input.extend([op.inputs[0].name])
op_def.output.extend([output.name for output in op.outputs])
size_arg = op_def.arg.add()
size_arg.name = 'size'
size_arg.ints.extend(
get_input_tensor(op, 1).eval().astype(np.int32).flat)
size_arg = op_def.arg.add()
size_arg.name = 'align_corners'
size_arg.i = op.get_attr('align_corners')
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
self.unused_tensor.add(get_input_tensor(op, 1).name)
def convert_math(self, op, math_type):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
if len(op.inputs) == 1:
op_def.type = "CWise"
op_def.input.extend([input.name for input in op.inputs])
x_arg = op_def.arg.add()
x_arg.name = 'x'
x_arg.f = 0
elif len(op.inputs) >= 2:
input_tensor0 = get_input_tensor(op, 0)
input_tensor1 = get_input_tensor(op, 1)
if input_tensor0.shape == input_tensor1.shape:
op_def.type = "Eltwise"
op_def.input.extend([input.name for input in op.inputs])
else:
op_def.type = "CWise"
x_value = 0
if len(input_tensor1.shape) == 4:
op_def.input.extend([op.inputs[1].name])
x_value = get_input_tensor(op, 0).eval().astype(np.float32)
else:
op_def.input.extend([op.inputs[0].name])
x_value = get_input_tensor(op, 1).eval().astype(np.float32)
x_arg = op_def.arg.add()
x_arg.name = 'x'
x_arg.f = x_value
type_arg = op_def.arg.add()
type_arg.name = 'type'
type_arg.i = math_type_mode[math_type]
op_def.output.extend([output.name for output in op.outputs])
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
def convert_depth_to_space(self, op, d2s):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = op.type
op_def.input.extend([op.inputs[0].name])
op_def.output.extend([output.name for output in op.outputs])
size_arg = op_def.arg.add()
size_arg.name = 'block_size'
size_arg.i = op.get_attr('block_size')
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
def convert_bias_add(self, op):
op_def = mace_pb2.OperatorDef()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = "BiasAdd"
op_def.input.extend([op.inputs[0].name])
if self.device == 'gpu':
output_name = self.add_buffer_to_image(
get_input_tensor(op, 1).name, "ARGUMENT")
op_def.input.extend([output_name])
else: else:
op_def.input.extend([op.inputs[0].name]) op_def.input.extend([get_input_tensor(op, 1).name])
x_value = get_input_tensor(op, 1).eval().astype(np.float32) op_def.output.extend([output.name for output in op.outputs])
x_arg = op_def.arg.add() self.add_output_shape(op.outputs, op_def)
x_arg.name = 'x' self.net_def.op.extend([op_def])
x_arg.f = x_value
type_arg = op_def.arg.add()
type_arg.name = 'type'
type_arg.i = math_type_mode[math_type]
op_def.output.extend([output.name for output in op.outputs])
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
def convert_depth_to_space(self, op, d2s):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = op.type
op_def.input.extend([op.inputs[0].name])
op_def.output.extend([output.name for output in op.outputs])
size_arg = op_def.arg.add()
size_arg.name = 'block_size'
size_arg.i = op.get_attr('block_size')
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
def convert_bias_add(self, op):
op_def = mace_pb2.OperatorDef()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = "BiasAdd"
op_def.input.extend([op.inputs[0].name])
if self.device == 'gpu':
output_name = self.add_buffer_to_image(get_input_tensor(op, 1).name, "ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([get_input_tensor(op, 1).name])
op_def.output.extend([output.name for output in op.outputs])
self.add_output_shape(op.outputs, op_def)
self.net_def.op.extend([op_def])
self.resolved_ops[op.name] = 1
def convert_space_to_batch(self, op, b2s):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = op.type
op_def.input.extend([op.inputs[0].name])
op_def.output.extend([output.name for output in op.outputs])
size_arg = op_def.arg.add()
size_arg.name = 'block_shape'
size_arg.ints.extend(get_input_tensor(op, 1).eval().astype(np.int32).flat)
size_arg = op_def.arg.add()
if b2s:
size_arg.name = 'crops'
else:
size_arg.name = 'paddings'
size_arg.ints.extend(get_input_tensor(op, 2).eval().astype(np.int32).flat)
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
self.unused_tensor.add(get_input_tensor(op, 1).name)
self.unused_tensor.add(get_input_tensor(op, 2).name)
def is_atrous_conv2d(self, op):
return op.type == 'SpaceToBatchND' and\
len(self.tf_graph[op.name]) == 1 and self.tf_graph[op.name][0].type == 'Conv2D'
def convert_atrous_conv2d(self, op):
op_def = mace_pb2.OperatorDef()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
conv_op = self.tf_graph[op.name][0]
op_def.name = conv_op.name
op_def.type = conv_op.type
self.transpose_filter_tensor[get_input_tensor(conv_op, 1).name] = (0, 1, 3, 2)
if self.device == 'gpu':
op_def.input.extend([op.inputs[0].name])
output_name = self.add_buffer_to_image(get_input_tensor(conv_op, 1).name, "CONV2D_FILTER")
op_def.input.extend([output_name])
else:
op_def.input.extend([get_input_tensor(op, 0).name])
op_def.input.extend([get_input_tensor(conv_op, 1).name])
dilation_arg = op_def.arg.add()
dilation_arg.name = 'dilations'
dilation_arg.ints.extend(get_input_tensor(op, 1).eval().astype(np.int32).flat)
padding_arg = op_def.arg.add()
padding_arg.name = 'padding'
padding_values = get_input_tensor(op, 2).eval().astype(np.int32).flat
if len(padding_values) > 0 and padding_values[0] > 0:
padding_arg.i = padding_mode['SAME']
else:
padding_arg.i = padding_mode['VALID']
self.unused_tensor.add(get_input_tensor(op, 1).name)
self.unused_tensor.add(get_input_tensor(op, 2).name)
strides_arg = op_def.arg.add()
strides_arg.name = 'strides'
strides_arg.ints.extend([1, 1])
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
if self.device == 'neon':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
final_op = conv_op
self.resolved_ops[op.name] = 1
self.resolved_ops[conv_op.name] = 1
if len(self.tf_graph[final_op.name]) == 1 and self.tf_graph[final_op.name][0].type == 'BiasAdd' :
bias_add_op = self.tf_graph[final_op.name][0]
if self.device == 'gpu':
output_name = self.add_buffer_to_image(get_input_tensor(bias_add_op, 1).name, "ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([get_input_tensor(bias_add_op, 1).name])
final_op = bias_add_op
self.resolved_ops[bias_add_op.name] = 1
if len(self.tf_graph[final_op.name]) == 1 \
and self.tf_graph[final_op.name][0].type == 'BatchToSpaceND':
final_op = self.tf_graph[final_op.name][0]
self.resolved_ops[final_op.name] = 1
self.unused_tensor.add(get_input_tensor(final_op, 1).name)
self.unused_tensor.add(get_input_tensor(final_op, 2).name)
else:
raise Exception('Convert atrous conv error: no BatchToSpaceND op')
if len(self.tf_graph[final_op.name]) == 1 \
and self.tf_graph[final_op.name][0].type == 'Relu':
relu_op = self.tf_graph[final_op.name][0]
op_def.type = "FusedConv2D"
fused_relu_arg = op_def.arg.add()
fused_relu_arg.name = 'activation'
fused_relu_arg.s = "RELU"
final_op = relu_op
self.resolved_ops[relu_op.name] = 1
op_def.output.extend([output.name for output in final_op.outputs])
self.add_output_shape(final_op.outputs, op_def)
self.net_def.op.extend([op_def])
def is_softmax(self, op):
return op.type == 'Softmax' and \
len(self.tf_parents[op.name]) == 1 and self.tf_parents[op.name][0].type == 'Reshape' and \
len(self.tf_graph[op.name]) == 1 and self.tf_graph[op.name][0].type == 'Reshape'
def convert_softmax(self, softmax_op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
# deal with first Reshape op
parent_reshape_op = self.tf_parents[softmax_op.name][0]
self.unused_tensor.add(get_input_tensor(parent_reshape_op, 1).name)
self.resolved_ops[parent_reshape_op.name] = 1
# FIXME: hardcode for inception_v3
# remove squeeze if exist
squeeze_op = self.tf_parents[parent_reshape_op.name][0]
if squeeze_op.type == 'Squeeze':
op_def.input.extend([squeeze_op.inputs[0].name])
self.resolved_ops[squeeze_op.name] = 1
# remove shape if exist
children_ops = self.tf_graph[squeeze_op.name]
print children_ops
if len(children_ops) > 1 and children_ops[0].type == 'Shape':
self.unused_tensor.add(get_input_tensor(children_ops[1], 0).name)
self.resolved_ops[children_ops[1].name] = 1
else:
op_def.input.extend([parent_reshape_op.inputs[0].name])
# deal with Softmax op
op_def.name = softmax_op.name
op_def.type = softmax_op.type
self.resolved_ops[softmax_op.name] = 1
# deal with last Reshape op
reshape_op = self.tf_graph[softmax_op.name][0]
self.unused_tensor.add(get_input_tensor(reshape_op, 1).name)
shape = [dim.value for dim in reshape_op.outputs[0].shape]
if len(shape) == 2:
shape = [1, 1, shape[0], shape[1]]
op_def.output.extend([output.name for output in reshape_op.outputs])
self.add_output_shape([shape], op_def)
self.resolved_ops[reshape_op.name] = 1
def convert_normal_op(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = op.type
op_def.input.extend([input.name for input in op.inputs])
op_def.output.extend([output.name for output in op.outputs])
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
def replace_in_out_name(self, input_names, output_names):
in_names = set([input_name + ":0" for input_name in input_names])
out_names = set([output_name + ":0" for output_name in output_names])
for op in self.net_def.op:
if op.input[0] in in_names:
op.input[0] = MACE_INPUT_NODE_NAME + '_' + op.input[0]
if op.output[0] in out_names:
op.output[0] = MACE_OUTPUT_NODE_NAME + '_' + op.output[0]
def convert(self, input_nodes, output_nodes):
if self.device == 'gpu':
self.add_gpu_input_transform(input_nodes)
if self.device == 'neon':
self.add_neon_input_transform(input_nodes)
for op in self.tf_ops:
if self.resolved_ops[op.name] == 1:
continue
if op.type in ['Placeholder', 'Identity']:
self.resolved_ops[op.name] = 1 self.resolved_ops[op.name] = 1
pass
elif op.type == 'Const': def convert_space_to_batch(self, op, b2s):
pass op_def = self.net_def.op.add()
elif op.type == 'Reshape': arg = op_def.arg.add()
self.convert_reshape(op) arg.name = 'T'
elif self.is_atrous_conv2d(op): arg.i = self.dt
self.convert_atrous_conv2d(op) op_def.name = op.name
elif op.type == 'Conv2D' or op.type == 'DepthwiseConv2dNative': op_def.type = op.type
if self.check_winograd_conv(op): op_def.input.extend([op.inputs[0].name])
self.convert_winograd_conv(op) op_def.output.extend([output.name for output in op.outputs])
size_arg = op_def.arg.add()
size_arg.name = 'block_shape'
size_arg.ints.extend(
get_input_tensor(op, 1).eval().astype(np.int32).flat)
size_arg = op_def.arg.add()
if b2s:
size_arg.name = 'crops'
else: else:
self.convert_conv2d(op) size_arg.name = 'paddings'
elif op.type == 'FusedBatchNorm': size_arg.ints.extend(
self.convert_fused_batchnorm(op) get_input_tensor(op, 2).eval().astype(np.int32).flat)
elif op.type == 'Add' and op.name.endswith('batchnorm/add'): self.add_output_shape(op.outputs, op_def)
self.convert_batchnorm(op)
elif op.type == 'AvgPool' or op.type == 'MaxPool':
self.convert_pooling(op)
elif op.type == 'Relu6':
self.convert_relu6(op)
elif op.type == 'Add':
self.convert_add(op)
elif op.type == 'ConcatV2':
self.convert_concat(op)
elif op.type == 'ResizeBilinear':
self.convert_resize_bilinear(op)
elif op.type == 'BiasAdd':
self.convert_bias_add(op)
elif op.type == 'SpaceToBatchND':
self.convert_space_to_batch(op, False)
elif op.type == 'BatchToSpaceND':
self.convert_space_to_batch(op, True)
elif op.type == 'DepthToSpace':
self.convert_depth_to_space(op, True)
elif op.type == 'SpaceToDepth':
self.convert_depth_to_space(op, False)
elif op.type in ['Neg', 'neg', 'Negative', 'negative']:
self.convert_math(op, 'NEG')
elif op.type == 'Mul':
self.convert_math(op, 'MUL')
elif op.type == 'Sub':
self.convert_math(op, 'SUB')
elif self.is_softmax(op):
self.convert_softmax(op)
elif op.type in ['Relu', 'Sigmoid', 'Tanh']:
self.convert_activation(op)
# FIXME: hardcode for inception_v3
elif op.type in ['Squeeze', 'Shape']:
self.resolved_ops[op.name] = 1 self.resolved_ops[op.name] = 1
elif op.type == 'Mean': self.unused_tensor.add(get_input_tensor(op, 1).name)
# Global avg pooling self.unused_tensor.add(get_input_tensor(op, 2).name)
reduce_dims = op.inputs[1].eval()
if reduce_dims[0] == 1 and reduce_dims[1] == 2: def is_atrous_conv2d(self, op):
self.convert_global_avg_pooling(op) return op.type == 'SpaceToBatchND' and \
self.unused_tensor.add(op.inputs[1].name) len(self.tf_graph[op.name]) == 1 and \
self.tf_graph[op.name][0].type == 'Conv2D'
def convert_atrous_conv2d(self, op):
op_def = mace_pb2.OperatorDef()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
conv_op = self.tf_graph[op.name][0]
op_def.name = conv_op.name
op_def.type = conv_op.type
self.transpose_filter_tensor[get_input_tensor(conv_op,
1).name] = (0, 1, 3, 2)
if self.device == 'gpu':
op_def.input.extend([op.inputs[0].name])
output_name = self.add_buffer_to_image(
get_input_tensor(conv_op, 1).name, "CONV2D_FILTER")
op_def.input.extend([output_name])
else:
op_def.input.extend([get_input_tensor(op, 0).name])
op_def.input.extend([get_input_tensor(conv_op, 1).name])
dilation_arg = op_def.arg.add()
dilation_arg.name = 'dilations'
dilation_arg.ints.extend(
get_input_tensor(op, 1).eval().astype(np.int32).flat)
padding_arg = op_def.arg.add()
padding_arg.name = 'padding'
padding_values = get_input_tensor(op, 2).eval().astype(np.int32).flat
if len(padding_values) > 0 and padding_values[0] > 0:
padding_arg.i = padding_mode['SAME']
else: else:
raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type)) padding_arg.i = padding_mode['VALID']
#elif op.type in ['']: self.unused_tensor.add(get_input_tensor(op, 1).name)
# self.convert_normal_op(op) self.unused_tensor.add(get_input_tensor(op, 2).name)
else:
raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type)) strides_arg = op_def.arg.add()
strides_arg.name = 'strides'
strides_arg.ints.extend([1, 1])
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
if self.device == 'neon':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
final_op = conv_op
self.resolved_ops[op.name] = 1
self.resolved_ops[conv_op.name] = 1
if len(self.tf_graph[final_op.name]
) == 1 and self.tf_graph[final_op.name][0].type == 'BiasAdd':
bias_add_op = self.tf_graph[final_op.name][0]
if self.device == 'gpu':
output_name = self.add_buffer_to_image(
get_input_tensor(bias_add_op, 1).name, "ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([get_input_tensor(bias_add_op, 1).name])
final_op = bias_add_op
self.resolved_ops[bias_add_op.name] = 1
if len(self.tf_graph[final_op.name]) == 1 and \
self.tf_graph[final_op.name][0].type == 'BatchToSpaceND':
final_op = self.tf_graph[final_op.name][0]
self.resolved_ops[final_op.name] = 1
self.unused_tensor.add(get_input_tensor(final_op, 1).name)
self.unused_tensor.add(get_input_tensor(final_op, 2).name)
else:
raise Exception('Convert atrous conv error: no BatchToSpaceND op')
if len(self.tf_graph[final_op.name]) == 1 and \
self.tf_graph[final_op.name][0].type == 'Relu':
relu_op = self.tf_graph[final_op.name][0]
op_def.type = "FusedConv2D"
fused_relu_arg = op_def.arg.add()
fused_relu_arg.name = 'activation'
fused_relu_arg.s = "RELU"
final_op = relu_op
self.resolved_ops[relu_op.name] = 1
op_def.output.extend([output.name for output in final_op.outputs])
self.add_output_shape(final_op.outputs, op_def)
self.net_def.op.extend([op_def])
def is_softmax(self, op):
return op.type == 'Softmax' and \
len(self.tf_parents[op.name]) == 1 and \
self.tf_parents[op.name][0].type == 'Reshape' and \
len(self.tf_graph[op.name]) == 1 and \
self.tf_graph[op.name][0].type == 'Reshape'
def convert_softmax(self, softmax_op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
# deal with first Reshape op
parent_reshape_op = self.tf_parents[softmax_op.name][0]
self.unused_tensor.add(get_input_tensor(parent_reshape_op, 1).name)
self.resolved_ops[parent_reshape_op.name] = 1
# FIXME: hardcode for inception_v3
# remove squeeze if exist
squeeze_op = self.tf_parents[parent_reshape_op.name][0]
if squeeze_op.type == 'Squeeze':
op_def.input.extend([squeeze_op.inputs[0].name])
self.resolved_ops[squeeze_op.name] = 1
# remove shape if exist
children_ops = self.tf_graph[squeeze_op.name]
print children_ops
if len(children_ops) > 1 and children_ops[0].type == 'Shape':
self.unused_tensor.add(
get_input_tensor(children_ops[1], 0).name)
self.resolved_ops[children_ops[1].name] = 1
else:
op_def.input.extend([parent_reshape_op.inputs[0].name])
# deal with Softmax op
op_def.name = softmax_op.name
op_def.type = softmax_op.type
self.resolved_ops[softmax_op.name] = 1
# deal with last Reshape op
reshape_op = self.tf_graph[softmax_op.name][0]
self.unused_tensor.add(get_input_tensor(reshape_op, 1).name)
shape = [dim.value for dim in reshape_op.outputs[0].shape]
if len(shape) == 2:
shape = [1, 1, shape[0], shape[1]]
op_def.output.extend([output.name for output in reshape_op.outputs])
self.add_output_shape([shape], op_def)
self.resolved_ops[reshape_op.name] = 1
def convert_normal_op(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = op.type
op_def.input.extend([input.name for input in op.inputs])
op_def.output.extend([output.name for output in op.outputs])
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
for op in self.tf_ops: def replace_in_out_name(self, input_names, output_names):
if self.resolved_ops[op.name] == 1: in_names = set([input_name + ":0" for input_name in input_names])
continue out_names = set([output_name + ":0" for output_name in output_names])
elif op.type == 'Const': for op in self.net_def.op:
self.convert_tensor(op) if op.input[0] in in_names:
else: op.input[0] = MACE_INPUT_NODE_NAME + '_' + op.input[0]
raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type)) if op.output[0] in out_names:
op.output[0] = MACE_OUTPUT_NODE_NAME + '_' + op.output[0]
if self.device == 'gpu': def convert(self, input_nodes, output_nodes):
self.add_gpu_output_transform(output_nodes) if self.device == 'gpu':
self.add_gpu_input_transform(input_nodes)
if self.device == 'neon':
self.add_neon_input_transform(input_nodes)
for op in self.tf_ops:
if self.resolved_ops[op.name] == 1:
continue
if op.type in ['Placeholder', 'Identity']:
self.resolved_ops[op.name] = 1
pass
elif op.type == 'Const':
pass
elif op.type == 'Reshape':
self.convert_reshape(op)
elif self.is_atrous_conv2d(op):
self.convert_atrous_conv2d(op)
elif op.type == 'Conv2D' or op.type == 'DepthwiseConv2dNative':
if self.check_winograd_conv(op):
self.convert_winograd_conv(op)
else:
self.convert_conv2d(op)
elif op.type == 'FusedBatchNorm':
self.convert_fused_batchnorm(op)
elif op.type == 'Add' and op.name.endswith('batchnorm/add'):
self.convert_batchnorm(op)
elif op.type == 'AvgPool' or op.type == 'MaxPool':
self.convert_pooling(op)
elif op.type == 'Relu6':
self.convert_relu6(op)
elif op.type == 'Add':
self.convert_add(op)
elif op.type == 'ConcatV2':
self.convert_concat(op)
elif op.type == 'ResizeBilinear':
self.convert_resize_bilinear(op)
elif op.type == 'BiasAdd':
self.convert_bias_add(op)
elif op.type == 'SpaceToBatchND':
self.convert_space_to_batch(op, False)
elif op.type == 'BatchToSpaceND':
self.convert_space_to_batch(op, True)
elif op.type == 'DepthToSpace':
self.convert_depth_to_space(op, True)
elif op.type == 'SpaceToDepth':
self.convert_depth_to_space(op, False)
elif op.type in ['Neg', 'neg', 'Negative', 'negative']:
self.convert_math(op, 'NEG')
elif op.type == 'Mul':
self.convert_math(op, 'MUL')
elif op.type == 'Sub':
self.convert_math(op, 'SUB')
elif self.is_softmax(op):
self.convert_softmax(op)
elif op.type in ['Relu', 'Sigmoid', 'Tanh']:
self.convert_activation(op)
# FIXME: hardcode for inception_v3
elif op.type in ['Squeeze', 'Shape']:
self.resolved_ops[op.name] = 1
elif op.type == 'Mean':
# Global avg pooling
reduce_dims = op.inputs[1].eval()
if reduce_dims[0] == 1 and reduce_dims[1] == 2:
self.convert_global_avg_pooling(op)
self.unused_tensor.add(op.inputs[1].name)
else:
raise Exception('Unknown Op: %s, type: %s' % (op.name,
op.type))
# elif op.type in ['']:
# self.convert_normal_op(op)
else:
raise Exception('Unknown Op: %s, type: %s' % (op.name,
op.type))
for op in self.tf_ops:
if self.resolved_ops[op.name] == 1:
continue
elif op.type == 'Const':
self.convert_tensor(op)
else:
raise Exception('Unknown Op: %s, type: %s' % (op.name,
op.type))
if self.device == 'gpu':
self.add_gpu_output_transform(output_nodes)
if self.device == 'neon': if self.device == 'neon':
self.add_neon_output_transform(output_nodes) self.add_neon_output_transform(output_nodes)
if self.device == 'cpu': if self.device == 'cpu':
self.replace_in_out_name(input_nodes, output_nodes) self.replace_in_out_name(input_nodes, output_nodes)
for key in self.resolved_ops: for key in self.resolved_ops:
if self.resolved_ops[key] != 1: if self.resolved_ops[key] != 1:
print 'Unresolve Op: %s' % key print 'Unresolve Op: %s' % key
class Optimizer:
def __init__(self, net_def, device):
self.net_def = net_def
self.device = device
self.mace_graph = {}
self.tensor_map = {}
for op in net_def.op:
for input_name in op.input:
if input_name not in self.mace_graph:
self.mace_graph[input_name] = []
self.mace_graph[input_name].append(op)
for tensor in net_def.tensors:
self.tensor_map[tensor.name] = tensor
def get_buffer_tensor_name(self, name):
if self.device == 'gpu':
return name[:-6] + name[-2:]
else:
return name
def fold_batch_norm(self):
unused_tensors = set()
new_tensors = []
new_net = mace_pb2.NetDef()
resolved_ops = set()
for op in self.net_def.op:
if op.name in resolved_ops:
pass
elif op.type == 'DepthwiseConv2d' and len(op.output) == 1 \
and self.mace_graph[op.output[0]][0].type == 'FoldedBatchNorm':
depthwise_conv2d_op = op
folded_bn_op = self.mace_graph[op.output[0]][0]
weight_buffer_name = self.get_buffer_tensor_name(depthwise_conv2d_op.input[1])
weight_tensor = self.tensor_map[weight_buffer_name]
scale_buffer_name = self.get_buffer_tensor_name(folded_bn_op.input[1])
offset_buffer_name = self.get_buffer_tensor_name(folded_bn_op.input[2])
scale_tensor = self.tensor_map[scale_buffer_name]
weight_shape = weight_tensor.dims
idx = 0
if self.device == 'neon': # OIHW
for oc in range(weight_shape[0]):
for ic in range(weight_shape[1]):
for i in range(weight_shape[2]):
for j in range(weight_shape[3]):
weight_tensor.float_data[idx] *= scale_tensor.float_data[ic * weight_shape[0] + oc]
idx += 1
else: # HWIO
for i in range(weight_shape[0]):
for j in range(weight_shape[1]):
for ic in range(weight_shape[2]):
for oc in range(weight_shape[3]):
weight_tensor.float_data[idx] *= scale_tensor.float_data[ic * weight_shape[3] + oc]
idx += 1
new_tensors.append(weight_tensor)
unused_tensors.add(weight_tensor.name)
unused_tensors.add(scale_tensor.name)
class Optimizer:
def __init__(self, net_def, device):
self.net_def = net_def
self.device = device
self.mace_graph = {}
self.tensor_map = {}
for op in net_def.op:
for input_name in op.input:
if input_name not in self.mace_graph:
self.mace_graph[input_name] = []
self.mace_graph[input_name].append(op)
for tensor in net_def.tensors:
self.tensor_map[tensor.name] = tensor
def get_buffer_tensor_name(self, name):
if self.device == 'gpu': if self.device == 'gpu':
scale_b2i_op = self.mace_graph[scale_buffer_name][0] return name[:-6] + name[-2:]
offset_b2i_op = self.mace_graph[offset_buffer_name][0] else:
resolved_ops.add(scale_b2i_op.name) return name
resolved_ops.add(offset_b2i_op.name)
new_net.op.extend([offset_b2i_op]) def fold_batch_norm(self):
unused_tensors = set()
resolved_ops.add(depthwise_conv2d_op.name) new_tensors = []
resolved_ops.add(folded_bn_op.name) new_net = mace_pb2.NetDef()
resolved_ops = set()
offset_tensor_name = folded_bn_op.input[2]
depthwise_conv2d_op.input.extend([offset_tensor_name]) for op in self.net_def.op:
if op.name in resolved_ops:
for arg in folded_bn_op.arg: pass
if arg.name == 'activation': elif op.type == 'DepthwiseConv2d' and len(op.output) == 1 and \
act_arg = depthwise_conv2d_op.arg.add() self.mace_graph[op.output[0]][0].type == 'FoldedBatchNorm':
act_arg.name = arg.name depthwise_conv2d_op = op
act_arg.s = arg.s folded_bn_op = self.mace_graph[op.output[0]][0]
elif arg.name == 'max_limit': weight_buffer_name = self.get_buffer_tensor_name(
act_arg = depthwise_conv2d_op.arg.add() depthwise_conv2d_op.input[1])
act_arg.name = arg.name weight_tensor = self.tensor_map[weight_buffer_name]
act_arg.f = arg.f scale_buffer_name = self.get_buffer_tensor_name(
folded_bn_op.input[1])
depthwise_conv2d_op.output[0] = folded_bn_op.output[0] offset_buffer_name = self.get_buffer_tensor_name(
new_net.op.extend([depthwise_conv2d_op]) folded_bn_op.input[2])
else: scale_tensor = self.tensor_map[scale_buffer_name]
new_net.op.extend([op]) weight_shape = weight_tensor.dims
idx = 0
for tensor in self.net_def.tensors: if self.device == 'neon': # OIHW
if tensor.name in unused_tensors: for oc in range(weight_shape[0]):
pass for ic in range(weight_shape[1]):
else: for i in range(weight_shape[2]):
new_net.tensors.extend([tensor]) for j in range(weight_shape[3]):
weight_tensor.float_data[
for tensor in new_tensors: idx] *= scale_tensor.float_data[
new_net.tensors.extend([tensor]) ic * weight_shape[0] + oc]
idx += 1
return new_net else: # HWIO
for i in range(weight_shape[0]):
def optimize(self): for j in range(weight_shape[1]):
new_net = self.fold_batch_norm() for ic in range(weight_shape[2]):
return new_net for oc in range(weight_shape[3]):
weight_tensor.float_data[
idx] *= scale_tensor.float_data[
ic * weight_shape[3] + oc]
idx += 1
new_tensors.append(weight_tensor)
unused_tensors.add(weight_tensor.name)
unused_tensors.add(scale_tensor.name)
if self.device == 'gpu':
scale_b2i_op = self.mace_graph[scale_buffer_name][0]
offset_b2i_op = self.mace_graph[offset_buffer_name][0]
resolved_ops.add(scale_b2i_op.name)
resolved_ops.add(offset_b2i_op.name)
new_net.op.extend([offset_b2i_op])
resolved_ops.add(depthwise_conv2d_op.name)
resolved_ops.add(folded_bn_op.name)
offset_tensor_name = folded_bn_op.input[2]
depthwise_conv2d_op.input.extend([offset_tensor_name])
for arg in folded_bn_op.arg:
if arg.name == 'activation':
act_arg = depthwise_conv2d_op.arg.add()
act_arg.name = arg.name
act_arg.s = arg.s
elif arg.name == 'max_limit':
act_arg = depthwise_conv2d_op.arg.add()
act_arg.name = arg.name
act_arg.f = arg.f
depthwise_conv2d_op.output[0] = folded_bn_op.output[0]
new_net.op.extend([depthwise_conv2d_op])
else:
new_net.op.extend([op])
for tensor in self.net_def.tensors:
if tensor.name in unused_tensors:
pass
else:
new_net.tensors.extend([tensor])
for tensor in new_tensors:
new_net.tensors.extend([tensor])
return new_net
def optimize(self):
new_net = self.fold_batch_norm()
return new_net
def add_shape_info(input_graph_def, input_nodes, input_shapes): def add_shape_info(input_graph_def, input_nodes, input_shapes):
inputs_replaced_graph = graph_pb2.GraphDef() inputs_replaced_graph = graph_pb2.GraphDef()
for node in input_graph_def.node: for node in input_graph_def.node:
if node.name in input_nodes: if node.name in input_nodes:
idx = input_nodes.index(node.name) idx = input_nodes.index(node.name)
input_shape = input_shapes[idx] input_shape = input_shapes[idx]
placeholder_node = copy.deepcopy(node) placeholder_node = copy.deepcopy(node)
placeholder_node.attr.clear() placeholder_node.attr.clear()
placeholder_node.attr['shape'].shape.dim.extend([ placeholder_node.attr['shape'].shape.dim.extend([
tensor_shape_pb2.TensorShapeProto.Dim(size=i) for i in input_shape tensor_shape_pb2.TensorShapeProto.Dim(size=i)
]) for i in input_shape
placeholder_node.attr['dtype'].CopyFrom(node.attr['dtype']) ])
inputs_replaced_graph.node.extend([placeholder_node]) placeholder_node.attr['dtype'].CopyFrom(node.attr['dtype'])
else: inputs_replaced_graph.node.extend([placeholder_node])
inputs_replaced_graph.node.extend([copy.deepcopy(node)]) else:
return inputs_replaced_graph inputs_replaced_graph.node.extend([copy.deepcopy(node)])
return inputs_replaced_graph
def convert_to_mace_pb(model_file, input_node, input_shape, output_node, data_type, device, winograd):
net_def = mace_pb2.NetDef() def convert_to_mace_pb(model_file, input_node, input_shape, output_node,
dt = data_type_map[data_type] data_type, device, winograd):
net_def = mace_pb2.NetDef()
input_graph_def = tf.GraphDef() dt = data_type_map[data_type]
with gfile.Open(model_file, "rb") as f:
data = f.read() input_graph_def = tf.GraphDef()
input_graph_def.ParseFromString(data) with gfile.Open(model_file, "rb") as f:
data = f.read()
input_nodes = [x for x in input_node.split(',')] input_graph_def.ParseFromString(data)
input_shapes = []
if input_shape != "": input_nodes = [x for x in input_node.split(',')]
input_shape_strs = [x for x in input_shape.split(':')] input_shapes = []
for shape_str in input_shape_strs: if input_shape != "":
input_shapes.extend([[int(x) for x in shape_str.split(',')]]) input_shape_strs = [x for x in input_shape.split(':')]
output_nodes = [x for x in output_node.split(',')] for shape_str in input_shape_strs:
assert len(input_nodes) == len(input_shapes) input_shapes.extend([[int(x) for x in shape_str.split(',')]])
output_nodes = [x for x in output_node.split(',')]
input_graph_def = add_shape_info(input_graph_def, input_nodes, input_shapes) assert len(input_nodes) == len(input_shapes)
with tf.Session() as session:
with session.graph.as_default() as graph: input_graph_def = add_shape_info(input_graph_def, input_nodes,
tf.import_graph_def(input_graph_def, name="") input_shapes)
ops = graph.get_operations() with tf.Session() as session:
converter = TFConverter(ops, net_def, dt, device, winograd) with session.graph.as_default() as graph:
converter.convert(input_nodes, output_nodes) tf.import_graph_def(input_graph_def, name="")
optimizer = Optimizer(net_def, device) ops = graph.get_operations()
net_def = optimizer.optimize() converter = TFConverter(ops, net_def, dt, device, winograd)
print "Model Converted." converter.convert(input_nodes, output_nodes)
if device == 'gpu': optimizer = Optimizer(net_def, device)
print "start optimize memory." net_def = optimizer.optimize()
mem_optimizer = memory_optimizer.MemoryOptimizer(net_def) print "Model Converted."
mem_optimizer.optimize() if device == 'gpu':
print "Memory optimization done." print "start optimize memory."
mem_optimizer = memory_optimizer.MemoryOptimizer(net_def)
return net_def mem_optimizer.optimize()
print "Memory optimization done."
return net_def
...@@ -6,452 +6,521 @@ from dsp_ops import DspOps ...@@ -6,452 +6,521 @@ from dsp_ops import DspOps
from mace.python.tools import graph_util from mace.python.tools import graph_util
from mace.python.tools.convert_util import tf_dtype_2_mace_dtype from mace.python.tools.convert_util import tf_dtype_2_mace_dtype
# converter --input ../libcv/quantized_model.pb --output quantized_model_dsp.pb \ # converter --input ../libcv/quantized_model.pb \
# --runtime dsp --input_node input_node --output_node output_node # --output quantized_model_dsp.pb \
# --runtime dsp --input_node input_node \
# --output_node output_node
padding_mode = { padding_mode = {
'NA': 0, 'NA': 0,
'SAME': 1, 'SAME': 1,
'VALID': 2, 'VALID': 2,
'MIRROR_REFLECT': 3, 'MIRROR_REFLECT': 3,
'MIRROR_SYMMETRIC': 4, 'MIRROR_SYMMETRIC': 4,
'SAME_CAFFE': 5 'SAME_CAFFE': 5
} }
def get_tensor_name_from_op(op_name, port): def get_tensor_name_from_op(op_name, port):
return op_name + ':' + str(port) return op_name + ':' + str(port)
def get_node_from_map(op_map, op_or_tensor_name): def get_node_from_map(op_map, op_or_tensor_name):
op_name = op_or_tensor_name.split(':')[0] op_name = op_or_tensor_name.split(':')[0]
return op_map[op_name] return op_map[op_name]
def get_op_and_port_from_tensor(tensor_name): def get_op_and_port_from_tensor(tensor_name):
op, port = tensor_name.split(':') op, port = tensor_name.split(':')
port = int(port) port = int(port)
return op, port return op, port
def max_elem_size(tensor): def max_elem_size(tensor):
if len(tensor.shape.as_list()) == 0: if len(tensor.shape.as_list()) == 0:
return tensor.dtype.size return tensor.dtype.size
else: else:
return reduce(mul, tensor.shape.as_list()) * tensor.dtype.size return reduce(mul, tensor.shape.as_list()) * tensor.dtype.size
def find_dtype(tensor_dtype): def find_dtype(tensor_dtype):
if tensor_dtype == tf.float32: if tensor_dtype == tf.float32:
return mace_pb2.DT_FLOAT return mace_pb2.DT_FLOAT
elif tensor_dtype == tf.uint8 or tensor_dtype == tf.quint8: elif tensor_dtype == tf.uint8 or tensor_dtype == tf.quint8:
return mace_pb2.DT_UINT8 return mace_pb2.DT_UINT8
elif tensor_dtype == tf.int32 or tensor_dtype == tf.qint32: elif tensor_dtype == tf.int32 or tensor_dtype == tf.qint32:
return mace_pb2.DT_INT32 return mace_pb2.DT_INT32
else: else:
raise Exception('Unsupported data type: ', tensor_dtype) raise Exception('Unsupported data type: ', tensor_dtype)
def has_padding_and_strides(op): def has_padding_and_strides(op):
return 'padding' in op.node_def.attr and 'strides' in op.node_def.attr return 'padding' in op.node_def.attr and 'strides' in op.node_def.attr
def is_node_flatten_reshape(op): def is_node_flatten_reshape(op):
return op.type == 'Reshape' and len(op.outputs[0].shape) == 1 return op.type == 'Reshape' and len(op.outputs[0].shape) == 1
def get_input_tensor(op, index): def get_input_tensor(op, index):
input_tensor = op.inputs[index] input_tensor = op.inputs[index]
if input_tensor.op.type == 'Reshape': if input_tensor.op.type == 'Reshape':
input_tensor = get_input_tensor(input_tensor.op, 0) input_tensor = get_input_tensor(input_tensor.op, 0)
return input_tensor return input_tensor
def add_shape_const_node(net_def, op, values, name): def add_shape_const_node(net_def, op, values, name):
print ('Add const node: ', op.name + '/' + name) print('Add const node: ', op.name + '/' + name)
tensor = net_def.tensors.add() tensor = net_def.tensors.add()
node_name = op.name + '/' + name node_name = op.name + '/' + name
tensor.name = node_name + ':0' tensor.name = node_name + ':0'
tensor.data_type = mace_pb2.DT_INT32 tensor.data_type = mace_pb2.DT_INT32
tensor.dims.extend(values) tensor.dims.extend(values)
return tensor.name return tensor.name
def convert_op_outputs(mace_op_def, tf_op):
    """Copy tf_op's output dtypes and shapes onto the MACE op definition.

    Extends mace_op_def.output_type with the mapped dtype of every TF
    output and mace_op_def.output_shape with matching OutputShape protos.
    """
    mace_op_def.output_type.extend(
        [tf_dtype_2_mace_dtype(output.dtype) for output in tf_op.outputs])
    output_shapes = []
    for output in tf_op.outputs:
        output_shape = mace_pb2.OutputShape()
        output_shape.dims.extend(output.shape.as_list())
        output_shapes.append(output_shape)
    mace_op_def.output_shape.extend(output_shapes)
def convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops):
    """Convert the head of unresolved_ops into MACE DSP op/tensor defs.

    Consumes exactly one op per call (the caller loops until the list is
    empty): the op is popped from unresolved_ops and its name added to
    resolved_ops. Const ops become tensors; other ops become op defs,
    with special fused lowerings for three quantized patterns
    (Dequantize->SpaceToBatchND/BatchToSpaceND, QuantizedReshape->...->
    Softmax, Dequantize->Tanh); ops absorbed by a fusion are marked
    resolved so they are skipped when reached.

    Raises:
        Exception: when the op type has no DSP mapping.
    """
    first_op = unresolved_ops[0]
    print('Op: ', first_op.name, first_op.type, first_op.outputs[0].shape)
    if first_op.name in resolved_ops:
        pass
    elif first_op.type == 'Const':
        print('Add const node: ', first_op.name)
        tf_tensor = first_op.outputs[0].eval()
        tensor = net_def.tensors.add()
        tensor.name = first_op.outputs[0].name
        tensor.data_type = find_dtype(first_op.outputs[0].dtype)
        shape = list(tf_tensor.shape)
        if len(shape) > 0:
            tensor.dims.extend(shape)
        if first_op.outputs[0].dtype == tf.float32:
            tensor.float_data.extend(tf_tensor.astype(float).flat)
        elif first_op.outputs[0].dtype == tf.int32 or \
                first_op.outputs[0].dtype == tf.int8 or \
                first_op.outputs[0].dtype == tf.int16 or \
                first_op.outputs[0].dtype == tf.quint8 or \
                first_op.outputs[0].dtype == tf.quint16:
            tensor.int32_data.extend(tf_tensor.astype(int).flat)
    else:
        op_def = net_def.op.add()
        op_def.name = first_op.name
        op_def.type = dsp_ops.map_nn_op(first_op.type)
        op_def.padding = padding_mode['NA']
        # Pattern 1: Dequantize feeding (Space|Batch)ToBatchND -- fuse the
        # requantize chain into a single Quantized<S2B/B2S> op.
        if len(first_op.outputs) > 0 and first_op.type == 'Dequantize' \
                and len(first_op.outputs[0].consumers()) > 0 \
                and (first_op.outputs[0].consumers()[0].type == 'SpaceToBatchND' or
                     first_op.outputs[0].consumers()[0].type == 'BatchToSpaceND'):
            input_tensor = first_op.inputs[0]
            min_tensor = first_op.inputs[1]
            max_tensor = first_op.inputs[2]
            s2b_op = first_op.outputs[0].consumers()[0]
            reshape_op = s2b_op.outputs[0].consumers()[0]
            min_op = reshape_op.outputs[0].consumers()[0]
            max_op = reshape_op.outputs[0].consumers()[1]
            quantize_op = min_op.outputs[0].consumers()[0]
            resolved_ops.add(s2b_op.name)
            resolved_ops.add(reshape_op.name)
            resolved_ops.add(min_op.name)
            resolved_ops.add(max_op.name)
            resolved_ops.add(quantize_op.name)
            op_def.name = quantize_op.name
            op_def.type = dsp_ops.map_nn_op('Quantized' + s2b_op.type)
            op_def.input.append(input_tensor.name)
            op_def.input.extend([t.name for t in s2b_op.inputs[1:]])
            op_def.input.extend([min_tensor.name, max_tensor.name])
            op_def.out_max_byte_size.extend(
                [max_elem_size(out) for out in quantize_op.outputs])
            convert_op_outputs(op_def, quantize_op)
        # Pattern 2: QuantizedReshape -> Dequantize -> Softmax chain -- fuse
        # into a single QuantizedSoftmax op.
        elif len(first_op.outputs) > 0 and \
                first_op.type == 'QuantizedReshape' and \
                len(first_op.outputs[0].consumers()) > 0 and \
                first_op.outputs[0].consumers()[0].type == 'Dequantize' and \
                len(first_op.outputs[0].consumers()[0].outputs[0].consumers()) \
                > 0 and \
                first_op.outputs[0].consumers()[0].outputs[0].consumers()[0].type \
                == 'Softmax':
            input_tensor = first_op.inputs[0]
            min_tensor = first_op.inputs[2]
            max_tensor = first_op.inputs[3]
            dequantize_op = first_op.outputs[0].consumers()[0]
            softmax_op = dequantize_op.outputs[0].consumers()[0]
            reshape_op = softmax_op.outputs[0].consumers()[0]
            min_op = reshape_op.outputs[0].consumers()[0]
            max_op = reshape_op.outputs[0].consumers()[1]
            quantize_op = min_op.outputs[0].consumers()[0]
            quantize_reshape_op = quantize_op.outputs[0].consumers()[0]
            resolved_ops.add(dequantize_op.name)
            resolved_ops.add(softmax_op.name)
            resolved_ops.add(reshape_op.name)
            resolved_ops.add(min_op.name)
            resolved_ops.add(max_op.name)
            resolved_ops.add(quantize_op.name)
            resolved_ops.add(quantize_reshape_op.name)
            op_def.name = quantize_reshape_op.name
            op_def.type = dsp_ops.map_nn_op('QuantizedSoftmax')
            op_def.input.extend(
                [input_tensor.name, min_tensor.name, max_tensor.name])
            op_def.out_max_byte_size.extend(
                [max_elem_size(out) for out in quantize_reshape_op.outputs])
            convert_op_outputs(op_def, quantize_reshape_op)
        # Pattern 3: Dequantize -> Tanh -- fuse into QuantizedTanh. Two
        # sub-cases depending on whether Tanh is the last op of the graph.
        elif len(first_op.outputs) > 0 and first_op.type == 'Dequantize' and \
                len(first_op.outputs[0].consumers()) > 0 and \
                first_op.outputs[0].consumers()[0].type == 'Tanh':
            input_tensor = first_op.inputs[0]
            min_tensor = first_op.inputs[1]
            max_tensor = first_op.inputs[2]
            tanh_op = first_op.outputs[0].consumers()[0]
            resolved_ops.add(tanh_op.name)
            # Tanh has downstream consumers: absorb the requantize chain.
            if tanh_op.outputs[0].consumers():
                reshape_op = tanh_op.outputs[0].consumers()[0]
                min_op = reshape_op.outputs[0].consumers()[0]
                max_op = reshape_op.outputs[0].consumers()[1]
                quantize_op = min_op.outputs[0].consumers()[0]
                resolved_ops.add(reshape_op.name)
                resolved_ops.add(min_op.name)
                resolved_ops.add(max_op.name)
                resolved_ops.add(quantize_op.name)
                op_def.name = quantize_op.name
                op_def.type = dsp_ops.map_nn_op('Quantized' + tanh_op.type)
                op_def.input.extend(
                    [input_tensor.name, min_tensor.name, max_tensor.name])
                op_def.out_max_byte_size.extend(
                    [max_elem_size(out) for out in quantize_op.outputs])
                convert_op_outputs(op_def, quantize_op)
            # Tanh is the last op: emit QuantizedTanh plus a trailing
            # Dequantize so the graph still ends with a float output.
            else:
                op_def.name = tanh_op.name + '/QuantizedTanh'
                op_def.type = dsp_ops.map_nn_op('Quantized' + tanh_op.type)
                op_def.input.extend(
                    [input_tensor.name, min_tensor.name, max_tensor.name])
                op_def.out_max_byte_size.extend([
                    max_elem_size(input_tensor),
                    max_elem_size(min_tensor),
                    max_elem_size(max_tensor)
                ])
                op_def.output_type.extend(
                    [mace_pb2.DT_UINT8, mace_pb2.DT_FLOAT, mace_pb2.DT_FLOAT])
                output_shapes = []
                for output in first_op.inputs:
                    output_shape = mace_pb2.OutputShape()
                    output_shape.dims.extend(output.shape.as_list())
                    output_shapes.append(output_shape)
                op_def.output_shape.extend(output_shapes)
                new_tanh_op_def = net_def.op.add()
                new_tanh_op_def.name = tanh_op.name
                new_tanh_op_def.type = dsp_ops.map_nn_op('Dequantize')
                new_tanh_op_def.input.extend([
                    get_tensor_name_from_op(op_def.name, 0),
                    get_tensor_name_from_op(op_def.name, 1),
                    get_tensor_name_from_op(op_def.name, 2)
                ])
                new_tanh_op_def.out_max_byte_size.extend(
                    [max_elem_size(tanh_op.outputs[0])])
                convert_op_outputs(new_tanh_op_def, tanh_op)
        elif has_padding_and_strides(first_op):
            op_def.padding = padding_mode[first_op.get_attr('padding')]
            op_def.input.extend([t.name for t in first_op.inputs])
            if 'ksize' in first_op.node_def.attr:
                ksize = first_op.get_attr('ksize')
                ksize_tensor = add_shape_const_node(net_def, first_op, ksize,
                                                    'ksize')
                op_def.input.extend([ksize_tensor])
            strides = first_op.get_attr('strides')
            strides_tensor = add_shape_const_node(net_def, first_op, strides,
                                                  'strides')
            op_def.input.extend([strides_tensor])
            op_def.out_max_byte_size.extend(
                [max_elem_size(out) for out in first_op.outputs])
            convert_op_outputs(op_def, first_op)
        elif is_node_flatten_reshape(first_op):
            op_def.type = 'Flatten'
            op_def.input.extend([t.name for t in first_op.inputs])
            op_def.out_max_byte_size.extend(
                [max_elem_size(out) for out in first_op.outputs])
            convert_op_outputs(op_def, first_op)
        elif dsp_ops.has_op(first_op.type):
            op_def.input.extend([t.name for t in first_op.inputs])
            op_def.out_max_byte_size.extend(
                [max_elem_size(out) for out in first_op.outputs])
            convert_op_outputs(op_def, first_op)
        else:
            raise Exception('Unsupported op: ', first_op)

    resolved_ops.add(first_op.name)
    del unresolved_ops[0]
def add_output_node(net_def, output_node):
    """Append the sentinel '__output__' OUTPUT op consuming output_node.

    The sorter later uses '__output__' as the graph sink
    (see convert_to_mace_pb).
    """
    op_def = net_def.op.add()
    op_def.name = '__output__'
    op_def.type = 'OUTPUT'
    op_def.input.extend([get_tensor_name_from_op(output_node, 0)])
def reverse_batch_to_space_and_biasadd(net_def):
    """Swap QuantizedBatchToSpaceND below the BiasAdd+Requantize pair.

    Matches the chain QConv -> Requantize -> QB2S -> QBiasAdd ->
    Requantize and rewrites it so BiasAdd/Requantize run before the
    batch-to-space, rewiring inputs of downstream consumers accordingly.
    Returns a new NetDef; matched original ops are dropped and replaced
    by the rewritten copies.
    """
    tensor_map = {}
    for tensor in net_def.tensors:
        tensor_map[tensor.name] = tensor
    op_map = {}
    for op in net_def.op:
        op_map[op.name] = op
    consumers = {}
    for op in net_def.op:
        for ipt in op.input:
            if ipt not in consumers:
                consumers[ipt] = []
            consumers[ipt].append(op)

    new_ops = []
    skip_ops = set()
    visited_ops = set()
    for op in net_def.op:
        # NOTE(review): this 'pass' does not skip already-visited ops --
        # looks like it was meant to be 'continue'; preserved as-is to
        # keep behavior identical. TODO confirm with authors.
        if op.name in visited_ops:
            pass
        # pattern: QConv -> RR -> R -> QB2S -> QBiasAdd -> RR -> R
        if op.type == 'Requantize_32to8':
            biasadd_requantize_op = op
            biasadd_op = get_node_from_map(op_map,
                                           biasadd_requantize_op.input[0])
            if biasadd_op.type == 'QuantizedBiasAdd_8p8to32':
                b2s_op = get_node_from_map(op_map, biasadd_op.input[0])
                if b2s_op.type == 'QuantizedBatchToSpaceND_8':
                    conv_requantize_op = get_node_from_map(
                        op_map, b2s_op.input[0])
                    conv_op = get_node_from_map(op_map,
                                                conv_requantize_op.input[0])
                    if conv_op.type == 'QuantizedConv2d_8x8to32':
                        # BiasAdd now reads the conv requantize outputs
                        # (data, min, max) directly, bypassing B2S.
                        new_biasadd_op = mace_pb2.OperatorDef()
                        new_biasadd_op.CopyFrom(biasadd_op)
                        new_biasadd_op.input[0] = get_tensor_name_from_op(
                            conv_requantize_op.name, 0)
                        new_biasadd_op.input[2] = get_tensor_name_from_op(
                            conv_requantize_op.name, 1)
                        new_biasadd_op.input[3] = get_tensor_name_from_op(
                            conv_requantize_op.name, 2)
                        # 32-bit accumulator output: 4x the 8-bit size.
                        new_biasadd_op.out_max_byte_size[
                            0] = conv_requantize_op.out_max_byte_size[0] * 4

                        new_biasadd_requantize_op = mace_pb2.OperatorDef()
                        new_biasadd_requantize_op.CopyFrom(
                            biasadd_requantize_op)
                        new_biasadd_requantize_op.out_max_byte_size[
                            0] = new_biasadd_op.out_max_byte_size[0] / 4

                        # B2S moves below the requantize and consumes its
                        # (data, min, max) outputs.
                        new_b2s_op = mace_pb2.OperatorDef()
                        new_b2s_op.CopyFrom(b2s_op)
                        new_b2s_op.input[0] = get_tensor_name_from_op(
                            biasadd_requantize_op.name, 0)
                        new_b2s_op.input[3] = get_tensor_name_from_op(
                            biasadd_requantize_op.name, 1)
                        new_b2s_op.input[4] = get_tensor_name_from_op(
                            biasadd_requantize_op.name, 2)

                        new_ops.extend([
                            new_biasadd_op, new_biasadd_requantize_op,
                            new_b2s_op
                        ])
                        skip_ops = skip_ops.union([
                            biasadd_op.name, biasadd_requantize_op.name,
                            b2s_op.name
                        ])
                        visited_ops.add(op.name)

                        # Downstream consumers now read from the relocated
                        # B2S instead of the old requantize outputs.
                        follow_ops = consumers[get_tensor_name_from_op(
                            biasadd_requantize_op.name, 0)]
                        for follow_op in follow_ops:
                            new_follow_op = mace_pb2.OperatorDef()
                            new_follow_op.CopyFrom(follow_op)
                            for i in xrange(len(follow_op.input)):
                                for k in xrange(3):
                                    if new_follow_op.input[
                                            i] == get_tensor_name_from_op(
                                                biasadd_requantize_op.name, k):
                                        new_follow_op.input[
                                            i] = get_tensor_name_from_op(
                                                b2s_op.name, k)
                            new_ops.append(new_follow_op)
                            skip_ops.add(follow_op.name)
                            visited_ops.add(follow_op.name)

        visited_ops.add(op.name)

    new_net_def = mace_pb2.NetDef()
    new_net_def.tensors.extend(tensor_map.values())
    new_net_def.op.extend([op for op in net_def.op if op.name not in skip_ops])
    new_net_def.op.extend(new_ops)

    return new_net_def
def add_node_id(net_def):
    """Assign sequential node ids to tensors then ops, and wire node_input.

    Tensors are numbered first, then ops continue the same counter. Each
    op input string 'producer:port' is resolved through the name->id map
    into a (node_id, output_port) node_input entry. Mutates and returns
    net_def.
    """
    node_id_counter = 0
    node_id_map = {}
    for tensor in net_def.tensors:
        tensor.node_id = node_id_counter
        node_id_counter += 1
        tensor_op, port = get_op_and_port_from_tensor(tensor.name)
        node_id_map[tensor_op] = tensor.node_id
    for op in net_def.op:
        op.node_id = node_id_counter
        node_id_counter += 1
        node_id_map[op.name] = op.node_id
        for ipt in op.input:
            op_name, port = get_op_and_port_from_tensor(ipt)
            node_id = node_id_map[op_name]
            node_input = op.node_input.add()
            node_input.node_id = node_id
            node_input.output_port = int(port)
    return net_def
def add_input_output_info(net_def, input_node, output_node, graph, dtype):
    """Record the model input/output shapes and dtypes on net_def.

    Shapes are read from the TF graph tensors named '<node>:0'. For
    quantized (DT_UINT8) models, two extra [1,1,1,1] float entries are
    appended on each side -- presumably the min/max range tensors that
    accompany quantized data (TODO confirm). Mutates and returns net_def.
    """
    input_tensor = graph.get_tensor_by_name(
        get_tensor_name_from_op(input_node, 0))
    output_tensor = graph.get_tensor_by_name(
        get_tensor_name_from_op(output_node, 0))

    input_info = net_def.input_info.add()
    input_info.dims.extend(input_tensor.shape.as_list())
    input_info.data_type = dtype
    if dtype == mace_pb2.DT_UINT8:
        for i in xrange(2):
            input_info = net_def.input_info.add()
            input_info.dims.extend([1, 1, 1, 1])
            input_info.data_type = mace_pb2.DT_FLOAT

    output_info = net_def.output_info.add()
    output_info.dims.extend(output_tensor.shape.as_list())
    output_info.data_type = dtype
    if dtype == mace_pb2.DT_UINT8:
        for i in xrange(2):
            output_info = net_def.output_info.add()
            output_info.dims.extend([1, 1, 1, 1])
            output_info.data_type = mace_pb2.DT_FLOAT

    return net_def
def fuse_quantize(net_def, input_node, output_node):
    """Fuse the INPUT->Flatten->Min/Max->Quantize chain into AutoQuantize.

    When an INPUT op feeds both a Flatten (whose output feeds Min and Max
    ops) and a Quantize op, the Flatten/Min/Max trio and their const
    inputs are dropped, and the Quantize op is rewritten in place to
    'AutoQuantize' with only its data input. Returns a new NetDef.

    input_node/output_node are currently unused by the fusion itself.
    """
    tensor_map = {}
    for tensor in net_def.tensors:
        tensor_map[tensor.name] = tensor
    op_map = {}
    for op in net_def.op:
        op_map[op.name] = op
    consumers = {}
    for op in net_def.op:
        for ipt in op.input:
            if ipt not in consumers:
                consumers[ipt] = []
            consumers[ipt].append(op)

    skip_ops = set()
    new_ops = []
    skip_tensors = set()

    # INPUT->Flatten->Minf, Maxf->Quantize
    for op in net_def.op:
        if op.type == 'INPUT':
            input_op = op
            flatten_op = None
            quantize_op = None
            for o in consumers[get_tensor_name_from_op(input_op.name, 0)]:
                if o.type == 'Flatten':
                    flatten_op = o
                elif o.type == 'Quantize':
                    quantize_op = o
            if quantize_op is not None:
                minf_op, maxf_op = consumers[get_tensor_name_from_op(
                    flatten_op.name, 0)]
                skip_ops = skip_ops.union(
                    [flatten_op.name, minf_op.name, maxf_op.name])
                skip_tensors = skip_tensors.union(
                    [flatten_op.input[1], minf_op.input[1], maxf_op.input[1]])
                # Rewrite in place: keep only the data input.
                quantize_op.type = 'AutoQuantize'
                del quantize_op.input[1:]

    new_net_def = mace_pb2.NetDef()
    new_net_def.tensors.extend([
        tensor for tensor in net_def.tensors if tensor.name not in skip_tensors
    ])
    new_net_def.op.extend([op for op in net_def.op if op.name not in skip_ops])
    new_net_def.op.extend(new_ops)
    return new_net_def
def convert_to_mace_pb(model_file, input_node, output_node, dsp_mode):
    """Convert a frozen TF GraphDef file into a MACE DSP NetDef.

    nnlib does not have batch norm, so use tensorflow optimizer to fold
    batch norm with convolution. The fold optimization reorders ops, so
    we sort ops first by topology.

    Pipeline: load + topo-sort the TF graph, convert Const ops then the
    rest via convert_ops, append the '__output__' sink, run the B2S/
    BiasAdd and quantize fusions, re-sort, assign node ids, attach
    input/output info, and record dsp_mode as a NetDef arg.
    """
    input_graph_def = tf.GraphDef()
    with gfile.Open(model_file, "rb") as f:
        data = f.read()
        input_graph_def.ParseFromString(data)
        input_graph_def = graph_util.sort_tf_graph(input_graph_def)
    net_def = mace_pb2.NetDef()

    with tf.Session() as session:
        with session.graph.as_default() as graph:
            tf.import_graph_def(input_graph_def, name="")
            ops = graph.get_operations()
            dsp_ops = DspOps()
            resolved_ops = set()

            # convert const node
            unresolved_ops = [op for op in ops if op.type == 'Const']
            while len(unresolved_ops) > 0:
                convert_ops(unresolved_ops, resolved_ops, net_def, output_node,
                            dsp_ops)

            # convert op node
            unresolved_ops = [op for op in ops if op.type != 'Const']
            while len(unresolved_ops) > 0:
                convert_ops(unresolved_ops, resolved_ops, net_def, output_node,
                            dsp_ops)

            add_output_node(net_def, output_node)
            net_def = reverse_batch_to_space_and_biasadd(net_def)
            net_def = fuse_quantize(net_def, input_node, output_node)

            sorted_net_def = graph_util.sort_mace_graph(net_def, '__output__')
            net_def_with_node_id = add_node_id(sorted_net_def)

            dtype = mace_pb2.DT_FLOAT
            final_net_def = add_input_output_info(
                net_def_with_node_id, input_node, output_node, graph, dtype)

            arg = final_net_def.arg.add()
            arg.name = 'dsp_mode'
            arg.i = dsp_mode

    return final_net_def
...@@ -10,148 +10,174 @@ from tensorflow import gfile ...@@ -10,148 +10,174 @@ from tensorflow import gfile
# Parsed command-line flags; None until the argument parser runs
# (populated elsewhere in this script -- presumably in its __main__
# section, which is outside this view).
FLAGS = None
def hist_inc(hist, key):
    """Increment the count for key in the histogram dict (0 if absent)."""
    hist[key] = hist.get(key, 0) + 1
def to_int_list(long_list):
    """Return a list with every element of long_list converted via int().

    Named for protobuf repeated fields that surface values as longs.
    """
    return [int(value) for value in long_list]
def main(unused_args): def main(unused_args):
if not FLAGS.input or not gfile.Exists(FLAGS.input): if not FLAGS.input or not gfile.Exists(FLAGS.input):
print('Input graph file ' + FLAGS.input + ' does not exist!') print('Input graph file ' + FLAGS.input + ' does not exist!')
return -1 return -1
input_graph_def = tf.GraphDef() input_graph_def = tf.GraphDef()
with gfile.Open(FLAGS.input, 'rb') as f: with gfile.Open(FLAGS.input, 'rb') as f:
data = f.read() data = f.read()
input_graph_def.ParseFromString(data) input_graph_def.ParseFromString(data)
with tf.Session() as session: with tf.Session() as session:
with session.graph.as_default() as graph: with session.graph.as_default() as graph:
tf.import_graph_def(input_graph_def, name='') tf.import_graph_def(input_graph_def, name='')
stats = {} stats = {}
ops = graph.get_operations() ops = graph.get_operations()
# extract kernel size for conv_2d # extract kernel size for conv_2d
tensor_shapes = {} tensor_shapes = {}
tensor_values = {} tensor_values = {}
print("=========================consts============================") print("=========================consts============================")
for op in ops: for op in ops:
if op.type == 'Const': if op.type == 'Const':
for output in op.outputs: for output in op.outputs:
tensor_name = output.name tensor_name = output.name
tensor = output.eval() tensor = output.eval()
tensor_shape = list(tensor.shape) tensor_shape = list(tensor.shape)
tensor_shapes[tensor_name] = tensor_shape tensor_shapes[tensor_name] = tensor_shape
print("Const %s: %s, %d" % (tensor_name, tensor_shape, functools.reduce(operator.mul, tensor_shape, 1))) print("Const %s: %s, %d" %
if len(tensor_shape) == 1 and tensor_shape[0] < 10: (tensor_name, tensor_shape,
tensor_values[tensor_name] = list(tensor) functools.reduce(operator.mul, tensor_shape, 1)))
if len(tensor_shape) == 1 and tensor_shape[0] < 10:
print("=========================ops============================") tensor_values[tensor_name] = list(tensor)
for op in ops:
if op.type in ['Conv2D']: print("=========================ops============================")
padding = op.get_attr('padding') for op in ops:
strides = to_int_list(op.get_attr('strides')) if op.type in ['Conv2D']:
data_format = op.get_attr('data_format') padding = op.get_attr('padding')
ksize = 'Unknown' strides = to_int_list(op.get_attr('strides'))
for input in op.inputs: data_format = op.get_attr('data_format')
input_name = input.name ksize = 'Unknown'
if input_name.endswith('weights/read:0'): for input in op.inputs:
ksize = input.shape.as_list() input_name = input.name
break if input_name.endswith('weights/read:0'):
if input_name.endswith('weights:0') and input_name in tensor_shapes: ksize = input.shape.as_list()
ksize = tensor_shapes[input_name] break
break if input_name.endswith(
print('%s(padding=%s, strides=%s, ksize=%s, format=%s) %s => %s' % (op.type, padding, strides, ksize, data_format, op.inputs[0].shape, op.outputs[0].shape)) 'weights:0') and input_name in tensor_shapes:
key = '%s(padding=%s, strides=%s, ksize=%s, format=%s)' % (op.type, padding, strides, ksize, data_format) ksize = tensor_shapes[input_name]
hist_inc(stats, key) break
elif op.type in ['FusedResizeAndPadConv2D']: print(
padding = op.get_attr('padding') '%s(padding=%s, strides=%s, ksize=%s, format=%s) %s => %s'
strides = to_int_list(op.get_attr('strides')) % (op.type, padding, strides, ksize, data_format,
resize_align_corners = op.get_attr('resize_align_corners') op.inputs[0].shape, op.outputs[0].shape))
ksize = 'Unknown' key = '%s(padding=%s, strides=%s, ksize=%s, format=%s)' % (
for input in op.inputs: op.type, padding, strides, ksize, data_format)
input_name = input.name hist_inc(stats, key)
if input_name.endswith('weights:0') and input_name in tensor_shapes: elif op.type in ['FusedResizeAndPadConv2D']:
ksize = tensor_shapes[input_name] padding = op.get_attr('padding')
break strides = to_int_list(op.get_attr('strides'))
key = '%s(padding=%s, strides=%s, ksize=%s, resize_align_corners=%s)' % (op.type, padding, strides, ksize, resize_align_corners) resize_align_corners = op.get_attr('resize_align_corners')
hist_inc(stats, key) ksize = 'Unknown'
elif op.type in ['ResizeBilinear']: for input in op.inputs:
align_corners = op.get_attr('align_corners') input_name = input.name
size = 'Unknown' if input_name.endswith(
for input in op.inputs: 'weights:0') and input_name in tensor_shapes:
input_name = input.name ksize = tensor_shapes[input_name]
if input_name.endswith('size:0') and input_name in tensor_values: break
size = tensor_values[input_name] key = '%s(padding=%s, strides=%s, ksize=%s, ' \
break 'resize_align_corners=%s)' % (op.type, padding, strides,
key = '%s(size=%s, align_corners=%s)' % (op.type, size, align_corners) ksize, resize_align_corners)
print(key) hist_inc(stats, key)
hist_inc(stats, key) elif op.type in ['ResizeBilinear']:
elif op.type in ['AvgPool', 'MaxPool']: align_corners = op.get_attr('align_corners')
padding = op.get_attr('padding') size = 'Unknown'
strides = to_int_list(op.get_attr('strides')) for input in op.inputs:
ksize = to_int_list(op.get_attr('ksize')) input_name = input.name
data_format = op.get_attr('data_format') if input_name.endswith(
key = '%s(padding=%s, strides=%s, ksize=%s)' % (op.type, padding, strides, ksize) 'size:0') and input_name in tensor_values:
hist_inc(stats, key) size = tensor_values[input_name]
elif op.type in ['SpaceToBatchND', 'BatchToSpaceND']: break
block_shape = 'Unknown' key = '%s(size=%s, align_corners=%s)' % (op.type, size,
for input in op.inputs: align_corners)
input_name = input.name print(key)
if input_name.endswith('block_shape:0') and input_name in tensor_values: hist_inc(stats, key)
block_shape = tensor_values[input_name] elif op.type in ['AvgPool', 'MaxPool']:
break padding = op.get_attr('padding')
paddings = 'Unknown' strides = to_int_list(op.get_attr('strides'))
for input in op.inputs: ksize = to_int_list(op.get_attr('ksize'))
input_name = input.name data_format = op.get_attr('data_format')
if input_name.endswith('paddings:0') and input_name in tensor_values: key = '%s(padding=%s, strides=%s, ksize=%s)' % (op.type,
paddings = tensor_values[input_name] padding,
break strides, ksize)
crops = 'Unknown' hist_inc(stats, key)
for input in op.inputs: elif op.type in ['SpaceToBatchND', 'BatchToSpaceND']:
input_name = input.name block_shape = 'Unknown'
if input_name.endswith('crops:0') and input_name in tensor_values: for input in op.inputs:
paddings = tensor_values[input_name] input_name = input.name
break if input_name.endswith(
if op.type == 'SpaceToBatchND': 'block_shape:0') and input_name in tensor_values:
key = '%s(block_shape=%s, paddings=%s)' % (op.type, block_shape, paddings) block_shape = tensor_values[input_name]
else: break
key = '%s(block_shape=%s, crops=%s)' % (op.type, block_shape, crops) paddings = 'Unknown'
print(key) for input in op.inputs:
hist_inc(stats, key) input_name = input.name
elif op.type == 'Pad': if input_name.endswith(
paddings = 'Unknown' 'paddings:0') and input_name in tensor_values:
for input in op.inputs: paddings = tensor_values[input_name]
input_name = input.name break
if input_name.endswith('paddings:0') and input_name in tensor_values: crops = 'Unknown'
paddings = tensor_values[input_name] for input in op.inputs:
break input_name = input.name
key = '%s(paddings=%s)' % (op.type, paddings) if input_name.endswith(
hist_inc(stats, key) 'crops:0') and input_name in tensor_values:
else: paddings = tensor_values[input_name]
hist_inc(stats, op.type) break
if op.type == 'SpaceToBatchND':
print("=========================stats============================") key = '%s(block_shape=%s, paddings=%s)' % (op.type,
for key, value in sorted(six.iteritems(stats)): block_shape,
print('%s: %d' % (key, value)) paddings)
else:
key = '%s(block_shape=%s, crops=%s)' % (op.type,
block_shape, crops)
print(key)
hist_inc(stats, key)
elif op.type == 'Pad':
paddings = 'Unknown'
for input in op.inputs:
input_name = input.name
if input_name.endswith(
'paddings:0') and input_name in tensor_values:
paddings = tensor_values[input_name]
break
key = '%s(paddings=%s)' % (op.type, paddings)
hist_inc(stats, key)
else:
hist_inc(stats, op.type)
print("=========================stats============================")
for key, value in sorted(six.iteritems(stats)):
print('%s: %d' % (key, value))
def parse_args(): def parse_args():
'''Parses command line arguments.''' '''Parses command line arguments.'''
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument( parser.add_argument(
'--input', '--input',
type=str, type=str,
default='', default='',
help='TensorFlow \'GraphDef\' file to load.') help='TensorFlow \'GraphDef\' file to load.')
return parser.parse_known_args() return parser.parse_known_args()
if __name__ == '__main__': if __name__ == '__main__':
FLAGS, unparsed = parse_args() FLAGS, unparsed = parse_args()
main(unused_args=[sys.argv[0]] + unparsed) main(unused_args=[sys.argv[0]] + unparsed)
...@@ -7,7 +7,6 @@ ...@@ -7,7 +7,6 @@
# --target=//mace/ops:ops_test # --target=//mace/ops:ops_test
# --stdout_processor=stdout_processor # --stdout_processor=stdout_processor
import argparse import argparse
import random import random
import re import re
...@@ -15,104 +14,113 @@ import sys ...@@ -15,104 +14,113 @@ import sys
import sh_commands import sh_commands
def stdout_processor(stdout, device_properties, abi): def stdout_processor(stdout, device_properties, abi):
pass pass
def ops_test_stdout_processor(stdout, device_properties, abi): def ops_test_stdout_processor(stdout, device_properties, abi):
stdout_lines = stdout.split("\n") stdout_lines = stdout.split("\n")
for line in stdout_lines: for line in stdout_lines:
if "Aborted" in line or "FAILED" in line: if "Aborted" in line or "FAILED" in line:
raise Exception("Command failed") raise Exception("Command failed")
def ops_benchmark_stdout_processor(stdout, device_properties, abi): def ops_benchmark_stdout_processor(stdout, device_properties, abi):
stdout_lines = stdout.split("\n") stdout_lines = stdout.split("\n")
metrics = {} metrics = {}
for line in stdout_lines: for line in stdout_lines:
if "Aborted" in line: if "Aborted" in line:
raise Exception("Command failed") raise Exception("Command failed")
line = line.strip() line = line.strip()
parts = line.split() parts = line.split()
if len(parts) == 5 and parts[0].startswith("BM_"): if len(parts) == 5 and parts[0].startswith("BM_"):
metrics["%s.time_ms" % parts[0]] = str(float(parts[1])/1e6) metrics["%s.time_ms" % parts[0]] = str(float(parts[1]) / 1e6)
metrics["%s.input_mb_per_sec" % parts[0]] = parts[3] metrics["%s.input_mb_per_sec" % parts[0]] = parts[3]
metrics["%s.gmacc_per_sec" % parts[0]] = parts[4] metrics["%s.gmacc_per_sec" % parts[0]] = parts[4]
platform = device_properties["ro.board.platform"].replace(" ", "-") platform = device_properties["ro.board.platform"].replace(" ", "-")
model = device_properties["ro.product.model"].replace(" ", "-") model = device_properties["ro.product.model"].replace(" ", "-")
tags = {"ro.board.platform": platform, tags = {
"ro.product.model": model, "ro.board.platform": platform,
"abi": abi} "ro.product.model": model,
sh_commands.falcon_push_metrics(metrics, tags=tags, "abi": abi
endpoint="mace_ops_benchmark") }
sh_commands.falcon_push_metrics(
metrics, tags=tags, endpoint="mace_ops_benchmark")
def parse_args(): def parse_args():
"""Parses command line arguments.""" """Parses command line arguments."""
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument( parser.add_argument(
"--target_abis", "--target_abis",
type=str, type=str,
default="armeabi-v7a", default="armeabi-v7a",
help="Target ABIs, comma seperated list") help="Target ABIs, comma seperated list")
parser.add_argument( parser.add_argument(
"--target_socs", "--target_socs",
type=str, type=str,
default="all", default="all",
help="SoCs(ro.board.platform) to build, comma seperated list or all/random") help="SoCs (ro.board.platform from getprop) to build, "
parser.add_argument( "comma seperated list or all/random")
"--target", parser.add_argument(
type=str, "--target", type=str, default="//...", help="Bazel target to build")
default="//...", parser.add_argument(
help="Bazel target to build") "--run_target",
parser.add_argument( type=bool,
"--run_target", default=False,
type=bool, help="Whether to run the target")
default=False, parser.add_argument("--args", type=str, default="", help="Command args")
help="Whether to run the target") parser.add_argument(
parser.add_argument( "--stdout_processor",
"--args", type=str,
type=str, default="stdout_processor",
default="", help="Stdout processing function, default: stdout_processor")
help="Command args") return parser.parse_known_args()
parser.add_argument(
"--stdout_processor",
type=str,
default="stdout_processor",
help="Stdout processing function, default: stdout_processor")
return parser.parse_known_args()
def main(unused_args): def main(unused_args):
target_socs = None target_socs = None
if FLAGS.target_socs != "all" and FLAGS.target_socs != "random": if FLAGS.target_socs != "all" and FLAGS.target_socs != "random":
target_socs = set(FLAGS.target_socs.split(',')) target_socs = set(FLAGS.target_socs.split(','))
target_devices = sh_commands.adb_devices(target_socs=target_socs) target_devices = sh_commands.adb_devices(target_socs=target_socs)
if FLAGS.target_socs == "random": if FLAGS.target_socs == "random":
target_devices = [random.choice(target_devices)] target_devices = [random.choice(target_devices)]
target = FLAGS.target target = FLAGS.target
host_bin_path, bin_name = sh_commands.bazel_target_to_bin(target) host_bin_path, bin_name = sh_commands.bazel_target_to_bin(target)
target_abis = FLAGS.target_abis.split(',') target_abis = FLAGS.target_abis.split(',')
# generate sources # generate sources
sh_commands.gen_encrypted_opencl_source() sh_commands.gen_encrypted_opencl_source()
sh_commands.gen_compiled_opencl_source() sh_commands.gen_compiled_opencl_source()
sh_commands.gen_mace_version() sh_commands.gen_mace_version()
for target_abi in target_abis: for target_abi in target_abis:
sh_commands.bazel_build(target, abi=target_abi) sh_commands.bazel_build(target, abi=target_abi)
if FLAGS.run_target: if FLAGS.run_target:
for serialno in target_devices: for serialno in target_devices:
if target_abi not in set(sh_commands.adb_supported_abis(serialno)): if target_abi not in set(
print("Skip device %s which does not support ABI %s" % (serialno, target_abi)) sh_commands.adb_supported_abis(serialno)):
continue print("Skip device %s which does not support ABI %s" %
stdouts = sh_commands.adb_run(serialno, host_bin_path, bin_name, (serialno, target_abi))
args=FLAGS.args, continue
opencl_profiling=1, stdouts = sh_commands.adb_run(
vlog_level=0, serialno,
device_bin_path="/data/local/tmp/mace", host_bin_path,
out_of_range_check=1) bin_name,
device_properties = sh_commands.adb_getprop_by_serialno(serialno) args=FLAGS.args,
globals()[FLAGS.stdout_processor](stdouts, device_properties, target_abi) opencl_profiling=1,
vlog_level=0,
device_bin_path="/data/local/tmp/mace",
out_of_range_check=1)
device_properties = sh_commands.adb_getprop_by_serialno(
serialno)
globals()[FLAGS.stdout_processor](stdouts, device_properties,
target_abi)
if __name__ == "__main__": if __name__ == "__main__":
FLAGS, unparsed = parse_args() FLAGS, unparsed = parse_args()
main(unused_args=[sys.argv[0]] + unparsed) main(unused_args=[sys.argv[0]] + unparsed)
#-*- coding:utf8 -*- import json
import socket
import itertools
import json, socket, itertools
class FalconCli(object): class FalconCli(object):
def __init__(self, addr, debug=True, buf_size=1000): def __init__(self, addr, debug=True, buf_size=1000):
self.socket_ = socket.create_connection(addr) self.socket_ = socket.create_connection(addr)
self.stream = self.socket_.makefile() self.stream = self.socket_.makefile()
...@@ -16,16 +16,19 @@ class FalconCli(object): ...@@ -16,16 +16,19 @@ class FalconCli(object):
self.stream.close() self.stream.close()
@classmethod @classmethod
def connect(cls, server="transfer.falcon.miliao.srv", port=8433, debug=True, buf_size=1000): def connect(cls,
server="transfer.falcon.miliao.srv",
port=8433,
debug=True,
buf_size=1000):
try: try:
return FalconCli((server, port), debug, buf_size) return FalconCli((server, port), debug, buf_size)
except socket.error, exc: except socket.error, exc:
print "error: connect to %s:%s error: %s" %(server, port, exc) print "error: connect to %s:%s error: %s" % (server, port, exc)
def call(self, name, *params): def call(self, name, *params):
request = dict(id=next(self.id_counter), request = dict(
params=list(params), id=next(self.id_counter), params=list(params), method=name)
method=name)
payload = json.dumps(request).encode() payload = json.dumps(request).encode()
if self.debug: if self.debug:
print "--> req:", payload print "--> req:", payload
...@@ -49,7 +52,7 @@ class FalconCli(object): ...@@ -49,7 +52,7 @@ class FalconCli(object):
resp = [] resp = []
while True: while True:
buf = lines[s:s+self.buf_size] buf = lines[s:s + self.buf_size]
s = s + self.buf_size s = s + self.buf_size
if len(buf) == 0: if len(buf) == 0:
break break
...@@ -57,4 +60,3 @@ class FalconCli(object): ...@@ -57,4 +60,3 @@ class FalconCli(object):
resp.append(r) resp.append(r)
return resp return resp
...@@ -11,45 +11,40 @@ import re ...@@ -11,45 +11,40 @@ import re
# --input_file input_file # --input_file input_file
# #
def generate_data(name, shape): def generate_data(name, shape):
np.random.seed() np.random.seed()
data = np.random.random(shape) * 2 - 1 data = np.random.random(shape) * 2 - 1
input_file_name = FLAGS.input_file + "_" + re.sub('[^0-9a-zA-Z]+', '_', name) input_file_name = FLAGS.input_file + "_" + re.sub('[^0-9a-zA-Z]+', '_',
print 'Generate input file: ', input_file_name name)
data.astype(np.float32).tofile(input_file_name) print 'Generate input file: ', input_file_name
data.astype(np.float32).tofile(input_file_name)
def main(unused_args): def main(unused_args):
input_names = [name for name in FLAGS.input_node.split(',')] input_names = [name for name in FLAGS.input_node.split(',')]
input_shapes = [shape for shape in FLAGS.input_shape.split(':')] input_shapes = [shape for shape in FLAGS.input_shape.split(':')]
assert len(input_names) == len(input_shapes) assert len(input_names) == len(input_shapes)
for i in range(len(input_names)): for i in range(len(input_names)):
shape = [int(x) for x in input_shapes[i].split(',')] shape = [int(x) for x in input_shapes[i].split(',')]
generate_data(input_names[i], shape) generate_data(input_names[i], shape)
print "Generate input file done." print "Generate input file done."
def parse_args(): def parse_args():
"""Parses command line arguments.""" """Parses command line arguments."""
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.register("type", "bool", lambda v: v.lower() == "true") parser.register("type", "bool", lambda v: v.lower() == "true")
parser.add_argument( parser.add_argument(
"--input_file", "--input_file", type=str, default="", help="input file.")
type=str, parser.add_argument(
default="", "--input_node", type=str, default="input_node", help="input node")
help="input file.") parser.add_argument(
parser.add_argument( "--input_shape", type=str, default="1,64,64,3", help="input shape.")
"--input_node",
type=str, return parser.parse_known_args()
default="input_node",
help="input node")
parser.add_argument(
"--input_shape",
type=str,
default="1,64,64,3",
help="input shape.")
return parser.parse_known_args()
if __name__ == '__main__':
FLAGS, unparsed = parse_args()
main(unused_args=[sys.argv[0]] + unparsed)
if __name__ == '__main__':
FLAGS, unparsed = parse_args()
main(unused_args=[sys.argv[0]] + unparsed)
...@@ -23,124 +23,135 @@ from ConfigParser import ConfigParser ...@@ -23,124 +23,135 @@ from ConfigParser import ConfigParser
def run_command(command): def run_command(command):
print("Run command: {}".format(command)) print("Run command: {}".format(command))
result = subprocess.Popen( result = subprocess.Popen(
command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
out, err = result.communicate() out, err = result.communicate()
if out: if out:
print("Stdout msg:\n{}".format(out)) print("Stdout msg:\n{}".format(out))
if err: if err:
print("Stderr msg:\n{}".format(err)) print("Stderr msg:\n{}".format(err))
if result.returncode != 0: if result.returncode != 0:
raise Exception("Exit not 0 from bash with code: {}, command: {}".format( raise Exception(
result.returncode, command)) "Exit not 0 from bash with code: {}, command: {}".format(
result.returncode, command))
def get_global_runtime(configs): def get_global_runtime(configs):
runtime_list = [] runtime_list = []
for model_name in configs["models"]: for model_name in configs["models"]:
model_runtime = configs["models"][model_name]["runtime"] model_runtime = configs["models"][model_name]["runtime"]
runtime_list.append(model_runtime.lower()) runtime_list.append(model_runtime.lower())
global_runtime = "" global_runtime = ""
if "dsp" in runtime_list: if "dsp" in runtime_list:
global_runtime = "dsp" global_runtime = "dsp"
elif "gpu" in runtime_list: elif "gpu" in runtime_list:
global_runtime = "gpu" global_runtime = "gpu"
elif "cpu" in runtime_list: elif "cpu" in runtime_list:
global_runtime = "cpu" global_runtime = "cpu"
elif "neon" in runtime_list: elif "neon" in runtime_list:
global_runtime = "neon" global_runtime = "neon"
else: else:
raise Exception("Not found available RUNTIME in config files!") raise Exception("Not found available RUNTIME in config files!")
return global_runtime return global_runtime
def generate_version_code(): def generate_version_code():
command = "bash tools/generate_version_code.sh" command = "bash tools/generate_version_code.sh"
run_command(command) run_command(command)
def generate_opencl_source_code(): def generate_opencl_source_code():
command = "bash tools/generate_opencl_code.sh source" command = "bash tools/generate_opencl_code.sh source"
run_command(command) run_command(command)
def generate_opencl_binay_code(target_soc, model_output_dirs, pull_or_not): def generate_opencl_binay_code(target_soc, model_output_dirs, pull_or_not):
cl_bin_dirs = [] cl_bin_dirs = []
for d in model_output_dirs: for d in model_output_dirs:
cl_bin_dirs.append(os.path.join(d, "opencl_bin")) cl_bin_dirs.append(os.path.join(d, "opencl_bin"))
cl_bin_dirs_str = ",".join(cl_bin_dirs) cl_bin_dirs_str = ",".join(cl_bin_dirs)
if not cl_bin_dirs: if not cl_bin_dirs:
command = "bash tools/generate_opencl_code.sh binary" command = "bash tools/generate_opencl_code.sh binary"
else: else:
command = "bash tools/generate_opencl_code.sh {} {} {} {}".format( command = "bash tools/generate_opencl_code.sh {} {} {} {}".format(
'binary', target_soc, cl_bin_dirs_str, int(pull_or_not)) 'binary', target_soc, cl_bin_dirs_str, int(pull_or_not))
run_command(command) run_command(command)
def generate_tuning_param_code(target_soc, model_output_dirs, pull_or_not): def generate_tuning_param_code(target_soc, model_output_dirs, pull_or_not):
cl_bin_dirs = [] cl_bin_dirs = []
for d in model_output_dirs: for d in model_output_dirs:
cl_bin_dirs.append(os.path.join(d, "opencl_bin")) cl_bin_dirs.append(os.path.join(d, "opencl_bin"))
cl_bin_dirs_str = ",".join(cl_bin_dirs) cl_bin_dirs_str = ",".join(cl_bin_dirs)
if not cl_bin_dirs: if not cl_bin_dirs:
command = "bash tools/generate_tuning_param_code.sh" command = "bash tools/generate_tuning_param_code.sh"
else: else:
command = "bash tools/generate_tuning_param_code.sh {} {} {}".format( command = "bash tools/generate_tuning_param_code.sh {} {} {}".format(
target_soc, cl_bin_dirs_str, int(pull_or_not)) target_soc, cl_bin_dirs_str, int(pull_or_not))
run_command(command) run_command(command)
def generate_code(target_soc, model_output_dirs, pull_or_not): def generate_code(target_soc, model_output_dirs, pull_or_not):
generate_opencl_binay_code(target_soc, model_output_dirs, pull_or_not) generate_opencl_binay_code(target_soc, model_output_dirs, pull_or_not)
generate_tuning_param_code(target_soc, model_output_dirs, pull_or_not) generate_tuning_param_code(target_soc, model_output_dirs, pull_or_not)
def clear_env(target_soc): def clear_env(target_soc):
command = "bash tools/clear_env.sh {}".format(target_soc) command = "bash tools/clear_env.sh {}".format(target_soc)
run_command(command) run_command(command)
def input_file_name(input_name): def input_file_name(input_name):
return os.environ['INPUT_FILE_NAME'] + '_' + \ return os.environ['INPUT_FILE_NAME'] + '_' + \
re.sub('[^0-9a-zA-Z]+', '_', input_name) re.sub('[^0-9a-zA-Z]+', '_', input_name)
def generate_random_input(target_soc, model_output_dir,
input_names, input_files): def generate_random_input(target_soc, model_output_dir, input_names,
generate_data_or_not = True input_files):
command = "bash tools/validate_tools.sh {} {} {}".format( generate_data_or_not = True
target_soc, model_output_dir, int(generate_data_or_not)) command = "bash tools/validate_tools.sh {} {} {}".format(
run_command(command) target_soc, model_output_dir, int(generate_data_or_not))
run_command(command)
input_file_list = []
if isinstance(input_files, list): input_file_list = []
input_file_list.extend(input_files) if isinstance(input_files, list):
else: input_file_list.extend(input_files)
input_file_list.append(input_files)
if len(input_file_list) != 0:
input_name_list = []
if isinstance(input_names, list):
input_name_list.extend(input_names)
else: else:
input_name_list.append(input_names) input_file_list.append(input_files)
if len(input_file_list) != len(input_name_list): if len(input_file_list) != 0:
raise Exception('If input_files set, the input files should match the input names.') input_name_list = []
for i in range(len(input_file_list)): if isinstance(input_names, list):
if input_file_list[i] is not None: input_name_list.extend(input_names)
dst_input_file = model_output_dir + '/' + input_file_name(input_name_list[i])
if input_file_list[i].startswith("http://") or \
input_file_list[i].startswith("https://"):
urllib.urlretrieve(input_file_list[i], dst_input_file)
else: else:
shutil.copy(input_file_list[i], dst_input_file) input_name_list.append(input_names)
if len(input_file_list) != len(input_name_list):
raise Exception('If input_files set, the input files should '
'match the input names.')
for i in range(len(input_file_list)):
if input_file_list[i] is not None:
dst_input_file = model_output_dir + '/' + input_file_name(
input_name_list[i])
if input_file_list[i].startswith("http://") or \
input_file_list[i].startswith("https://"):
urllib.urlretrieve(input_file_list[i], dst_input_file)
else:
shutil.copy(input_file_list[i], dst_input_file)
def generate_model_code(): def generate_model_code():
command = "bash tools/generate_model_code.sh" command = "bash tools/generate_model_code.sh"
run_command(command) run_command(command)
def build_mace_run(production_mode, model_output_dir, hexagon_mode): def build_mace_run(production_mode, model_output_dir, hexagon_mode):
command = "bash tools/build_mace_run.sh {} {} {}".format( command = "bash tools/build_mace_run.sh {} {} {}".format(
int(production_mode), model_output_dir, int(hexagon_mode)) int(production_mode), model_output_dir, int(hexagon_mode))
run_command(command) run_command(command)
def tuning_run(model_name, def tuning_run(model_name,
...@@ -152,301 +163,328 @@ def tuning_run(model_name, ...@@ -152,301 +163,328 @@ def tuning_run(model_name,
tuning, tuning,
restart_round, restart_round,
option_args=''): option_args=''):
# TODO(yejianwu) refactoring the hackish code # TODO(yejianwu) refactoring the hackish code
stdout_buff = [] stdout_buff = []
process_output = sh_commands.make_output_processor(stdout_buff) process_output = sh_commands.make_output_processor(stdout_buff)
p = sh.bash("tools/tuning_run.sh", target_soc, model_output_dir, p = sh.bash(
running_round, int(tuning), "tools/tuning_run.sh",
restart_round, option_args, _out=process_output, target_soc,
_bg=True, _err_to_out=True) model_output_dir,
p.wait() running_round,
metrics = {} int(tuning),
for line in stdout_buff: restart_round,
line = line.strip() option_args,
parts = line.split() _out=process_output,
if len(parts) == 6 and parts[0].startswith("time"): _bg=True,
metrics["%s.create_net_ms" % model_name] = str(float(parts[1])) _err_to_out=True)
metrics["%s.mace_engine_ctor_ms" % model_name] = str(float(parts[2])) p.wait()
metrics["%s.init_ms" % model_name] = str(float(parts[3])) metrics = {}
metrics["%s.warmup_ms" % model_name] = str(float(parts[4])) for line in stdout_buff:
if float(parts[5]) > 0: line = line.strip()
metrics["%s.avg_latency_ms" % model_name] = str(float(parts[5])) parts = line.split()
tags = {"ro.board.platform": target_soc, if len(parts) == 6 and parts[0].startswith("time"):
"abi": target_abi, metrics["%s.create_net_ms" % model_name] = str(float(parts[1]))
# "runtime": target_runtime, # TODO(yejianwu) Add the actual runtime metrics["%s.mace_engine_ctor_ms" % model_name] = str(
"round": running_round, # TODO(yejianwu) change this to source/binary float(parts[2]))
"tuning": tuning} metrics["%s.init_ms" % model_name] = str(float(parts[3]))
sh_commands.falcon_push_metrics(metrics, endpoint="mace_model_benchmark", metrics["%s.warmup_ms" % model_name] = str(float(parts[4]))
tags=tags) if float(parts[5]) > 0:
metrics["%s.avg_latency_ms" % model_name] = str(
float(parts[5]))
tags = {
"ro.board.platform": target_soc,
"abi": target_abi,
# "runtime": target_runtime, # TODO(yejianwu) Add the actual runtime
"round": running_round, # TODO(yejianwu) change this to source/binary
"tuning": tuning
}
sh_commands.falcon_push_metrics(
metrics, endpoint="mace_model_benchmark", tags=tags)
def benchmark_model(target_soc, model_output_dir, option_args=''): def benchmark_model(target_soc, model_output_dir, option_args=''):
command = "bash tools/benchmark.sh {} {} \"{}\"".format( command = "bash tools/benchmark.sh {} {} \"{}\"".format(
target_soc, model_output_dir, option_args) target_soc, model_output_dir, option_args)
run_command(command) run_command(command)
def run_model(model_name, target_runtime, target_abi, target_soc, def run_model(model_name, target_runtime, target_abi, target_soc,
model_output_dir, running_round, restart_round, option_args): model_output_dir, running_round, restart_round, option_args):
tuning_run(model_name, target_runtime, target_abi, target_soc, tuning_run(model_name, target_runtime, target_abi, target_soc,
model_output_dir, running_round, False, model_output_dir, running_round, False, restart_round,
restart_round, option_args) option_args)
def generate_production_code(target_soc, model_output_dirs, pull_or_not): def generate_production_code(target_soc, model_output_dirs, pull_or_not):
cl_bin_dirs = [] cl_bin_dirs = []
for d in model_output_dirs: for d in model_output_dirs:
cl_bin_dirs.append(os.path.join(d, "opencl_bin")) cl_bin_dirs.append(os.path.join(d, "opencl_bin"))
cl_bin_dirs_str = ",".join(cl_bin_dirs) cl_bin_dirs_str = ",".join(cl_bin_dirs)
command = "bash tools/generate_production_code.sh {} {} {}".format( command = "bash tools/generate_production_code.sh {} {} {}".format(
target_soc, cl_bin_dirs_str, int(pull_or_not)) target_soc, cl_bin_dirs_str, int(pull_or_not))
run_command(command) run_command(command)
def build_mace_run_prod(model_name, target_runtime, target_abi, target_soc, def build_mace_run_prod(model_name, target_runtime, target_abi, target_soc,
model_output_dir, tuning): model_output_dir, tuning):
if "dsp" == target_runtime: if "dsp" == target_runtime:
hexagon_mode = True hexagon_mode = True
else: else:
hexagon_mode = False hexagon_mode = False
generate_code(target_soc, [], False) generate_code(target_soc, [], False)
production_or_not = False production_or_not = False
build_mace_run(production_or_not, model_output_dir, hexagon_mode) build_mace_run(production_or_not, model_output_dir, hexagon_mode)
tuning_run( tuning_run(
model_name, model_name,
target_runtime, target_runtime,
target_abi, target_abi,
target_soc, target_soc,
model_output_dir, model_output_dir,
running_round=0, running_round=0,
tuning=tuning, tuning=tuning,
restart_round=1) restart_round=1)
generate_code(target_soc, [model_output_dir], True) generate_code(target_soc, [model_output_dir], True)
production_or_not = True production_or_not = True
build_mace_run(production_or_not, model_output_dir, hexagon_mode) build_mace_run(production_or_not, model_output_dir, hexagon_mode)
def build_run_throughput_test(target_soc, run_seconds, merged_lib_file, def build_run_throughput_test(target_soc, run_seconds, merged_lib_file,
model_input_dir): model_input_dir):
command = "bash tools/build_run_throughput_test.sh {} {} {} {}".format( command = "bash tools/build_run_throughput_test.sh {} {} {} {}".format(
target_soc, run_seconds, merged_lib_file, model_input_dir) target_soc, run_seconds, merged_lib_file, model_input_dir)
run_command(command) run_command(command)
def validate_model(target_soc, model_output_dir): def validate_model(target_soc, model_output_dir):
generate_data_or_not = False generate_data_or_not = False
command = "bash tools/validate_tools.sh {} {} {}".format( command = "bash tools/validate_tools.sh {} {} {}".format(
target_soc, model_output_dir, int(generate_data_or_not)) target_soc, model_output_dir, int(generate_data_or_not))
run_command(command) run_command(command)
def build_production_code(): def build_production_code():
command = "bash tools/build_production_code.sh" command = "bash tools/build_production_code.sh"
run_command(command) run_command(command)
def merge_libs_and_tuning_results(target_soc, output_dir, model_output_dirs): def merge_libs_and_tuning_results(target_soc, output_dir, model_output_dirs):
generate_code(target_soc, model_output_dirs, False) generate_code(target_soc, model_output_dirs, False)
build_production_code() build_production_code()
model_output_dirs_str = ",".join(model_output_dirs) model_output_dirs_str = ",".join(model_output_dirs)
command = "bash tools/merge_libs.sh {} {} {}".format(target_soc, output_dir, command = "bash tools/merge_libs.sh {} {} {}".format(
model_output_dirs_str) target_soc, output_dir, model_output_dirs_str)
run_command(command) run_command(command)
def packaging_lib_file(output_dir): def packaging_lib_file(output_dir):
command = "bash tools/packaging_lib.sh {}".format(output_dir) command = "bash tools/packaging_lib.sh {}".format(output_dir)
run_command(command) run_command(command)
def download_model_files(model_file_path, def download_model_files(model_file_path,
model_output_dir, model_output_dir,
weight_file_path=""): weight_file_path=""):
if model_file_path.startswith("http://") or \ if model_file_path.startswith("http://") or \
model_file_path.startswith("https://"): model_file_path.startswith("https://"):
os.environ["MODEL_FILE_PATH"] = model_output_dir + "/model.pb" os.environ["MODEL_FILE_PATH"] = model_output_dir + "/model.pb"
urllib.urlretrieve(model_file_path, os.environ["MODEL_FILE_PATH"]) urllib.urlretrieve(model_file_path, os.environ["MODEL_FILE_PATH"])
if weight_file_path.startswith("http://") or \ if weight_file_path.startswith("http://") or \
weight_file_path.startswith("https://"): weight_file_path.startswith("https://"):
os.environ[ os.environ["WEIGHT_FILE_PATH"] = model_output_dir + "/model.caffemodel"
"WEIGHT_FILE_PATH"] = model_output_dir + "/model.caffemodel" urllib.urlretrieve(weight_file_path, os.environ["WEIGHT_FILE_PATH"])
urllib.urlretrieve(weight_file_path,
os.environ["WEIGHT_FILE_PATH"])
def md5sum(str): def md5sum(str):
md5 = hashlib.md5() md5 = hashlib.md5()
md5.update(str) md5.update(str)
return md5.hexdigest() return md5.hexdigest()
def parse_model_configs(): def parse_model_configs():
with open(FLAGS.config) as f: with open(FLAGS.config) as f:
configs = yaml.load(f) configs = yaml.load(f)
return configs return configs
def parse_args(): def parse_args():
"""Parses command line arguments.""" """Parses command line arguments."""
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.register("type", "bool", lambda v: v.lower() == "true") parser.register("type", "bool", lambda v: v.lower() == "true")
parser.add_argument( parser.add_argument(
"--config", "--config",
type=str, type=str,
default="./tool/config", default="./tool/config",
help="The global config file of models.") help="The global config file of models.")
parser.add_argument( parser.add_argument(
"--output_dir", type=str, default="build", help="The output dir.") "--output_dir", type=str, default="build", help="The output dir.")
parser.add_argument( parser.add_argument(
"--round", type=int, default=1, help="The model running round.") "--round", type=int, default=1, help="The model running round.")
parser.add_argument( parser.add_argument(
"--run_seconds", "--run_seconds",
type=int, type=int,
default=10, default=10,
help="The model throughput test running seconds.") help="The model throughput test running seconds.")
parser.add_argument( parser.add_argument(
"--restart_round", type=int, default=1, help="The model restart round.") "--restart_round",
parser.add_argument( type=int,
"--tuning", type="bool", default="true", help="Tune opencl params.") default=1,
parser.add_argument( help="The model restart round.")
"--mode", parser.add_argument(
type=str, "--tuning", type="bool", default="true", help="Tune opencl params.")
default="all", parser.add_argument(
help="[build|run|validate|merge|all|throughput_test].") "--mode",
parser.add_argument( type=str,
"--target_socs", default="all",
type=str, help="[build|run|validate|merge|all|throughput_test].")
default="all", parser.add_argument(
help="SoCs to build, comma seperated list (getprop ro.board.platform)") "--target_socs",
return parser.parse_known_args() type=str,
default="all",
help="SoCs to build, comma seperated list (getprop ro.board.platform)")
return parser.parse_known_args()
def set_environment(configs): def set_environment(configs):
os.environ["EMBED_MODEL_DATA"] = str(configs["embed_model_data"]) os.environ["EMBED_MODEL_DATA"] = str(configs["embed_model_data"])
os.environ["VLOG_LEVEL"] = str(configs["vlog_level"]) os.environ["VLOG_LEVEL"] = str(configs["vlog_level"])
os.environ["PROJECT_NAME"] = os.path.splitext(os.path.basename( os.environ["PROJECT_NAME"] = os.path.splitext(
FLAGS.config))[0] os.path.basename(FLAGS.config))[0]
os.environ['INPUT_FILE_NAME'] = "model_input" os.environ['INPUT_FILE_NAME'] = "model_input"
os.environ['OUTPUT_FILE_NAME'] = "model_out" os.environ['OUTPUT_FILE_NAME'] = "model_out"
def main(unused_args): def main(unused_args):
configs = parse_model_configs() configs = parse_model_configs()
if FLAGS.mode == "validate": if FLAGS.mode == "validate":
FLAGS.round = 1 FLAGS.round = 1
FLAGS.restart_round = 1 FLAGS.restart_round = 1
set_environment(configs) set_environment(configs)
if FLAGS.mode == "build" or FLAGS.mode == "all": if FLAGS.mode == "build" or FLAGS.mode == "all":
# Remove previous output dirs # Remove previous output dirs
if not os.path.exists(FLAGS.output_dir): if not os.path.exists(FLAGS.output_dir):
os.makedirs(FLAGS.output_dir) os.makedirs(FLAGS.output_dir)
elif os.path.exists(os.path.join(FLAGS.output_dir, "libmace")): elif os.path.exists(os.path.join(FLAGS.output_dir, "libmace")):
shutil.rmtree(os.path.join(FLAGS.output_dir, os.environ["PROJECT_NAME"])) shutil.rmtree(
os.makedirs(os.path.join(FLAGS.output_dir, os.environ["PROJECT_NAME"])) os.path.join(FLAGS.output_dir, os.environ["PROJECT_NAME"]))
os.makedirs(
generate_version_code() os.path.join(FLAGS.output_dir, os.environ["PROJECT_NAME"]))
generate_opencl_source_code()
generate_version_code()
option_args = ' '.join([arg for arg in unused_args if arg.startswith('--')]) generate_opencl_source_code()
available_socs = sh_commands.adb_get_all_socs() option_args = ' '.join(
target_socs = available_socs [arg for arg in unused_args if arg.startswith('--')])
if hasattr(configs, "target_socs"):
target_socs = set(configs["target_socs"]) available_socs = sh_commands.adb_get_all_socs()
target_socs = target_socs & available_socs target_socs = available_socs
if hasattr(configs, "target_socs"):
if FLAGS.target_socs != "all": target_socs = set(configs["target_socs"])
socs = set(FLAGS.target_socs.split(',')) target_socs = target_socs & available_socs
target_socs = target_socs & socs
missing_socs = socs.difference(target_socs) if FLAGS.target_socs != "all":
if len(missing_socs) > 0: socs = set(FLAGS.target_socs.split(','))
print("Error: devices with SoCs are not connected %s" % missing_socs) target_socs = target_socs & socs
exit(1) missing_socs = socs.difference(target_socs)
if len(missing_socs) > 0:
print(
for target_soc in target_socs: "Error: devices with SoCs are not connected %s" % missing_socs)
for target_abi in configs["target_abis"]: exit(1)
global_runtime = get_global_runtime(configs)
# Transfer params by environment for target_soc in target_socs:
os.environ["TARGET_ABI"] = target_abi for target_abi in configs["target_abis"]:
model_output_dirs = [] global_runtime = get_global_runtime(configs)
for model_name in configs["models"]: # Transfer params by environment
print '=======================', model_name, '=======================' os.environ["TARGET_ABI"] = target_abi
# Transfer params by environment model_output_dirs = []
os.environ["MODEL_TAG"] = model_name for model_name in configs["models"]:
model_config = configs["models"][model_name] print '===================', model_name, '==================='
input_file_list = model_config.get("validation_inputs_data", []) # Transfer params by environment
for key in model_config: os.environ["MODEL_TAG"] = model_name
if key in ['input_nodes', 'output_nodes'] and isinstance( model_config = configs["models"][model_name]
model_config[key], list): input_file_list = model_config.get("validation_inputs_data",
os.environ[key.upper()] = ",".join(model_config[key]) [])
elif key in ['input_shapes', 'output_shapes'] and isinstance( for key in model_config:
model_config[key], list): if key in ['input_nodes', 'output_nodes'] and isinstance(
os.environ[key.upper()] = ":".join(model_config[key]) model_config[key], list):
else: os.environ[key.upper()] = ",".join(model_config[key])
os.environ[key.upper()] = str(model_config[key]) elif key in ['input_shapes', 'output_shapes'
] and isinstance(model_config[key], list):
# Create model build directory os.environ[key.upper()] = ":".join(model_config[key])
model_path_digest = md5sum(model_config["model_file_path"]) else:
model_output_dir = "%s/%s/%s/%s/%s/%s/%s" % (FLAGS.output_dir, os.environ[key.upper()] = str(model_config[key])
os.environ["PROJECT_NAME"],
"build", model_name, # Create model build directory
model_path_digest, model_path_digest = md5sum(model_config["model_file_path"])
target_soc, target_abi) model_output_dir = "%s/%s/%s/%s/%s/%s/%s" % (
model_output_dirs.append(model_output_dir) FLAGS.output_dir, os.environ["PROJECT_NAME"], "build",
model_name, model_path_digest, target_soc, target_abi)
if FLAGS.mode == "build" or FLAGS.mode == "all": model_output_dirs.append(model_output_dir)
if os.path.exists(model_output_dir):
shutil.rmtree(model_output_dir) if FLAGS.mode == "build" or FLAGS.mode == "all":
os.makedirs(model_output_dir) if os.path.exists(model_output_dir):
clear_env(target_soc) shutil.rmtree(model_output_dir)
os.makedirs(model_output_dir)
download_model_files(model_config["model_file_path"], clear_env(target_soc)
model_output_dir, model_config.get("weight_file_path", ""))
download_model_files(model_config["model_file_path"],
if FLAGS.mode == "build" or FLAGS.mode == "run" or FLAGS.mode == "validate"\ model_output_dir,
or FLAGS.mode == "benchmark" or FLAGS.mode == "all": model_config.get("weight_file_path", ""))
generate_random_input(target_soc, model_output_dir,
model_config['input_nodes'], input_file_list) if FLAGS.mode == "build" or FLAGS.mode == "run" or \
FLAGS.mode == "validate" or \
if FLAGS.mode == "build" or FLAGS.mode == "all": FLAGS.mode == "benchmark" or FLAGS.mode == "all":
generate_model_code() generate_random_input(target_soc, model_output_dir,
build_mace_run_prod(model_name, global_runtime, target_abi, model_config['input_nodes'],
target_soc, model_output_dir, FLAGS.tuning) input_file_list)
if FLAGS.mode == "run" or FLAGS.mode == "validate" or FLAGS.mode == "all": if FLAGS.mode == "build" or FLAGS.mode == "all":
run_model(model_name, global_runtime, target_abi, target_soc, generate_model_code()
model_output_dir, FLAGS.round, FLAGS.restart_round, build_mace_run_prod(model_name, global_runtime, target_abi,
option_args) target_soc, model_output_dir,
FLAGS.tuning)
if FLAGS.mode == "benchmark":
benchmark_model(target_soc, model_output_dir, option_args) if FLAGS.mode == "run" or FLAGS.mode == "validate" or \
FLAGS.mode == "all":
if FLAGS.mode == "validate" or FLAGS.mode == "all": run_model(model_name, global_runtime, target_abi,
validate_model(target_soc, model_output_dir) target_soc, model_output_dir, FLAGS.round,
FLAGS.restart_round, option_args)
if FLAGS.mode == "build" or FLAGS.mode == "merge" or FLAGS.mode == "all":
merge_libs_and_tuning_results( if FLAGS.mode == "benchmark":
target_soc, FLAGS.output_dir + "/" + os.environ["PROJECT_NAME"], benchmark_model(target_soc, model_output_dir, option_args)
model_output_dirs)
if FLAGS.mode == "validate" or FLAGS.mode == "all":
if FLAGS.mode == "throughput_test": validate_model(target_soc, model_output_dir)
merged_lib_file = FLAGS.output_dir + "/%s/%s/libmace_%s.%s.a" % \
(os.environ["PROJECT_NAME"], target_abi, os.environ["PROJECT_NAME"], target_soc) if FLAGS.mode == "build" or FLAGS.mode == "merge" or \
generate_random_input(target_soc, FLAGS.output_dir, [], []) FLAGS.mode == "all":
for model_name in configs["models"]: merge_libs_and_tuning_results(
runtime = configs["models"][model_name]["runtime"] target_soc,
os.environ["%s_MODEL_TAG" % runtime.upper()] = model_name FLAGS.output_dir + "/" + os.environ["PROJECT_NAME"],
build_run_throughput_test(target_soc, FLAGS.run_seconds, model_output_dirs)
merged_lib_file, FLAGS.output_dir)
if FLAGS.mode == "throughput_test":
if FLAGS.mode == "build" or FLAGS.mode == "all": merged_lib_file = FLAGS.output_dir + \
packaging_lib_file(FLAGS.output_dir) "/%s/%s/libmace_%s.%s.a" % \
(os.environ["PROJECT_NAME"], target_abi,
os.environ["PROJECT_NAME"], target_soc)
generate_random_input(target_soc, FLAGS.output_dir, [], [])
for model_name in configs["models"]:
runtime = configs["models"][model_name]["runtime"]
os.environ["%s_MODEL_TAG" % runtime.upper()] = model_name
build_run_throughput_test(target_soc, FLAGS.run_seconds,
merged_lib_file, FLAGS.output_dir)
if FLAGS.mode == "build" or FLAGS.mode == "all":
packaging_lib_file(FLAGS.output_dir)
if __name__ == "__main__": if __name__ == "__main__":
FLAGS, unparsed = parse_args() FLAGS, unparsed = parse_args()
main(unused_args=[sys.argv[0]] + unparsed) main(unused_args=[sys.argv[0]] + unparsed)
...@@ -3,172 +3,205 @@ import re ...@@ -3,172 +3,205 @@ import re
import time import time
import falcon_cli import falcon_cli
################################ ################################
# common # common
################################ ################################
def strip_invalid_utf8(str): def strip_invalid_utf8(str):
return sh.iconv(str, "-c", "-t", "UTF-8") return sh.iconv(str, "-c", "-t", "UTF-8")
def make_output_processor(buff): def make_output_processor(buff):
def process_output(line): def process_output(line):
print(line.strip()) print(line.strip())
buff.append(line) buff.append(line)
return process_output
return process_output
################################ ################################
# adb commands # adb commands
################################ ################################
def adb_split_stdout(stdout_str): def adb_split_stdout(stdout_str):
stdout_str = strip_invalid_utf8(stdout_str) stdout_str = strip_invalid_utf8(stdout_str)
# Filter out last empty line # Filter out last empty line
return [l.strip() for l in stdout_str.split('\n') if len(l.strip()) > 0] return [l.strip() for l in stdout_str.split('\n') if len(l.strip()) > 0]
def adb_devices(target_socs=None): def adb_devices(target_socs=None):
outputs = sh.grep(sh.adb("devices"), "^[A-Za-z0-9]\+[[:space:]]\+device$") outputs = sh.grep(sh.adb("devices"), "^[A-Za-z0-9]\+[[:space:]]\+device$")
raw_lists = sh.cut(outputs, "-f1") raw_lists = sh.cut(outputs, "-f1")
device_ids = adb_split_stdout(raw_lists) device_ids = adb_split_stdout(raw_lists)
if target_socs != None: if target_socs is not None:
target_socs_set = set(target_socs) target_socs_set = set(target_socs)
target_devices = [] target_devices = []
for serialno in device_ids: for serialno in device_ids:
props = adb_getprop_by_serialno(serialno) props = adb_getprop_by_serialno(serialno)
if props["ro.board.platform"] in target_socs_set: if props["ro.board.platform"] in target_socs_set:
target_devices.append(serialno) target_devices.append(serialno)
return target_devices return target_devices
else: else:
return device_ids return device_ids
def adb_getprop_by_serialno(serialno): def adb_getprop_by_serialno(serialno):
outputs = sh.adb("-s", serialno, "shell", "getprop") outputs = sh.adb("-s", serialno, "shell", "getprop")
raw_props = adb_split_stdout(outputs) raw_props = adb_split_stdout(outputs)
props = {} props = {}
p = re.compile("\[(.+)\]: \[(.+)\]") p = re.compile("\[(.+)\]: \[(.+)\]")
for raw_prop in raw_props: for raw_prop in raw_props:
m = p.match(raw_prop) m = p.match(raw_prop)
if m: if m:
props[m.group(1)] = m.group(2) props[m.group(1)] = m.group(2)
return props return props
def adb_supported_abis(serialno): def adb_supported_abis(serialno):
props = adb_getprop_by_serialno(serialno) props = adb_getprop_by_serialno(serialno)
abilist_str = props["ro.product.cpu.abilist"] abilist_str = props["ro.product.cpu.abilist"]
abis = [abi.strip() for abi in abilist_str.split(',')] abis = [abi.strip() for abi in abilist_str.split(',')]
return abis return abis
def adb_get_all_socs(): def adb_get_all_socs():
socs = [] socs = []
for d in adb_devices(): for d in adb_devices():
props = adb_getprop_by_serialno(d) props = adb_getprop_by_serialno(d)
socs.append(props["ro.board.platform"]) socs.append(props["ro.board.platform"])
return set(socs) return set(socs)
def adb_run(serialno, host_bin_path, bin_name,
def adb_run(serialno,
host_bin_path,
bin_name,
args="", args="",
opencl_profiling=1, opencl_profiling=1,
vlog_level=0, vlog_level=0,
device_bin_path="/data/local/tmp/mace", device_bin_path="/data/local/tmp/mace",
out_of_range_check=1): out_of_range_check=1):
host_bin_full_path = "%s/%s" % (host_bin_path, bin_name) host_bin_full_path = "%s/%s" % (host_bin_path, bin_name)
device_bin_full_path = "%s/%s" % (device_bin_path, bin_name) device_bin_full_path = "%s/%s" % (device_bin_path, bin_name)
props = adb_getprop_by_serialno(serialno) props = adb_getprop_by_serialno(serialno)
print("=====================================================================") print(
print("Run on device: %s, %s, %s" % (serialno, props["ro.board.platform"], "====================================================================="
props["ro.product.model"])) )
sh.adb("-s", serialno, "shell", "rm -rf %s" % device_bin_path) print("Run on device: %s, %s, %s" % (serialno, props["ro.board.platform"],
sh.adb("-s", serialno, "shell", "mkdir -p %s" % device_bin_path) props["ro.product.model"]))
print("Push %s to %s" % (host_bin_full_path, device_bin_full_path)) sh.adb("-s", serialno, "shell", "rm -rf %s" % device_bin_path)
sh.adb("-s", serialno, "push", host_bin_full_path, device_bin_full_path) sh.adb("-s", serialno, "shell", "mkdir -p %s" % device_bin_path)
print("Run %s" % device_bin_full_path) print("Push %s to %s" % (host_bin_full_path, device_bin_full_path))
stdout_buff=[] sh.adb("-s", serialno, "push", host_bin_full_path, device_bin_full_path)
process_output = make_output_processor(stdout_buff) print("Run %s" % device_bin_full_path)
p = sh.adb("-s", serialno, "shell", stdout_buff = []
"MACE_OUT_OF_RANGE_CHECK=%d MACE_OPENCL_PROFILING=%d MACE_CPP_MIN_VLOG_LEVEL=%d %s %s" % process_output = make_output_processor(stdout_buff)
(out_of_range_check, opencl_profiling, vlog_level, device_bin_full_path, args), p = sh.adb(
_out=process_output, _bg=True, _err_to_out=True) "-s",
p.wait() serialno,
return "".join(stdout_buff) "shell",
"MACE_OUT_OF_RANGE_CHECK=%d MACE_OPENCL_PROFILING=%d "
"MACE_CPP_MIN_VLOG_LEVEL=%d %s %s" %
(out_of_range_check, opencl_profiling, vlog_level,
device_bin_full_path, args),
_out=process_output,
_bg=True,
_err_to_out=True)
p.wait()
return "".join(stdout_buff)
################################ ################################
# bazel commands # bazel commands
################################ ################################
def bazel_build(target, strip="always", abi="armeabi-v7a"): def bazel_build(target, strip="always", abi="armeabi-v7a"):
print("Build %s with ABI %s" % (target, abi)) print("Build %s with ABI %s" % (target, abi))
stdout_buff=[] stdout_buff = []
process_output = make_output_processor(stdout_buff) process_output = make_output_processor(stdout_buff)
p= sh.bazel("build", p = sh.bazel(
"-c", "opt", "build",
"--strip", strip, "-c",
"--verbose_failures", "opt",
target, "--strip",
"--crosstool_top=//external:android/crosstool", strip,
"--host_crosstool_top=@bazel_tools//tools/cpp:toolchain", "--verbose_failures",
"--cpu=%s" % abi, target,
"--copt=-std=c++11", "--crosstool_top=//external:android/crosstool",
"--copt=-D_GLIBCXX_USE_C99_MATH_TR1", "--host_crosstool_top=@bazel_tools//tools/cpp:toolchain",
"--copt=-DMACE_DISABLE_NO_TUNING_WARNING", "--cpu=%s" % abi,
"--copt=-Werror=return-type", "--copt=-std=c++11",
"--copt=-O3", "--copt=-D_GLIBCXX_USE_C99_MATH_TR1",
"--define", "neon=true", "--copt=-DMACE_DISABLE_NO_TUNING_WARNING",
"--define", "openmp=true", "--copt=-Werror=return-type",
_out=process_output, _bg=True, _err_to_out=True) "--copt=-O3",
p.wait() "--define",
return "".join(stdout_buff) "neon=true",
"--define",
"openmp=true",
_out=process_output,
_bg=True,
_err_to_out=True)
p.wait()
return "".join(stdout_buff)
def bazel_target_to_bin(target): def bazel_target_to_bin(target):
# change //mace/a/b:c to bazel-bin/mace/a/b/c # change //mace/a/b:c to bazel-bin/mace/a/b/c
prefix, bin_name = target.split(':') prefix, bin_name = target.split(':')
prefix = prefix.replace('//', '/') prefix = prefix.replace('//', '/')
if prefix.startswith('/'): if prefix.startswith('/'):
prefix = prefix[1:] prefix = prefix[1:]
host_bin_path = "bazel-bin/%s" % prefix host_bin_path = "bazel-bin/%s" % prefix
return host_bin_path, bin_name return host_bin_path, bin_name
################################ ################################
# mace commands # mace commands
################################ ################################
# TODO this should be refactored # TODO this should be refactored
def gen_encrypted_opencl_source(codegen_path="mace/codegen"): def gen_encrypted_opencl_source(codegen_path="mace/codegen"):
sh.mkdir("-p", "%s/opencl" % codegen_path) sh.mkdir("-p", "%s/opencl" % codegen_path)
sh.python("mace/python/tools/encrypt_opencl_codegen.py", sh.python(
"--cl_kernel_dir=./mace/kernels/opencl/cl/", "mace/python/tools/encrypt_opencl_codegen.py",
"--output_path=%s/opencl/opencl_encrypt_program.cc" % codegen_path) "--cl_kernel_dir=./mace/kernels/opencl/cl/",
"--output_path=%s/opencl/opencl_encrypt_program.cc" % codegen_path)
def gen_mace_version(codegen_path="mace/codegen"): def gen_mace_version(codegen_path="mace/codegen"):
sh.mkdir("-p", "%s/version" % codegen_path) sh.mkdir("-p", "%s/version" % codegen_path)
sh.bash("mace/tools/git/gen_version_source.sh", sh.bash("mace/tools/git/gen_version_source.sh",
"%s/version/version.cc" % codegen_path) "%s/version/version.cc" % codegen_path)
def gen_compiled_opencl_source(codegen_path="mace/codegen"): def gen_compiled_opencl_source(codegen_path="mace/codegen"):
sh.mkdir("-p", "%s/opencl" % codegen_path) sh.mkdir("-p", "%s/opencl" % codegen_path)
sh.python("mace/python/tools/opencl_codegen.py", sh.python(
"--output_path=%s/opencl/opencl_compiled_program.cc" % codegen_path) "mace/python/tools/opencl_codegen.py",
"--output_path=%s/opencl/opencl_compiled_program.cc" % codegen_path)
################################ ################################
# falcon # falcon
################################ ################################
def falcon_tags(tags_dict): def falcon_tags(tags_dict):
tags = "" tags = ""
for k, v in tags_dict.iteritems(): for k, v in tags_dict.iteritems():
if tags == "": if tags == "":
tags = "%s=%s" % (k, v) tags = "%s=%s" % (k, v)
else: else:
tags = tags + ",%s=%s" % (k, v) tags = tags + ",%s=%s" % (k, v)
return tags return tags
def falcon_push_metrics(metrics, endpoint="mace_dev", tags={}):
cli = falcon_cli.FalconCli.connect(server="transfer.falcon.miliao.srv",
port=8433,
debug=False)
ts = int(time.time())
falcon_metrics = [{
"endpoint": endpoint,
"metric": key,
"tags": falcon_tags(tags),
"timestamp": ts,
"value": value,
"step": 86400,
"counterType": "GAUGE"
} for key, value in metrics.iteritems()]
cli.update(falcon_metrics)
def falcon_push_metrics(metrics, endpoint="mace_dev", tags={}):
cli = falcon_cli.FalconCli.connect(
server="transfer.falcon.miliao.srv", port=8433, debug=False)
ts = int(time.time())
falcon_metrics = [{
"endpoint": endpoint,
"metric": key,
"tags": falcon_tags(tags),
"timestamp": ts,
"value": value,
"step": 86400,
"counterType": "GAUGE"
} for key, value in metrics.iteritems()]
cli.update(falcon_metrics)
...@@ -20,175 +20,172 @@ from scipy import stats ...@@ -20,175 +20,172 @@ from scipy import stats
# --input_shape 1,64,64,3 \ # --input_shape 1,64,64,3 \
# --output_shape 1,64,64,2 # --output_shape 1,64,64,2
def load_data(file): def load_data(file):
if os.path.isfile(file): if os.path.isfile(file):
return np.fromfile(file=file, dtype=np.float32) return np.fromfile(file=file, dtype=np.float32)
else: else:
return np.empty([0]) return np.empty([0])
def format_output_name(name): def format_output_name(name):
return re.sub('[^0-9a-zA-Z]+', '_', name) return re.sub('[^0-9a-zA-Z]+', '_', name)
def compare_output(output_name, mace_out_value, out_value): def compare_output(output_name, mace_out_value, out_value):
if mace_out_value.size != 0: if mace_out_value.size != 0:
out_value = out_value.reshape(-1) out_value = out_value.reshape(-1)
mace_out_value = mace_out_value.reshape(-1) mace_out_value = mace_out_value.reshape(-1)
assert len(out_value) == len(mace_out_value) assert len(out_value) == len(mace_out_value)
similarity = (1 - spatial.distance.cosine(out_value, mace_out_value)) similarity = (1 - spatial.distance.cosine(out_value, mace_out_value))
print output_name, 'MACE VS', FLAGS.platform.upper(), 'similarity: ', similarity print output_name, 'MACE VS', FLAGS.platform.upper(
if (FLAGS.mace_runtime == "cpu" and similarity > 0.999) or \ ), 'similarity: ', similarity
(FLAGS.mace_runtime == "neon" and similarity > 0.999) or \ if (FLAGS.mace_runtime == "cpu" and similarity > 0.999) or \
(FLAGS.mace_runtime == "gpu" and similarity > 0.995) or \ (FLAGS.mace_runtime == "neon" and similarity > 0.999) or \
(FLAGS.mace_runtime == "dsp" and similarity > 0.930): (FLAGS.mace_runtime == "gpu" and similarity > 0.995) or \
print '=======================Similarity Test Passed======================' (FLAGS.mace_runtime == "dsp" and similarity > 0.930):
print '===================Similarity Test Passed=================='
else:
print '===================Similarity Test Failed=================='
sys.exit(-1)
else: else:
print '=======================Similarity Test Failed======================' print '=======================Skip empty node==================='
sys.exit(-1) sys.exit(-1)
else:
print '=======================Skip empty node==================='
sys.exit(-1)
def validate_tf_model(input_names, input_shapes, output_names): def validate_tf_model(input_names, input_shapes, output_names):
import tensorflow as tf import tensorflow as tf
if not os.path.isfile(FLAGS.model_file): if not os.path.isfile(FLAGS.model_file):
print("Input graph file '" + FLAGS.model_file + "' does not exist!") print("Input graph file '" + FLAGS.model_file + "' does not exist!")
sys.exit(-1) sys.exit(-1)
input_graph_def = tf.GraphDef() input_graph_def = tf.GraphDef()
with open(FLAGS.model_file, "rb") as f: with open(FLAGS.model_file, "rb") as f:
data = f.read() data = f.read()
input_graph_def.ParseFromString(data) input_graph_def.ParseFromString(data)
tf.import_graph_def(input_graph_def, name="")
with tf.Session() as session:
with session.graph.as_default() as graph:
tf.import_graph_def(input_graph_def, name="") tf.import_graph_def(input_graph_def, name="")
input_dict = {}
for i in range(len(input_names)): with tf.Session() as session:
input_value = load_data(FLAGS.input_file + "_" + input_names[i]) with session.graph.as_default() as graph:
input_value = input_value.reshape(input_shapes[i]) tf.import_graph_def(input_graph_def, name="")
input_node = graph.get_tensor_by_name(input_names[i] + ':0') input_dict = {}
input_dict[input_node] = input_value for i in range(len(input_names)):
input_value = load_data(
output_nodes = [] FLAGS.input_file + "_" + input_names[i])
for name in output_names: input_value = input_value.reshape(input_shapes[i])
output_nodes.extend([graph.get_tensor_by_name(name + ':0')]) input_node = graph.get_tensor_by_name(
output_values = session.run(output_nodes, feed_dict=input_dict) input_names[i] + ':0')
for i in range(len(output_names)): input_dict[input_node] = input_value
output_file_name = FLAGS.mace_out_file + "_" + format_output_name(output_names[i])
mace_out_value = load_data(output_file_name) output_nodes = []
compare_output(output_names[i], mace_out_value, output_values[i]) for name in output_names:
output_nodes.extend(
def validate_caffe_model(input_names, input_shapes, output_names, output_shapes): [graph.get_tensor_by_name(name + ':0')])
os.environ['GLOG_minloglevel'] = '1' # suprress Caffe verbose prints output_values = session.run(output_nodes, feed_dict=input_dict)
import caffe for i in range(len(output_names)):
if not os.path.isfile(FLAGS.model_file): output_file_name = FLAGS.mace_out_file + "_" + \
print("Input graph file '" + FLAGS.model_file + "' does not exist!") format_output_name(output_names[i])
sys.exit(-1) mace_out_value = load_data(output_file_name)
if not os.path.isfile(FLAGS.weight_file): compare_output(output_names[i], mace_out_value,
print("Input weight file '" + FLAGS.weight_file + "' does not exist!") output_values[i])
sys.exit(-1)
caffe.set_mode_cpu() def validate_caffe_model(input_names, input_shapes, output_names,
output_shapes):
net = caffe.Net(FLAGS.model_file, caffe.TEST, weights=FLAGS.weight_file) os.environ['GLOG_minloglevel'] = '1' # suprress Caffe verbose prints
import caffe
for i in range(len(input_names)): if not os.path.isfile(FLAGS.model_file):
input_value = load_data(FLAGS.input_file + "_" + input_names[i]) print("Input graph file '" + FLAGS.model_file + "' does not exist!")
input_value = input_value.reshape(input_shapes[i]).transpose((0, 3, 1, 2)) sys.exit(-1)
input_blob_name = input_names[i] if not os.path.isfile(FLAGS.weight_file):
try: print("Input weight file '" + FLAGS.weight_file + "' does not exist!")
if input_names[i] in net.top_names: sys.exit(-1)
input_blob_name = net.top_names[input_names[i]][0]
except ValueError: caffe.set_mode_cpu()
pass
net.blobs[input_blob_name].data[0] = input_value net = caffe.Net(FLAGS.model_file, caffe.TEST, weights=FLAGS.weight_file)
net.forward() for i in range(len(input_names)):
input_value = load_data(FLAGS.input_file + "_" + input_names[i])
for i in range(len(output_names)): input_value = input_value.reshape(input_shapes[i]).transpose((0, 3, 1,
value = net.blobs[net.top_names[output_names[i]][0]].data 2))
out_shape = output_shapes[i] input_blob_name = input_names[i]
out_shape[1], out_shape[2], out_shape[3] = out_shape[3], out_shape[1], out_shape[2] try:
value = value.reshape(out_shape).transpose((0, 2, 3, 1)) if input_names[i] in net.top_names:
output_file_name = FLAGS.mace_out_file + "_" + format_output_name(output_names[i]) input_blob_name = net.top_names[input_names[i]][0]
mace_out_value = load_data(output_file_name) except ValueError:
compare_output(output_names[i], mace_out_value, value) pass
net.blobs[input_blob_name].data[0] = input_value
net.forward()
for i in range(len(output_names)):
value = net.blobs[net.top_names[output_names[i]][0]].data
out_shape = output_shapes[i]
out_shape[1], out_shape[2], out_shape[3] = out_shape[3], out_shape[
1], out_shape[2]
value = value.reshape(out_shape).transpose((0, 2, 3, 1))
output_file_name = FLAGS.mace_out_file + "_" + format_output_name(
output_names[i])
mace_out_value = load_data(output_file_name)
compare_output(output_names[i], mace_out_value, value)
def main(unused_args): def main(unused_args):
input_names = [name for name in FLAGS.input_node.split(',')] input_names = [name for name in FLAGS.input_node.split(',')]
input_shape_strs = [shape for shape in FLAGS.input_shape.split(':')] input_shape_strs = [shape for shape in FLAGS.input_shape.split(':')]
input_shapes = [[int(x) for x in shape.split(',')] for shape in input_shape_strs] input_shapes = [[int(x) for x in shape.split(',')]
output_names = [name for name in FLAGS.output_node.split(',')] for shape in input_shape_strs]
assert len(input_names) == len(input_shapes) output_names = [name for name in FLAGS.output_node.split(',')]
assert len(input_names) == len(input_shapes)
if FLAGS.platform == 'tensorflow':
validate_tf_model(input_names, input_shapes, output_names) if FLAGS.platform == 'tensorflow':
elif FLAGS.platform == 'caffe': validate_tf_model(input_names, input_shapes, output_names)
output_shape_strs = [shape for shape in FLAGS.output_shape.split(':')] elif FLAGS.platform == 'caffe':
output_shapes = [[int(x) for x in shape.split(',')] for shape in output_shape_strs] output_shape_strs = [shape for shape in FLAGS.output_shape.split(':')]
validate_caffe_model(input_names, input_shapes, output_names, output_shapes) output_shapes = [[int(x) for x in shape.split(',')]
for shape in output_shape_strs]
validate_caffe_model(input_names, input_shapes, output_names,
output_shapes)
def parse_args(): def parse_args():
"""Parses command line arguments.""" """Parses command line arguments."""
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.register("type", "bool", lambda v: v.lower() == "true") parser.register("type", "bool", lambda v: v.lower() == "true")
parser.add_argument( parser.add_argument(
"--platform", "--platform", type=str, default="", help="Tensorflow or Caffe.")
type=str, parser.add_argument(
default="", "--model_file",
help="Tensorflow or Caffe.") type=str,
parser.add_argument( default="",
"--model_file", help="TensorFlow or Caffe \'GraphDef\' file to load.")
type=str, parser.add_argument(
default="", "--weight_file",
help="TensorFlow or Caffe \'GraphDef\' file to load.") type=str,
parser.add_argument( default="",
"--weight_file", help="caffe model file to load.")
type=str, parser.add_argument(
default="", "--input_file", type=str, default="", help="input file.")
help="caffe model file to load.") parser.add_argument(
parser.add_argument( "--mace_out_file",
"--input_file", type=str,
type=str, default="",
default="", help="mace output file to load.")
help="input file.") parser.add_argument(
parser.add_argument( "--mace_runtime", type=str, default="gpu", help="mace runtime device.")
"--mace_out_file", parser.add_argument(
type=str, "--input_shape", type=str, default="1,64,64,3", help="input shape.")
default="", parser.add_argument(
help="mace output file to load.") "--output_shape", type=str, default="1,64,64,2", help="output shape.")
parser.add_argument( parser.add_argument(
"--mace_runtime", "--input_node", type=str, default="input_node", help="input node")
type=str, parser.add_argument(
default="gpu", "--output_node", type=str, default="output_node", help="output node")
help="mace runtime device.")
parser.add_argument( return parser.parse_known_args()
"--input_shape",
type=str,
default="1,64,64,3",
help="input shape.")
parser.add_argument(
"--output_shape",
type=str,
default="1,64,64,2",
help="output shape.")
parser.add_argument(
"--input_node",
type=str,
default="input_node",
help="input node")
parser.add_argument(
"--output_node",
type=str,
default="output_node",
help="output node")
return parser.parse_known_args()
if __name__ == '__main__': if __name__ == '__main__':
FLAGS, unparsed = parse_args() FLAGS, unparsed = parse_args()
main(unused_args=[sys.argv[0]] + unparsed) main(unused_args=[sys.argv[0]] + unparsed)
...@@ -11,199 +11,195 @@ G_T = {} ...@@ -11,199 +11,195 @@ G_T = {}
# f(2, 3) # f(2, 3)
A_T[4] = np.array([[1, 1, 1, 0], [0, 1, -1, -1]]).astype(np.float32) A_T[4] = np.array([[1, 1, 1, 0], [0, 1, -1, -1]]).astype(np.float32)
A[4] = np.transpose(A_T[4]) A[4] = np.transpose(A_T[4])
B_T[4] = np.array([ B_T[4] = np.array([[1, 0, -1, 0], [0, 1, 1, 0], [0, -1, 1, 0],
[1, 0, -1, 0], [0, 1, 0, -1]]).astype(np.float32)
[0, 1, 1, 0],
[0, -1, 1, 0],
[0, 1, 0, -1]
]).astype(np.float32)
B[4] = np.transpose(B_T[4]) B[4] = np.transpose(B_T[4])
G[4] = np.array([ G[4] = np.array([
[1, 0, 0], [1, 0, 0],
[0.5, 0.5, 0.5], [0.5, 0.5, 0.5],
[0.5, -0.5, 0.5], [0.5, -0.5, 0.5],
[0, 0, 1], [0, 0, 1],
]).astype(np.float32) ]).astype(np.float32)
G_T[4] = np.transpose(G[4]) G_T[4] = np.transpose(G[4])
# f(4, 3) # f(4, 3)
A_T[6] = np.array([ A_T[6] = np.array([
[1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 0],
[0, 1, -1, 2, -2, 0], [0, 1, -1, 2, -2, 0],
[0, 1, 1, 4, 4, 0], [0, 1, 1, 4, 4, 0],
[0, 1, -1, 8, -8, 1], [0, 1, -1, 8, -8, 1],
]).astype(np.float32) ]).astype(np.float32)
A[6] = np.transpose(A_T[6]) A[6] = np.transpose(A_T[6])
B_T[6] = np.array([ B_T[6] = np.array([
[4, 0, -5, 0, 1, 0], [4, 0, -5, 0, 1, 0],
[0, -4, -4, 1, 1, 0], [0, -4, -4, 1, 1, 0],
[0, 4, -4, -1, 1, 0], [0, 4, -4, -1, 1, 0],
[0, -2, -1, 2, 1, 0], [0, -2, -1, 2, 1, 0],
[0, 2, -1, -2, 1, 0], [0, 2, -1, -2, 1, 0],
[0, 4, 0, -5, 0, 1], [0, 4, 0, -5, 0, 1],
]).astype(np.float32) ]).astype(np.float32)
B[6] = np.transpose(B_T[6]) B[6] = np.transpose(B_T[6])
G[6] = np.array([ G[6] = np.array([
[1/4.0 , 0 , 0 ], [1 / 4.0, 0, 0],
[-1/6.0, -1/6.0 , -1/6.0], [-1 / 6.0, -1 / 6.0, -1 / 6.0],
[-1/6.0, 1/6.0 , -1/6.0], [-1 / 6.0, 1 / 6.0, -1 / 6.0],
[1/24.0, 1/12.0 , 1/6.0 ], [1 / 24.0, 1 / 12.0, 1 / 6.0],
[1/24.0, -1/12.0, 1/6.0 ], [1 / 24.0, -1 / 12.0, 1 / 6.0],
[ 0 , 0 , 1 ], [0, 0, 1],
]).astype(np.float32) ]).astype(np.float32)
G_T[6] = np.transpose(G[6]) G_T[6] = np.transpose(G[6])
# f(6, 3) # f(6, 3)
A_T[8] = np.array([ A_T[8] = np.array([
[1, 1, 1 , 1 , 1 , 1 , 1 , 0], [1, 1, 1, 1, 1, 1, 1, 0],
[0, 1, -1, 2 , -2 , 1/2. , -1/2. , 0], [0, 1, -1, 2, -2, 1 / 2., -1 / 2., 0],
[0, 1, 1 , 4 , 4 , 1/4. , 1/4. , 0], [0, 1, 1, 4, 4, 1 / 4., 1 / 4., 0],
[0, 1, -1, 8 , -8 , 1/8. , -1/8. , 0], [0, 1, -1, 8, -8, 1 / 8., -1 / 8., 0],
[0, 1, 1 , 16, 16 , 1/16., 1/16. , 0], [0, 1, 1, 16, 16, 1 / 16., 1 / 16., 0],
[0, 1, -1, 32, -32, 1/32., -1/32., 1], [0, 1, -1, 32, -32, 1 / 32., -1 / 32., 1],
]).astype(np.float32) ]).astype(np.float32)
A[8] = np.transpose(A_T[8]) A[8] = np.transpose(A_T[8])
B_T[8] = np.array([ B_T[8] = np.array([
[1, 0 , -21/4., 0 , 21/4., 0 , -1, 0], [1, 0, -21 / 4., 0, 21 / 4., 0, -1, 0],
[0, 1 , 1 , -17/4., -17/4., 1 , 1 , 0], [0, 1, 1, -17 / 4., -17 / 4., 1, 1, 0],
[0, -1 , 1 , 17/4. , -17/4., -1 , 1 , 0], [0, -1, 1, 17 / 4., -17 / 4., -1, 1, 0],
[0, 1/2. , 1/4. , -5/2. , -5/4., 2 , 1 , 0], [0, 1 / 2., 1 / 4., -5 / 2., -5 / 4., 2, 1, 0],
[0, -1/2., 1/4. , 5/2. , -5/4., -2 , 1 , 0], [0, -1 / 2., 1 / 4., 5 / 2., -5 / 4., -2, 1, 0],
[0, 2 , 4 , -5/2. , -5 , 1/2. , 1 , 0], [0, 2, 4, -5 / 2., -5, 1 / 2., 1, 0],
[0, -2 , 4 , 5/2. , -5 , -1/2. , 1 , 0], [0, -2, 4, 5 / 2., -5, -1 / 2., 1, 0],
[0, -1 , 0 , 21/4. , 0 , -21/4., 0 , 1], [0, -1, 0, 21 / 4., 0, -21 / 4., 0, 1],
]).astype(np.float32) ]).astype(np.float32)
B[8] = np.transpose(B_T[8]) B[8] = np.transpose(B_T[8])
G[8] = np.array([ G[8] = np.array([
[ 1 , 0 , 0 ], [1, 0, 0],
[-2/9. , -2/9. , -2/9.], [-2 / 9., -2 / 9., -2 / 9.],
[-2/9. , 2/9. , -2/9.], [-2 / 9., 2 / 9., -2 / 9.],
[1/90. , 1/45. , 2/45.], [1 / 90., 1 / 45., 2 / 45.],
[1/90. , -1/45. , 2/45.], [1 / 90., -1 / 45., 2 / 45.],
[32/45., 16/45. , 8/45.], [32 / 45., 16 / 45., 8 / 45.],
[32/45., -16/45., 8/45.], [32 / 45., -16 / 45., 8 / 45.],
[ 0 , 0 , 1 ], [0, 0, 1],
]).astype(np.float32) ]).astype(np.float32)
G_T[8] = np.transpose(G[8]) G_T[8] = np.transpose(G[8])
def output_shape(input_shape, filter_shape): def output_shape(input_shape, filter_shape):
out_shape = np.zeros(4).astype(np.int32) out_shape = np.zeros(4).astype(np.int32)
out_shape[0] = input_shape[0] out_shape[0] = input_shape[0]
out_shape[1] = filter_shape[0] out_shape[1] = filter_shape[0]
out_shape[2] = input_shape[2] - 2 out_shape[2] = input_shape[2] - 2
out_shape[3] = input_shape[3] - 2 out_shape[3] = input_shape[3] - 2
return out_shape return out_shape
def winograd_conv(m, r, input, filter): def winograd_conv(m, r, input, filter):
alpha = m + r - 1 alpha = m + r - 1
print 'Winograd(m = %d, r = %d, tile size=%d' % (m, r, alpha) print 'Winograd(m = %d, r = %d, tile size=%d' % (m, r, alpha)
alpha_square = alpha * alpha alpha_square = alpha * alpha
input_shape = input.shape input_shape = input.shape
filter_shape = filter.shape filter_shape = filter.shape
out_shape = output_shape(input_shape, filter_shape) out_shape = output_shape(input_shape, filter_shape)
K = filter_shape[0] K = filter_shape[0]
C = input_shape[1] C = input_shape[1]
U = np.zeros((K * alpha_square, C)) U = np.zeros((K * alpha_square, C))
for k in range(K): for k in range(K):
for c in range(C): for c in range(C):
u = np.dot(np.dot(G[alpha], filter[k, c, :, :]), G_T[alpha]) u = np.dot(np.dot(G[alpha], filter[k, c, :, :]), G_T[alpha])
for i in range(alpha): for i in range(alpha):
for j in range(alpha) : for j in range(alpha):
U[(i * alpha + j) * K + k, c] = u[i, j] U[(i * alpha + j) * K + k, c] = u[i, j]
print 'filter out: ', U.shape print 'filter out: ', U.shape
rounded_h = int(math.ceil(out_shape[2] / (m * 1.0))) rounded_h = int(math.ceil(out_shape[2] / (m * 1.0)))
rounded_w = int(math.ceil(out_shape[3] / (m * 1.0))) rounded_w = int(math.ceil(out_shape[3] / (m * 1.0)))
P = input_shape[0] * rounded_h * rounded_w P = input_shape[0] * rounded_h * rounded_w
V = np.zeros((C * alpha_square, P)) V = np.zeros((C * alpha_square, P))
for p in range(P): for p in range(P):
for c in range(C): for c in range(C):
n = p / (rounded_w * rounded_h) n = p / (rounded_w * rounded_h)
t = p % (rounded_h * rounded_w) t = p % (rounded_h * rounded_w)
h_idx = t / rounded_w h_idx = t / rounded_w
w_idx = t % rounded_w w_idx = t % rounded_w
h_start = h_idx * m h_start = h_idx * m
w_start = w_idx * m w_start = w_idx * m
h_end = min(h_start+alpha, input_shape[2]) h_end = min(h_start + alpha, input_shape[2])
w_end = min(w_start+alpha, input_shape[3]) w_end = min(w_start + alpha, input_shape[3])
d = np.zeros((alpha, alpha)) d = np.zeros((alpha, alpha))
d[0:h_end-h_start, 0:w_end-w_start] = \ d[0:h_end-h_start, 0:w_end-w_start] = \
input[n, c, h_start:h_end, w_start:w_end] input[n, c, h_start:h_end, w_start:w_end]
v = np.dot(np.dot(B_T[alpha], d), B[alpha]) v = np.dot(np.dot(B_T[alpha], d), B[alpha])
for i in range(alpha): for i in range(alpha):
for j in range(alpha): for j in range(alpha):
V[(i*alpha+j)*C + c, p] = v[i, j] V[(i * alpha + j) * C + c, p] = v[i, j]
tmp = V.reshape(alpha_square, C, P, 1) tmp = V.reshape(alpha_square, C, P, 1)
print 'input out: ', tmp.shape print 'input out: ', tmp.shape
tmp.astype(np.float32).tofile("C") tmp.astype(np.float32).tofile("C")
M = np.zeros((alpha_square * K, P)) M = np.zeros((alpha_square * K, P))
for i in range(alpha_square): for i in range(alpha_square):
u = U[i * K : (i+1) * K, :] u = U[i * K:(i + 1) * K, :]
v = V[i * C : (i+1) * C, :] v = V[i * C:(i + 1) * C, :]
M[i * K : (i+1) * K, :] = np.dot(u, v) M[i * K:(i + 1) * K, :] = np.dot(u, v)
print 'M shape: ', M.shape print 'M shape: ', M.shape
M.astype(np.float32).tofile("gemm") M.astype(np.float32).tofile("gemm")
res = np.zeros((out_shape[0], out_shape[2], out_shape[3], out_shape[1])) res = np.zeros((out_shape[0], out_shape[2], out_shape[3], out_shape[1]))
for k in range(K): for k in range(K):
for b in range(P): for b in range(P):
tm = np.zeros((alpha, alpha)) tm = np.zeros((alpha, alpha))
for i in range(alpha): for i in range(alpha):
for j in range(alpha): for j in range(alpha):
tm[i][j] = M[(i*alpha+j) * K + k, b] tm[i][j] = M[(i * alpha + j) * K + k, b]
y = np.dot(np.dot(A_T[alpha], tm), A[alpha]) y = np.dot(np.dot(A_T[alpha], tm), A[alpha])
for i in range(m): for i in range(m):
for j in range(m): for j in range(m):
n = b / (rounded_h * rounded_w) n = b / (rounded_h * rounded_w)
t = b % (rounded_h * rounded_w) t = b % (rounded_h * rounded_w)
p = (t / rounded_w) * m + i p = (t / rounded_w) * m + i
q = (t % rounded_w) * m + j q = (t % rounded_w) * m + j
if p >= out_shape[2] or q >= out_shape[3]: if p >= out_shape[2] or q >= out_shape[3]:
continue continue
res[n, p, q, k] = y[i, j] res[n, p, q, k] = y[i, j]
print 'Res shape: ', res.shape print 'Res shape: ', res.shape
res.astype(np.float32).tofile("res") res.astype(np.float32).tofile("res")
return res return res
def tf_conv(input, filter): def tf_conv(input, filter):
conv_op = tf.nn.conv2d(input, filter, [1, 1, 1, 1], 'VALID') conv_op = tf.nn.conv2d(input, filter, [1, 1, 1, 1], 'VALID')
with tf.Session() as sess: with tf.Session() as sess:
res = sess.run(conv_op) res = sess.run(conv_op)
return res return res
def main(): def main():
input = np.random.random([5, 23, 29, 15]).astype(np.float32) input = np.random.random([5, 23, 29, 15]).astype(np.float32)
# input = np.fromfile(file="A", dtype=np.float32) # input = np.fromfile(file="A", dtype=np.float32)
# input = input.reshape(1, 3, 3, 5) # input = input.reshape(1, 3, 3, 5)
print 'input shape: ', input.shape print 'input shape: ', input.shape
# input.tofile("A") # input.tofile("A")
filter = np.random.random([3, 3, 15, 13]).astype(np.float32) filter = np.random.random([3, 3, 15, 13]).astype(np.float32)
tf_out = tf_conv(input, filter) tf_out = tf_conv(input, filter)
input = input.transpose((0, 3, 1, 2)) input = input.transpose((0, 3, 1, 2))
filter = filter.transpose((3, 2, 0, 1)) filter = filter.transpose((3, 2, 0, 1))
print 'filter shape: ', filter.shape print 'filter shape: ', filter.shape
# filter.tofile("filter_in") # filter.tofile("filter_in")
for i in [2, 4, 6]: for i in [2, 4, 6]:
print "==========f(%d,3)==========" % i print "==========f(%d,3)==========" % i
winograd_out = winograd_conv(i, 3, input, filter) winograd_out = winograd_conv(i, 3, input, filter)
res = np.allclose(tf_out, winograd_out) res = np.allclose(tf_out, winograd_out)
if res: if res:
print "=========Pass=========" print "=========Pass========="
else: else:
print "=========Failed=======" print "=========Failed======="
print "TF: ", tf_out print "TF: ", tf_out
print "Winograd: ", winograd_out print "Winograd: ", winograd_out
if __name__ == '__main__': if __name__ == '__main__':
main() main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册