Commit 6da30d22 authored by Liangliang He

Enable python style check

Parent e54825c5
stages:
- cpplint
- pycodestyle
- ops_test
- ops_benchmark
......@@ -7,7 +8,12 @@ cpplint:
stage: cpplint
script:
- curl -o cpplint.py https://raw.githubusercontent.com/google/styleguide/gh-pages/cpplint/cpplint.py
- python cpplint.py --linelength=80 --counting=detailed $(find mace -name *.h -or -name *.cc)
- python cpplint.py --linelength=80 --counting=detailed $(find mace -name "*.h" -or -name "*.cc")
pycodestyle:
stage: pycodestyle
script:
- pycodestyle $(find -name "*.py")
ops_test:
stage: ops_test
......
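Note: the new pycodestyle stage lints every tracked Python file. Contributors can run the same check locally before pushing; below is a minimal sketch using pycodestyle's Python API (assumes pycodestyle is pip-installed; the target path is illustrative, not fixed by this commit):

import pycodestyle

# Mirror the CI check locally; 79 is pycodestyle's default line limit.
style = pycodestyle.StyleGuide(max_line_length=79)
report = style.check_files(['mace/python/tools'])  # illustrative path
if report.total_errors:
    raise SystemExit('pycodestyle found %d issue(s)' % report.total_errors)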
......@@ -113,7 +113,8 @@ RUN pip install -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com
scipy \
jinja2 \
pyyaml \
sh
sh \
pycodestyle
# Download tensorflow tools
RUN wget http://cnbj1-inner-fds.api.xiaomi.net/mace/tool/transform_graph && \
......
......@@ -16,74 +16,75 @@ FLAGS = None
def generate_cpp_source():
data_map = {}
for binary_dir in FLAGS.binary_dirs.split(","):
binary_path = os.path.join(binary_dir, FLAGS.binary_file_name)
if not os.path.exists(binary_path):
continue
data_map = {}
for binary_dir in FLAGS.binary_dirs.split(","):
binary_path = os.path.join(binary_dir, FLAGS.binary_file_name)
if not os.path.exists(binary_path):
continue
with open(binary_path, "rb") as f:
binary_array = np.fromfile(f, dtype=np.uint8)
with open(binary_path, "rb") as f:
binary_array = np.fromfile(f, dtype=np.uint8)
print "Generate binary from", binary_path
idx = 0
size, = struct.unpack("Q", binary_array[idx:idx+8])
idx += 8
for _ in xrange(size):
key_size, = struct.unpack("i", binary_array[idx:idx+4])
idx += 4
key, = struct.unpack(str(key_size) + "s", binary_array[idx:idx+key_size])
idx += key_size
params_size, = struct.unpack("i", binary_array[idx:idx+4])
idx += 4
data_map[key] = []
count = params_size / 4
params = struct.unpack(str(count) + "i", binary_array[idx:idx+params_size])
for i in params:
data_map[key].append(i)
idx += params_size
print "Generate binary from", binary_path
idx = 0
size, = struct.unpack("Q", binary_array[idx:idx + 8])
idx += 8
for _ in xrange(size):
key_size, = struct.unpack("i", binary_array[idx:idx + 4])
idx += 4
key, = struct.unpack(
str(key_size) + "s", binary_array[idx:idx + key_size])
idx += key_size
params_size, = struct.unpack("i", binary_array[idx:idx + 4])
idx += 4
data_map[key] = []
count = params_size / 4
params = struct.unpack(
str(count) + "i", binary_array[idx:idx + params_size])
for i in params:
data_map[key].append(i)
idx += params_size
env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0]))
return env.get_template('str2vec_maps.cc.jinja2').render(
maps=data_map,
data_type='unsigned int',
variable_name=FLAGS.variable_name)
env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0]))
return env.get_template('str2vec_maps.cc.jinja2').render(
maps = data_map,
data_type = 'unsigned int',
variable_name = FLAGS.variable_name
)
def main(unused_args):
cpp_binary_source = generate_cpp_source()
if os.path.isfile(FLAGS.output_path):
os.remove(FLAGS.output_path)
w_file = open(FLAGS.output_path, "w")
w_file.write(cpp_binary_source)
w_file.close()
cpp_binary_source = generate_cpp_source()
if os.path.isfile(FLAGS.output_path):
os.remove(FLAGS.output_path)
w_file = open(FLAGS.output_path, "w")
w_file.write(cpp_binary_source)
w_file.close()
def parse_args():
"""Parses command line arguments."""
parser = argparse.ArgumentParser()
parser.add_argument(
"--binary_dirs",
type=str,
default="",
help="The binaries file path.")
parser.add_argument(
"--binary_file_name",
type=str,
default="mace_run.config",
help="The binary file name.")
parser.add_argument(
"--output_path",
type=str,
default="",
help="The path of generated C++ source file which contains the binary.")
parser.add_argument(
"--variable_name",
type=str,
default="kTuningParamsData",
help="global variable name.")
return parser.parse_known_args()
"""Parses command line arguments."""
parser = argparse.ArgumentParser()
parser.add_argument(
"--binary_dirs", type=str, default="", help="The binaries file path.")
parser.add_argument(
"--binary_file_name",
type=str,
default="mace_run.config",
help="The binary file name.")
parser.add_argument(
"--output_path",
type=str,
default="",
help="The path of generated C++ source file which contains the binary."
)
parser.add_argument(
"--variable_name",
type=str,
default="kTuningParamsData",
help="global variable name.")
return parser.parse_known_args()
if __name__ == '__main__':
FLAGS, unparsed = parse_args()
main(unused_args=[sys.argv[0]] + unparsed)
FLAGS, unparsed = parse_args()
main(unused_args=[sys.argv[0]] + unparsed)
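For reference, generate_cpp_source() above decodes a length-prefixed layout: a uint64 entry count, then per entry an int32 key length, the key bytes, an int32 params byte size, and that many bytes of int32 params. A hypothetical writer sketch that produces a file this parser accepts (the function name and sample data are illustrative, not part of the repo):

import struct

def write_config(path, data_map):
    # Inverse of the parsing loop in generate_cpp_source():
    # uint64 count, then per entry int32 key length, key bytes,
    # int32 params byte size, params as int32 values.
    with open(path, "wb") as f:
        f.write(struct.pack("Q", len(data_map)))
        for key, params in data_map.items():
            f.write(struct.pack("i", len(key)))
            f.write(struct.pack(str(len(key)) + "s", key))
            f.write(struct.pack("i", 4 * len(params)))  # byte size
            f.write(struct.pack(str(len(params)) + "i", *params))

write_config("mace_run.config", {"conv_3x3": [4, 8, 1]})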
......@@ -5,32 +5,26 @@ import google.protobuf.text_format
import numpy as np
import math
pooling_type_mode = {
'AvgPool': 1,
'MaxPool': 2
}
pooling_type_mode = {'AvgPool': 1, 'MaxPool': 2}
buffer_type_map = {
'CONV2D_FILTER' : 0,
'IN_OUT_CHANNEL' : 1,
'ARGUMENT' : 2,
'IN_OUT_HEIGHT' : 3,
'IN_OUT_WIDTH' : 4,
'WINOGRAD_FILTER' : 5,
'DW_CONV2D_FILTER' : 6,
'WEIGHT_HEIGHT' : 7,
'WEIGHT_WIDTH' : 8,
'CONV2D_FILTER': 0,
'IN_OUT_CHANNEL': 1,
'ARGUMENT': 2,
'IN_OUT_HEIGHT': 3,
'IN_OUT_WIDTH': 4,
'WINOGRAD_FILTER': 5,
'DW_CONV2D_FILTER': 6,
'WEIGHT_HEIGHT': 7,
'WEIGHT_WIDTH': 8,
}
data_type_map = {
'DT_HALF' : mace_pb2.DT_HALF,
'DT_FLOAT': mace_pb2.DT_FLOAT
}
data_type_map = {'DT_HALF': mace_pb2.DT_HALF, 'DT_FLOAT': mace_pb2.DT_FLOAT}
activation_name_map = {
'ReLU' : 'RELU',
'Sigmoid' : 'SIGMOID',
'TanH' : 'TANH',
'ReLU': 'RELU',
'Sigmoid': 'SIGMOID',
'TanH': 'TANH',
}
MACE_INPUT_NODE_NAME = "mace_input_node"
......@@ -38,1022 +32,1102 @@ MACE_OUTPUT_NODE_NAME = "mace_output_node"
OPENCL_IMAGE_MAX_SIZE = 16384
class Operator(object):
def __init__(self, name, type, layer):
self.name = name
self.type = type
self.layer = layer
self.parents = []
self.children = []
self.data = []
self.output_shape_map = {}
def add_parent(self, parent_op):
self.parents.append(parent_op)
parent_op.children.append(self)
def get_single_parent(self):
if len(self.parents) != 1:
raise Exception('Operation %s expected single parent, but got %s'
% (self.name, len(self.parents)))
return self.parents[0]
def __init__(self, name, type, layer):
self.name = name
self.type = type
self.layer = layer
self.parents = []
self.children = []
self.data = []
self.output_shape_map = {}
def add_parent(self, parent_op):
self.parents.append(parent_op)
parent_op.children.append(self)
def get_single_parent(self):
if len(self.parents) != 1:
raise Exception('Operation %s expected single parent, but got %s' %
(self.name, len(self.parents)))
return self.parents[0]
def BlobToNPArray(blob):
if blob.num != 0:
return (np.asarray(blob.data, dtype=np.float32).
reshape((blob.num, blob.channels, blob.height, blob.width)))
else:
return np.asarray(blob.data, dtype=np.float32).reshape(blob.shape.dim)
if blob.num != 0:
return (np.asarray(blob.data, dtype=np.float32).reshape(
(blob.num, blob.channels, blob.height, blob.width)))
else:
return np.asarray(blob.data, dtype=np.float32).reshape(blob.shape.dim)
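BlobToNPArray() handles both the legacy 4D blob encoding (num/channels/height/width) and the newer shape.dim field. A small sketch of the legacy path, assuming the standard caffe_pb2.BlobProto field names:

# Legacy-style blob: data is reshaped to (num, channels, height, width).
blob = caffe_pb2.BlobProto()
blob.num, blob.channels, blob.height, blob.width = 1, 2, 3, 3
blob.data.extend([0.0] * (1 * 2 * 3 * 3))
print BlobToNPArray(blob).shape  # (1, 2, 3, 3)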
class Shapes(object):
@staticmethod
def conv_pool_shape(input_shape, filter_shape, paddings, strides, dilations, round_func, input_format='NHWC'):
output_shape = np.zeros_like(input_shape)
output_shape[0] = input_shape[0]
if input_format == 'NHWC':
# input format: NHWC, filter format: HWOI
output_shape[1] = int(round_func((input_shape[1] + paddings[0] - filter_shape[0]
- (filter_shape[0] - 1) * (dilations[0] - 1)) / float(strides[0]))) + 1
output_shape[2] = int(round_func((input_shape[2] + paddings[1] - filter_shape[1]
- (filter_shape[1] - 1) * (dilations[1] - 1)) / float(strides[1]))) + 1
output_shape[3] = filter_shape[2]
elif input_format == 'NCHW':
# input format: NCHW, filter format: OIHW
output_shape[1] = filter_shape[0]
output_shape[2] = int(round_func((input_shape[2] + paddings[0] - filter_shape[2]
- (filter_shape[2] - 1) * (dilations[0] - 1)) / float(strides[0]))) + 1
output_shape[3] = int(round_func((input_shape[3] + paddings[1] - filter_shape[3]
- (filter_shape[3] - 1) * (dilations[1] - 1)) / float(strides[1]))) + 1
else:
raise Exception("format %s is not supported" % input_format)
return output_shape
@staticmethod
def fully_connected_shape(input_shape, weight_shape):
return [input_shape[0], 1, 1, weight_shape[0]]
@staticmethod
def concat_shape(input_shapes, axis):
output_shape = None
for input_shape in input_shapes:
if output_shape is None:
output_shape = list(input_shape)
else:
output_shape[axis] += input_shape[axis]
return output_shape
@staticmethod
def slice_shape(input_shape, num_output, input_format='NHWC'):
if input_format == 'NHWC':
return [input_shape[0], input_shape[1], input_shape[2], input_shape[3]/num_output]
elif input_format == 'NCHW':
return [input_shape[0], input_shape[1]/num_output, input_shape[2], input_shape[3]]
else:
raise Exception("format %s is not supported" % input_format)
@staticmethod
def conv_pool_shape(input_shape,
filter_shape,
paddings,
strides,
dilations,
round_func,
input_format='NHWC'):
output_shape = np.zeros_like(input_shape)
output_shape[0] = input_shape[0]
if input_format == 'NHWC':
# input format: NHWC, filter format: HWOI
output_shape[1] = int(
round_func((input_shape[1] + paddings[0] - filter_shape[0] -
(filter_shape[0] - 1) *
(dilations[0] - 1)) / float(strides[0]))) + 1
output_shape[2] = int(
round_func((input_shape[2] + paddings[1] - filter_shape[1] -
(filter_shape[1] - 1) *
(dilations[1] - 1)) / float(strides[1]))) + 1
output_shape[3] = filter_shape[2]
elif input_format == 'NCHW':
# input format: NCHW, filter format: OIHW
output_shape[1] = filter_shape[0]
output_shape[2] = int(
round_func((input_shape[2] + paddings[0] - filter_shape[2] -
(filter_shape[2] - 1) *
(dilations[0] - 1)) / float(strides[0]))) + 1
output_shape[3] = int(
round_func((input_shape[3] + paddings[1] - filter_shape[3] -
(filter_shape[3] - 1) *
(dilations[1] - 1)) / float(strides[1]))) + 1
else:
raise Exception("format %s is not supported" % input_format)
return output_shape
@staticmethod
def fully_connected_shape(input_shape, weight_shape):
return [input_shape[0], 1, 1, weight_shape[0]]
@staticmethod
def concat_shape(input_shapes, axis):
output_shape = None
for input_shape in input_shapes:
if output_shape is None:
output_shape = list(input_shape)
else:
output_shape[axis] += input_shape[axis]
return output_shape
@staticmethod
def slice_shape(input_shape, num_output, input_format='NHWC'):
if input_format == 'NHWC':
return [
input_shape[0], input_shape[1], input_shape[2],
input_shape[3] / num_output
]
elif input_format == 'NCHW':
return [
input_shape[0], input_shape[1] / num_output, input_shape[2],
input_shape[3]
]
else:
raise Exception("format %s is not supported" % input_format)
# outputs' name is [op.name + '_' + #]
class CaffeConverter(object):
def __init__(self, caffe_net, weights, net_def, dt, device, winograd):
self.net_def = net_def
self.caffe_net = caffe_net
self.weights = weights
self.dt = dt
self.device = device
self.winograd = winograd
self.resolved_ops = set()
self.ops = []
self.inputs_map = {} # caffe op name -> mace inputs' name
# Add Input operations
top_name_map = {}
inputs = caffe_net.input
for input in inputs:
self.ops.extend([Operator(input, 'Input', None)])
top_name_map[input] = input
layers = caffe_net.layer
# remove train layers and dropout
layers = self.remove_unused_layers(layers)
# Construct graph
# Only single-output layers are supported;
# layers with a single output often use the same top name.
self.ops.extend([Operator(layer.name, layer.type, layer) for layer in layers])
self.ops_map = {op.name : op for op in self.ops}
output_op_map = {}
for layer in layers:
op = self.ops_map[layer.name]
for input_name in layer.bottom:
assert input_name != layer.name
parent_op = output_op_map.get(input_name)
if parent_op is None:
parent_op = self.ops_map[input_name]
op.add_parent(parent_op)
if op.name not in self.inputs_map:
self.inputs_map[op.name] = []
self.inputs_map[op.name].extend([top_name_map[input_name]])
for i in range(len(layer.top)):
output_name = layer.top[i]
if len(layer.top) == 1:
top_name_map[output_name] = op.name
def __init__(self, caffe_net, weights, net_def, dt, device, winograd):
self.net_def = net_def
self.caffe_net = caffe_net
self.weights = weights
self.dt = dt
self.device = device
self.winograd = winograd
self.resolved_ops = set()
self.ops = []
self.inputs_map = {} # caffe op name -> mace inputs' name
# Add Input operations
top_name_map = {}
inputs = caffe_net.input
for input in inputs:
self.ops.extend([Operator(input, 'Input', None)])
top_name_map[input] = input
layers = caffe_net.layer
# remove train layers and dropout
layers = self.remove_unused_layers(layers)
# Construct graph
# Only single-output layers are supported;
# layers with a single output often use the same top name.
self.ops.extend(
[Operator(layer.name, layer.type, layer) for layer in layers])
self.ops_map = {op.name: op for op in self.ops}
output_op_map = {}
for layer in layers:
op = self.ops_map[layer.name]
for input_name in layer.bottom:
assert input_name != layer.name
parent_op = output_op_map.get(input_name)
if parent_op is None:
parent_op = self.ops_map[input_name]
op.add_parent(parent_op)
if op.name not in self.inputs_map:
self.inputs_map[op.name] = []
self.inputs_map[op.name].extend([top_name_map[input_name]])
for i in range(len(layer.top)):
output_name = layer.top[i]
if len(layer.top) == 1:
top_name_map[output_name] = op.name
else:
top_name_map[output_name] = op.name + '_' + str(i)
if output_name == layer.name:
continue
output_op_map[output_name] = op
# Load weights
weights_layers = weights.layer
for layer in weights_layers:
if not layer.blobs:
continue
if layer.name in self.ops_map:
op = self.ops_map[layer.name]
op.data = [BlobToNPArray(blob) for blob in layer.blobs]
# toposort ops
self.ops = self.toposort_ops()
def CommonConvert(self, op, mace_type):
op_def = mace_pb2.OperatorDef()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
if self.device == 'neon':
data_format_arg.s = 'NCHW'
else:
top_name_map[output_name] = op.name + '_' + str(i)
if output_name == layer.name:
continue
output_op_map[output_name] = op
# Load weights
weights_layers = weights.layer
for layer in weights_layers:
if not layer.blobs:
continue
if layer.name in self.ops_map:
op = self.ops_map[layer.name]
op.data = [BlobToNPArray(blob) for blob in layer.blobs]
# toposort ops
self.ops = self.toposort_ops()
def CommonConvert(self, op, mace_type):
op_def = mace_pb2.OperatorDef()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
if self.device == 'neon':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
op_def.name = op.name
op_def.type = mace_type
op_def.input.extend([name+':0' for name in self.inputs_map[op.name]])
return op_def
def remove_unused_layers(self, layers):
phase_map = {0: 'train', 1: 'test'}
test_layers_names = set()
test_layers = []
for layer in layers:
phase = 'test'
if len(layer.include):
phase = phase_map[layer.include[0].phase]
if len(layer.exclude):
phase = phase_map[layer.exclude[0].phase]
if phase == 'test' and layer.type != 'Dropout':
test_layers.append(layer)
assert layer.name not in test_layers_names
test_layers_names.add(layer.name)
return test_layers
def toposort_ops(self):
sorted_ops = []
temp_visited = set()
visited = set()
def search(op):
if op.name in temp_visited:
raise Exception("The model is not DAG")
if op.name in visited:
return
temp_visited.add(op.name)
for parent_op in op.parents:
search(parent_op)
temp_visited.remove(op.name)
sorted_ops.append(op)
visited.add(op.name)
for op in self.ops:
search(op)
return sorted_ops
def add_buffer_to_image(self, input_name, input_type):
output_name = input_name[:-2] + "_b2i" + input_name[-2:]
op_def = self.net_def.op.add()
op_def.name = output_name[:-2]
op_def.type = 'BufferToImage'
op_def.input.extend([input_name])
op_def.output.extend([output_name])
arg = op_def.arg.add()
arg.name = 'buffer_type'
arg.i = buffer_type_map[input_type]
arg = op_def.arg.add()
arg.name = 'mode'
arg.i = 0
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
return output_name
def add_image_to_buffer(self, input_name, input_type):
output_name = input_name[:-2] + "_i2b" + input_name[-2:]
op_def = self.net_def.op.add()
op_def.name = output_name[:-2]
op_def.type = 'ImageToBuffer'
op_def.input.extend([input_name])
op_def.output.extend([output_name])
arg = op_def.arg.add()
arg.name = 'buffer_type'
arg.i = buffer_type_map[input_type]
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
return output_name
def add_input_transform(self, names):
for name in names:
new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
op_def.name = name
op_def.type = 'BufferToImage'
op_def.input.extend([new_input_name])
op_def.output.extend([name+':0'])
epsilon_arg = op_def.arg.add()
epsilon_arg.name = 'buffer_type'
epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL']
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
def add_output_transform(self, names):
for name in names:
output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
op_def.name = output_name[:-2]
op_def.type = 'ImageToBuffer'
op_def.input.extend([name+':0'])
op_def.output.extend([output_name])
epsilon_arg = op_def.arg.add()
epsilon_arg.name = 'buffer_type'
epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL']
def add_tensor(self, name, value):
tensor = self.net_def.tensors.add()
tensor.name = name
shape = list(value.shape)
tensor.dims.extend(shape)
tensor.data_type = mace_pb2.DT_FLOAT
tensor.float_data.extend(value.flat)
@staticmethod
def add_output_shape(op_def, output_shape):
mace_output_shape = mace_pb2.OutputShape()
mace_output_shape.dims.extend(output_shape)
op_def.output_shape.extend([mace_output_shape])
def add_stride_pad_kernel_arg(self, param, op_def):
try:
if len(param.stride) > 1 or len(param.kernel_size) > 1 or len(param.pad) > 1:
raise Exception('Mace does not support multiple stride/kernel_size/pad')
stride = [param.stride[0], param.stride[0]] if len(param.stride) else [1, 1]
pad = [param.pad[0] * 2, param.pad[0] * 2] if len(param.pad) else [0, 0]
kernel = [param.kernel_size[0], param.kernel_size[0]] if len(param.kernel_size) else [0, 0]
except TypeError:
stride = [param.stride, param.stride]
pad = [param.pad * 2, param.pad * 2]
kernel = [param.kernel_size, param.kernel_size]
if param.HasField("stride_h") or param.HasField("stride_w"):
stride = [param.stride_h, param.stride_w]
# Pad
if param.HasField("pad_h") or param.HasField("pad_w"):
pad = [param.pad_h * 2, param.pad_w * 2]
if op_def is not None:
strides_arg = op_def.arg.add()
strides_arg.name = 'strides'
strides_arg.ints.extend(stride)
padding_arg = op_def.arg.add()
padding_arg.name = 'padding_values'
padding_arg.ints.extend(pad)
if op_def.type == 'Pooling':
if param.HasField("kernel_h") or param.HasField("kernel_w"):
kernel = [param.kernel_h, param.kernel_w]
return pad, stride, kernel
def convert_conv2d(self, op):
param = op.layer.convolution_param
is_depthwise = False
if param.HasField('group'):
if param.group == op.data[0].shape[0] and op.data[0].shape[1] == 1:
is_depthwise = True
else:
raise Exception("Mace do not support group convolution yet")
if is_depthwise:
op_def = self.CommonConvert(op, 'DepthwiseConv2d')
else:
op_def = self.CommonConvert(op, 'Conv2D')
data_format_arg.s = 'NHWC'
op_def.name = op.name
op_def.type = mace_type
op_def.input.extend([name + ':0' for name in self.inputs_map[op.name]])
return op_def
def remove_unused_layers(self, layers):
phase_map = {0: 'train', 1: 'test'}
test_layers_names = set()
test_layers = []
for layer in layers:
phase = 'test'
if len(layer.include):
phase = phase_map[layer.include[0].phase]
if len(layer.exclude):
phase = phase_map[layer.exclude[0].phase]
if phase == 'test' and layer.type != 'Dropout':
test_layers.append(layer)
assert layer.name not in test_layers_names
test_layers_names.add(layer.name)
return test_layers
def toposort_ops(self):
sorted_ops = []
temp_visited = set()
visited = set()
def search(op):
if op.name in temp_visited:
raise Exception("The model is not DAG")
if op.name in visited:
return
temp_visited.add(op.name)
for parent_op in op.parents:
search(parent_op)
temp_visited.remove(op.name)
sorted_ops.append(op)
visited.add(op.name)
for op in self.ops:
search(op)
return sorted_ops
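toposort_ops() is a depth-first post-order topological sort: temp_visited holds the current DFS stack, so revisiting a stacked node means a back edge, i.e. a cycle. A standalone sketch of the same pattern (illustrative, not repo code):

# graph maps a node name to the list of its parent names.
def toposort(graph):
    sorted_names, temp_visited, visited = [], set(), set()

    def search(name):
        if name in temp_visited:
            raise Exception("The model is not a DAG")
        if name in visited:
            return
        temp_visited.add(name)
        for parent in graph[name]:
            search(parent)
        temp_visited.remove(name)
        sorted_names.append(name)  # appended post-order: parents first
        visited.add(name)

    for name in graph:
        search(name)
    return sorted_names

print toposort({'a': [], 'b': ['a'], 'c': ['a', 'b']})  # ['a', 'b', 'c']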
def add_buffer_to_image(self, input_name, input_type):
output_name = input_name[:-2] + "_b2i" + input_name[-2:]
op_def = self.net_def.op.add()
op_def.name = output_name[:-2]
op_def.type = 'BufferToImage'
op_def.input.extend([input_name])
op_def.output.extend([output_name])
arg = op_def.arg.add()
arg.name = 'buffer_type'
arg.i = buffer_type_map[input_type]
arg = op_def.arg.add()
arg.name = 'mode'
arg.i = 0
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
return output_name
def add_image_to_buffer(self, input_name, input_type):
output_name = input_name[:-2] + "_i2b" + input_name[-2:]
op_def = self.net_def.op.add()
op_def.name = output_name[:-2]
op_def.type = 'ImageToBuffer'
op_def.input.extend([input_name])
op_def.output.extend([output_name])
arg = op_def.arg.add()
arg.name = 'buffer_type'
arg.i = buffer_type_map[input_type]
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
return output_name
def add_input_transform(self, names):
for name in names:
new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
op_def.name = name
op_def.type = 'BufferToImage'
op_def.input.extend([new_input_name])
op_def.output.extend([name + ':0'])
epsilon_arg = op_def.arg.add()
epsilon_arg.name = 'buffer_type'
epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL']
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
def add_output_transform(self, names):
for name in names:
output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
op_def.name = output_name[:-2]
op_def.type = 'ImageToBuffer'
op_def.input.extend([name + ':0'])
op_def.output.extend([output_name])
epsilon_arg = op_def.arg.add()
epsilon_arg.name = 'buffer_type'
epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL']
def add_tensor(self, name, value):
tensor = self.net_def.tensors.add()
tensor.name = name
shape = list(value.shape)
tensor.dims.extend(shape)
tensor.data_type = mace_pb2.DT_FLOAT
tensor.float_data.extend(value.flat)
@staticmethod
def add_output_shape(op_def, output_shape):
mace_output_shape = mace_pb2.OutputShape()
mace_output_shape.dims.extend(output_shape)
op_def.output_shape.extend([mace_output_shape])
def add_stride_pad_kernel_arg(self, param, op_def):
try:
if len(param.stride) > 1 or len(param.kernel_size) > 1 or len(
param.pad) > 1:
raise Exception(
'Mace does not support multiple stride/kernel_size/pad')
stride = [param.stride[0],
param.stride[0]] if len(param.stride) else [1, 1]
pad = [param.pad[0] * 2,
param.pad[0] * 2] if len(param.pad) else [0, 0]
kernel = [param.kernel_size[0], param.kernel_size[0]] if len(
param.kernel_size) else [0, 0]
except TypeError:
stride = [param.stride, param.stride]
pad = [param.pad * 2, param.pad * 2]
kernel = [param.kernel_size, param.kernel_size]
if param.HasField("stride_h") or param.HasField("stride_w"):
stride = [param.stride_h, param.stride_w]
# Pad
if param.HasField("pad_h") or param.HasField("pad_w"):
pad = [param.pad_h * 2, param.pad_w * 2]
if op_def is not None:
strides_arg = op_def.arg.add()
strides_arg.name = 'strides'
strides_arg.ints.extend(stride)
padding_arg = op_def.arg.add()
padding_arg.name = 'padding_values'
padding_arg.ints.extend(pad)
if op_def.type == 'Pooling':
if param.HasField("kernel_h") or param.HasField("kernel_w"):
kernel = [param.kernel_h, param.kernel_w]
return pad, stride, kernel
def convert_conv2d(self, op):
param = op.layer.convolution_param
is_depthwise = False
if param.HasField('group'):
if param.group == op.data[0].shape[0] and op.data[0].shape[1] == 1:
is_depthwise = True
else:
raise Exception("Mace do not support group convolution yet")
if is_depthwise:
op_def = self.CommonConvert(op, 'DepthwiseConv2d')
else:
op_def = self.CommonConvert(op, 'Conv2D')
# Add filter
weight_tensor_name = op.name + '_weight:0'
if self.device == 'neon':
weight_data = op.data[0]
else:
# OIHW -> HWOI
weight_data = op.data[0].transpose((2, 3, 0, 1))
self.add_tensor(weight_tensor_name, weight_data)
if self.device == 'gpu':
buffer_type = "DW_CONV2D_FILTER" if is_depthwise else "CONV2D_FILTER"
output_name = self.add_buffer_to_image(weight_tensor_name, buffer_type)
op_def.input.extend([output_name])
else:
op_def.input.extend([weight_tensor_name])
# Add Bias
if len(op.data) == 2:
bias_tensor_name = op.name + '_bias:0'
bias_data = op.data[1].reshape(-1)
self.add_tensor(bias_tensor_name, bias_data)
if self.device == 'gpu':
output_name = self.add_buffer_to_image(bias_tensor_name, "ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([bias_tensor_name])
paddings, strides, _ = self.add_stride_pad_kernel_arg(param, op_def)
dilations = [1, 1]
if len(param.dilation) > 0:
dilation_arg = op_def.arg.add()
dilation_arg.name = 'dilations'
if len(param.dilation) == 1:
dilations = [param.dilation[0], param.dilation[0]]
elif len(param.dilation) == 2:
dilations = [param.dilation[0], param.dilation[1]]
dilation_arg.ints.extend(dilations)
final_op = op
self.resolved_ops.add(op.name)
input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
output_shape = Shapes.conv_pool_shape(op.get_single_parent().output_shape_map[op.layer.bottom[0]],
weight_data.shape,
paddings, strides, dilations,
math.floor, input_format)
op.output_shape_map[op.layer.top[0]] = output_shape
if len(self.ops_map[final_op.name].children) == 1 \
and self.ops_map[final_op.name].children[0].type in activation_name_map:
activation_op = self.ops_map[final_op.name].children[0]
if not is_depthwise:
op_def.type = "FusedConv2D"
fused_act_arg = op_def.arg.add()
fused_act_arg.name = 'activation'
fused_act_arg.s = activation_name_map[activation_op.type]
final_op = activation_op
final_op.output_shape_map[final_op.layer.top[0]] = output_shape
self.resolved_ops.add(activation_op.name)
op_def.output.extend([final_op.name+':0'])
self.add_output_shape(op_def, output_shape)
self.net_def.op.extend([op_def])
def check_winograd_conv(self, op):
# TODO: support winograd conv on neon
if self.device == 'neon':
return False
param = op.layer.convolution_param
filter_shape = np.asarray(op.data[0].shape)
if self.device != 'neon':
filter_shape = filter_shape[[2, 3, 0, 1]] # OIHW -> HWOI
paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None)
dilations = [1, 1]
if len(param.dilation) > 0:
if len(param.dilation) == 1:
dilations = [param.dilation[0], param.dilation[0]]
elif len(param.dilation) == 2:
dilations = [param.dilation[0], param.dilation[1]]
input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
output_shape = Shapes.conv_pool_shape(
op.get_single_parent().output_shape_map[op.layer.bottom[0]],
filter_shape, paddings, strides, dilations, math.floor, input_format)
width = output_shape[0] * ((output_shape[1] + 1)/2) * ((output_shape[2]+1)/2)
if self.winograd and dilations[0] == 1 and (dilations[0] == dilations[1]) and \
(strides[0] == 1) and (strides[0] == strides[1]):
if self.device == 'gpu':
return filter_shape[0] == 3 and (filter_shape[0] == filter_shape[1]) and \
(16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \
(16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \
(width < OPENCL_IMAGE_MAX_SIZE)
elif self.device == 'neon':
return filter_shape[2] == 3 and (filter_shape[2] == filter_shape[3])
return False
def convert_winograd_conv(self, op):
# Add filter
weight_tensor_name = op.name + '_weight:0'
self.add_tensor(weight_tensor_name, op.data[0])
buffer_type = "WINOGRAD_FILTER"
filter_name = self.add_buffer_to_image(weight_tensor_name, buffer_type)
param = op.layer.convolution_param
paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None)
filter_shape = np.asarray(op.data[0].shape)
if self.device != 'neon':
filter_shape = filter_shape[[2, 3, 0, 1]] # OIHW -> HWOI
input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
output_shape = Shapes.conv_pool_shape(
op.get_single_parent().output_shape_map[op.layer.bottom[0]],
filter_shape, paddings, strides, [1, 1], math.floor, input_format)
# Input transform
wt_op = mace_pb2.OperatorDef()
arg = wt_op.arg.add()
arg.name = 'T'
arg.i = self.dt
padding_arg = wt_op.arg.add()
padding_arg.name = 'padding_values'
padding_arg.ints.extend(paddings)
wt_op.name = op.name + '_input_transform'
wt_op.type = 'WinogradTransform'
wt_op.input.extend([name+':0' for name in self.inputs_map[op.name]])
wt_output_name = wt_op.name + ":0"
wt_op.output.extend([wt_output_name])
wt_output_shape = mace_pb2.OutputShape()
if self.device != 'neon':
wt_output_width = output_shape[0] * ((output_shape[1] + 1)/2) * ((output_shape[2]+1)/2)
wt_output_shape.dims.extend([16, filter_shape[3], wt_output_width, 1])
else:
wt_output_width = output_shape[0] * ((output_shape[2] + 1)/2) * ((output_shape[3]+1)/2)
wt_output_shape.dims.extend([16, filter_shape[1], wt_output_width, 1])
wt_op.output_shape.extend([wt_output_shape])
# MatMul
matmul_op = mace_pb2.OperatorDef()
arg = matmul_op.arg.add()
arg.name = 'T'
arg.i = self.dt
matmul_op.name = op.name + '_matmul'
matmul_op.type = 'MatMul'
matmul_op.input.extend([filter_name, wt_output_name])
matmul_output_name = matmul_op.name + ":0"
matmul_op.output.extend([matmul_output_name])
matmul_output_shape = mace_pb2.OutputShape()
if self.device != 'neon':
matmul_output_shape.dims.extend([16, filter_shape[2], wt_output_width, 1])
else:
matmul_output_shape.dims.extend([16, filter_shape[0], wt_output_width, 1])
matmul_op.output_shape.extend([matmul_output_shape])
# Inverse transform
iwt_op = mace_pb2.OperatorDef()
arg = iwt_op.arg.add()
arg.name = 'T'
arg.i = self.dt
batch_arg = iwt_op.arg.add()
batch_arg.name = 'batch'
batch_arg.i = output_shape[0]
height_arg = iwt_op.arg.add()
height_arg.name = 'height'
height_arg.i = output_shape[1] if self.device != 'neon' else output_shape[2]
width_arg = iwt_op.arg.add()
width_arg.name = 'width'
width_arg.i = output_shape[2] if self.device != 'neon' else output_shape[3]
iwt_op.name = op.name + '_inverse_transform'
iwt_op.type = 'WinogradInverseTransform'
iwt_op.input.extend([matmul_output_name])
# Add Bias
if len(op.data) == 2:
bias_tensor_name = op.name + '_bias:0'
bias_data = op.data[1].reshape(-1)
self.add_tensor(bias_tensor_name, bias_data)
output_name = self.add_buffer_to_image(bias_tensor_name, "ARGUMENT")
iwt_op.input.extend([output_name])
final_op = op
final_op.output_shape_map[final_op.layer.top[0]] = output_shape
self.resolved_ops.add(op.name)
if len(self.ops_map[final_op.name].children) == 1 \
and self.ops_map[final_op.name].children[0].type in activation_name_map:
activation_op = self.ops_map[final_op.name].children[0]
fused_act_arg = iwt_op.arg.add()
fused_act_arg.name = 'activation'
fused_act_arg.s = activation_name_map[activation_op.type]
final_op = activation_op
final_op.output_shape_map[final_op.layer.top[0]] = output_shape
self.resolved_ops.add(activation_op.name)
iwt_op.output.extend([final_op.name+':0'])
self.add_output_shape(iwt_op, output_shape)
self.net_def.op.extend([wt_op, matmul_op, iwt_op])
def convert_batchnorm(self, op):
if len(op.children) != 1 or op.children[0].type != 'Scale':
raise Exception('Currently only BatchNorm+Scale is supported')
op_def = self.CommonConvert(op, 'FoldedBatchNorm')
scale_op = op.children[0]
epsilon_value = op.layer.batch_norm_param.eps
if op.data[2][0] != 0:
mean_value = (1. / op.data[2][0]) * op.data[0]
var_value = (1. / op.data[2][0]) * op.data[1]
else:
raise RuntimeError('scalar is zero.')
gamma_value = scale_op.data[0]
beta_value = np.zeros_like(mean_value)
if len(scale_op.data) == 2:
beta_value = scale_op.data[1]
scale_value = (
(1.0 / np.vectorize(math.sqrt)(var_value + epsilon_value)) *
gamma_value).reshape(-1)
offset_value = ((-mean_value * scale_value) + beta_value).reshape(-1)
input_names = [op.name+'_scale:0', op.name+'_offset:0']
self.add_tensor(input_names[0], scale_value)
self.add_tensor(input_names[1], offset_value)
if self.device == 'gpu':
for name in input_names:
output_name = self.add_buffer_to_image(name, "ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([name for name in input_names])
self.resolved_ops.add(op.name)
self.resolved_ops.add(scale_op.name)
final_op = scale_op
output_shape = op.get_single_parent().output_shape_map[op.layer.bottom[0]]
if len(self.ops_map[final_op.name].children) == 1 \
and self.ops_map[final_op.name].children[0].type in activation_name_map:
activation_op = self.ops_map[final_op.name].children[0]
fused_act_arg = op_def.arg.add()
fused_act_arg.name = 'activation'
fused_act_arg.s = activation_name_map[activation_op.type]
final_op = activation_op
final_op.output_shape_map[final_op.layer.top[0]] = output_shape
self.resolved_ops.add(activation_op.name)
op_def.output.extend([final_op.name + ':0'])
self.add_output_shape(op_def, output_shape)
self.net_def.op.extend([op_def])
def convert_inner_product(self, op):
param = op.layer.inner_product_param
try:
if param.axis != 1 or param.transpose:
raise ValueError('Do not support non-default axis and transpose '
'case for inner product')
except AttributeError:
pass
op_def = self.CommonConvert(op, 'FC')
weight_tensor_name = op.name + '_weight:0'
if op.data[0].ndim not in [2, 4]:
raise ValueError('Unexpected weight ndim.')
if op.data[0].ndim == 4 and list(op.data[0].shape[:2]) != [1, 1]:
raise ValueError('Only support 4D weight with shape [1, 1, *, *]')
input_shape = op.get_single_parent().output_shape_map[op.layer.bottom[0]]
weight_data = op.data[0].reshape(-1, op.data[0].shape[-1])
assert weight_data.shape[1] == (input_shape[1] * input_shape[2] * input_shape[3])
if self.device != 'neon':
weight_data = weight_data.reshape(-1, input_shape[3], input_shape[1], input_shape[2])
weight_data = weight_data.transpose((0, 2, 3, 1)).reshape(weight_data.shape[0], -1)
self.add_tensor(weight_tensor_name, weight_data)
if self.device == 'gpu':
if (weight_data.shape[0] + 3) / 4 > OPENCL_IMAGE_MAX_SIZE \
and (weight_data.shape[1] + 3) / 4 > OPENCL_IMAGE_MAX_SIZE:
raise Exception('Mace gpu does not support FC with weight shape: '
+ str(weight_data.shape))
if input_shape[3] % 4 == 0:
buffer_type = "WEIGHT_WIDTH"
else:
buffer_type = "WEIGHT_HEIGHT"
weight_type_arg = op_def.arg.add()
weight_type_arg.name = 'weight_type'
weight_type_arg.i = buffer_type_map['WEIGHT_HEIGHT']
if buffer_type == "WEIGHT_HEIGHT" and \
(weight_data.shape[0] + 3) / 4 > OPENCL_IMAGE_MAX_SIZE:
raise Exception('Mace gpu does not support FC with weight shape: '
+ str(weight_data.shape))
output_name = self.add_buffer_to_image(weight_tensor_name, buffer_type)
op_def.input.extend([output_name])
else:
op_def.input.extend([weight_tensor_name])
# Add Bias
if len(op.data) == 2:
bias_tensor_name = op.name + '_bias:0'
bias_data = op.data[1].reshape(-1)
self.add_tensor(bias_tensor_name, bias_data)
if self.device == 'gpu':
output_name = self.add_buffer_to_image(bias_tensor_name, "ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([bias_tensor_name])
self.resolved_ops.add(op.name)
output_shape = Shapes.fully_connected_shape(input_shape, weight_data.shape)
op.output_shape_map[op.layer.top[0]] = output_shape
final_op = op
if len(self.ops_map[final_op.name].children) == 1 \
and self.ops_map[final_op.name].children[0].type in activation_name_map:
activation_op = self.ops_map[final_op.name].children[0]
fused_act_arg = op_def.arg.add()
fused_act_arg.name = 'activation'
fused_act_arg.s = activation_name_map[activation_op.type]
final_op = activation_op
final_op.output_shape_map[final_op.layer.top[0]] = output_shape
self.resolved_ops.add(activation_op.name)
op_def.output.extend([final_op.name + ':0'])
self.add_output_shape(op_def, output_shape)
self.net_def.op.extend([op_def])
def convert_pooling(self, op):
op_def = self.CommonConvert(op, 'Pooling')
param = op.layer.pooling_param
paddings, strides, kernels = self.add_stride_pad_kernel_arg(param, op_def)
if param.pool == caffe_pb2.PoolingParameter.MAX:
pooling_type = "MaxPool"
elif param.pool == caffe_pb2.PoolingParameter.AVE:
pooling_type = "AvgPool"
pooling_type_arg = op_def.arg.add()
pooling_type_arg.name = 'pooling_type'
pooling_type_arg.i = pooling_type_mode[pooling_type]
input_shape = op.get_single_parent().output_shape_map[op.layer.bottom[0]]
if param.HasField('global_pooling') and param.global_pooling:
kernels = [input_shape[1], input_shape[2]]
kernel_arg = op_def.arg.add()
kernel_arg.name = 'kernels'
kernel_arg.ints.extend(kernels)
filter_shape = [kernels[0], kernels[1], input_shape[3], input_shape[3]] \
if self.device != 'neon' else \
[input_shape[1], input_shape[1], kernels[0], kernels[1]]
input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
output_shape = Shapes.conv_pool_shape(input_shape, filter_shape,
paddings, strides, [1, 1], math.ceil, input_format)
op.output_shape_map[op.layer.top[0]] = output_shape
op_def.output.extend([op.name + ':0'])
self.add_output_shape(op_def, output_shape)
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_activation(self, op):
op_def = self.CommonConvert(op, 'Activation')
activation_arg = op_def.arg.add()
activation_arg.name = 'activation'
activation_arg.s = activation_name_map[op.type]
op_def.output.extend([op.name + ':0'])
output_shape = op.get_single_parent().output_shape_map[op.layer.bottom[0]]
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_prelu(self, op):
op_def = self.CommonConvert(op, 'Activation')
activation_arg = op_def.arg.add()
activation_arg.name = 'activation'
activation_arg.s = 'PRELU'
alpha_tensor_name = op.name + '_alpha:0'
alpha_data = op.data[0].reshape(-1)
self.add_tensor(alpha_tensor_name, alpha_data)
if self.device == 'gpu':
output_name = self.add_buffer_to_image(alpha_tensor_name, "ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([alpha_tensor_name])
op_def.output.extend([op.name + ':0'])
output_shape = op.get_single_parent().output_shape_map[op.layer.bottom[0]]
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_add(self, op):
op_def = self.CommonConvert(op, 'AddN')
op_def.output.extend([op.name + ':0'])
output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_concat(self, op):
op_def = self.CommonConvert(op, 'Concat')
axis_arg = op_def.arg.add()
axis_arg.name = 'axis'
axis_arg.i = 3 if self.device != 'neon' else 1
try:
if op.layer.concat_param.HasField('axis'):
axis_arg.i = op.layer.concat_param.axis
elif op.layer.concat_param.HasField('concat_dim'):
axis_arg.i = op.layer.concat_param.concat_dim
except AttributeError:
pass
# Add filter
weight_tensor_name = op.name + '_weight:0'
if self.device == 'neon':
weight_data = op.data[0]
else:
# OIHW -> HWOI
weight_data = op.data[0].transpose((2, 3, 0, 1))
self.add_tensor(weight_tensor_name, weight_data)
if self.device == 'gpu':
buffer_type = "DW_CONV2D_FILTER" \
if is_depthwise else "CONV2D_FILTER"
output_name = self.add_buffer_to_image(weight_tensor_name,
buffer_type)
op_def.input.extend([output_name])
else:
op_def.input.extend([weight_tensor_name])
# Add Bias
if len(op.data) == 2:
bias_tensor_name = op.name + '_bias:0'
bias_data = op.data[1].reshape(-1)
self.add_tensor(bias_tensor_name, bias_data)
if self.device == 'gpu':
output_name = self.add_buffer_to_image(bias_tensor_name,
"ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([bias_tensor_name])
paddings, strides, _ = self.add_stride_pad_kernel_arg(param, op_def)
dilations = [1, 1]
if len(param.dilation) > 0:
dilation_arg = op_def.arg.add()
dilation_arg.name = 'dilations'
if len(param.dilation) == 1:
dilations = [param.dilation[0], param.dilation[0]]
elif len(param.dilation) == 2:
dilations = [param.dilation[0], param.dilation[1]]
dilation_arg.ints.extend(dilations)
final_op = op
self.resolved_ops.add(op.name)
input_shapes = []
for i in range(len(op.parents)):
input_shapes.append(op.parents[i].output_shape_map[op.layer.bottom[i]])
output_shape = Shapes.concat_shape(input_shapes, axis_arg.i)
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
op_def.output.extend([op.name + ':0'])
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_eltwise(self, op):
op_def = self.CommonConvert(op, 'Eltwise')
param = op.layer.eltwise_param
type_arg = op_def.arg.add()
type_arg.name = 'type'
type_arg.i = param.operation
if len(param.coeff) > 0:
coeff_arg = op_def.arg.add()
coeff_arg.name = 'coeff'
coeff_arg.ints.extend(list(param.coeff))
output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
op_def.output.extend([op.name + ':0'])
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_slice(self, op):
op_def = self.CommonConvert(op, 'Slice')
if op.layer.HasField('slice_param'):
param = op.layer.slice_param
if param.HasField('axis') and param.axis != 1:
raise Exception('Mace does not support slice with axis ' + str(param.axis))
if len(param.slice_point) > 0:
raise Exception('Mace does not support slice with slice_point')
axis_arg = op_def.arg.add()
axis_arg.name = 'axis'
axis_arg.i = 3 if self.device != 'neon' else 1
input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
num_outputs = len(op.layer.top)
input_channels = input_shape[axis_arg.i]
if (input_channels % num_outputs) != 0 or \
(self.device == 'gpu' and ((input_channels / num_outputs) % 4 != 0)):
raise Exception('Mace does not support slice with input shape '
+ str(input_shape) + ' and number of outputs ' + str(num_outputs))
input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
output_shape = Shapes.slice_shape(input_shape, num_outputs, input_format)
for i in range(len(op.layer.top)):
op.output_shape_map[op.layer.top[i]] = output_shape
self.add_output_shape(op_def, output_shape)
op_def.output.extend([op.name + '_' + str(i) + ':0'])
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_normal_op(self, op):
op_def = self.CommonConvert(op, op.type)
output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
op_def.output.extend([op.name + ':0'])
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_reshape(self, op):
if self.device == 'neon':
op_def = self.CommonConvert(op, 'Reshape')
else:
op_def = self.CommonConvert(op, 'ReOrganize')
input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
output_shape = input_shape
shape_param = np.asarray(op.layer.reshape_param.shape.dim)
if self.device != 'neon':
shape_param = shape_param[[0, 3, 1, 2]]
for i in range(len(shape_param)):
if shape_param[i] != 0:
output_shape[i] = shape_param[i]
shape_arg = op_def.arg.add()
shape_arg.name = 'shape'
shape_arg.ints.extend(output_shape)
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
op_def.output.extend([op.name + ':0'])
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_proposal_op(self, op):
assert self.device == 'cpu'
op_def = self.CommonConvert(op, op.type)
if op.layer.HasField('proposal_param'):
proposal_param = op.layer.proposal_param
feat_stride_arg = op_def.arg.add()
feat_stride_arg.name = 'feat_stride'
feat_stride_arg.i = proposal_param.feat_stride
scales_arg = op_def.arg.add()
scales_arg.name = 'scales'
scales_arg.ints.extend(list(proposal_param.scales))
ratios_arg = op_def.arg.add()
ratios_arg.name = 'ratios'
ratios_arg.floats.extend(list(proposal_param.ratios))
output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
op_def.output.extend([op.name + ':0'])
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_psroi_align(self, op):
assert self.device == 'cpu'
op_def = self.CommonConvert(op, op.type)
if op.layer.HasField('psroi_align_param'):
psroi_align_param = op.layer.psroi_align_param
spatial_scale_arg = op_def.arg.add()
spatial_scale_arg.name = 'spatial_scale'
spatial_scale_arg.f = psroi_align_param.spatial_scale
output_dim_arg = op_def.arg.add()
output_dim_arg.name = 'output_dim'
output_dim_arg.i = psroi_align_param.output_dim
group_size_arg = op_def.arg.add()
group_size_arg.name = 'group_size'
group_size_arg.i = psroi_align_param.group_size
output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
op_def.output.extend([op.name + ':0'])
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def replace_in_out_name(self, input_names, output_names):
in_names = set([input_name + ":0" for input_name in input_names])
out_names = set([output_name + ":0" for output_name in output_names])
for op in self.net_def.op:
for i in range(len(op.input)):
if op.input[i] in in_names:
op.input[i] = MACE_INPUT_NODE_NAME + '_' + op.input[i]
if op.input[i] in out_names:
op.input[i] = MACE_OUTPUT_NODE_NAME + '_' + op.input[i]
for i in range(len(op.output)):
if op.output[i] in in_names:
op.output[i] = MACE_INPUT_NODE_NAME + '_' + op.output[i]
if op.output[i] in out_names:
op.output[i] = MACE_OUTPUT_NODE_NAME + '_' + op.output[i]
def add_input_op_shape(self, input_nodes, input_shapes):
assert len(input_nodes) == len(input_shapes)
for i in range(len(input_nodes)):
input_op = self.ops_map[input_nodes[i]]
input_shape = input_shapes[i] if self.device != 'neon' else \
[input_shapes[i][0], input_shapes[i][3], input_shapes[i][1], input_shapes[i][2]]
if input_op.layer is not None:
input_op.output_shape_map[input_op.layer.top[0]] = input_shape
else:
input_op.output_shape_map[input_op.name] = input_shape
def add_neon_input_transform(self, names):
for name in names:
new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
op_def.name = name
op_def.type = 'Transpose'
op_def.input.extend([new_input_name])
op_def.output.extend([name+':0'])
dims_arg = op_def.arg.add()
dims_arg.name = 'dims'
dims_arg.ints.extend([0, 3, 1, 2]) # NHWC -> NCHW
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
def add_neon_output_transform(self, names):
for name in names:
output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
op_def.name = output_name[:-2]
op_def.type = 'Transpose'
op_def.input.extend([name+':0'])
op_def.output.extend([output_name])
dims_arg = op_def.arg.add()
dims_arg.name = 'dims'
dims_arg.ints.extend([0, 2, 3, 1]) # NCHW -> NHWC
def convert(self, input_nodes, input_shapes, output_nodes):
if self.device == 'gpu':
self.add_input_transform(input_nodes)
if self.device == 'neon':
self.add_neon_input_transform(input_nodes)
assert self.ops[0].type == 'Input'
self.add_input_op_shape(input_nodes, input_shapes)
for op in self.ops:
if op.name in self.resolved_ops:
continue
if op.type == 'Input':
input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
output_shape = Shapes.conv_pool_shape(
op.get_single_parent().output_shape_map[op.layer.bottom[0]],
weight_data.shape, paddings, strides, dilations, math.floor,
input_format)
op.output_shape_map[op.layer.top[0]] = output_shape
if len(self.ops_map[final_op.name].children) == 1 and \
self.ops_map[final_op.name].children[0].type \
in activation_name_map:
activation_op = self.ops_map[final_op.name].children[0]
if not is_depthwise:
op_def.type = "FusedConv2D"
fused_act_arg = op_def.arg.add()
fused_act_arg.name = 'activation'
fused_act_arg.s = activation_name_map[activation_op.type]
final_op = activation_op
final_op.output_shape_map[final_op.layer.top[0]] = output_shape
self.resolved_ops.add(activation_op.name)
op_def.output.extend([final_op.name + ':0'])
self.add_output_shape(op_def, output_shape)
self.net_def.op.extend([op_def])
def check_winograd_conv(self, op):
# TODO: support winograd conv on neon
if self.device == 'neon':
return False
param = op.layer.convolution_param
filter_shape = np.asarray(op.data[0].shape)
if self.device != 'neon':
filter_shape = filter_shape[[2, 3, 0, 1]] # OIHW -> HWOI
paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None)
dilations = [1, 1]
if len(param.dilation) > 0:
if len(param.dilation) == 1:
dilations = [param.dilation[0], param.dilation[0]]
elif len(param.dilation) == 2:
dilations = [param.dilation[0], param.dilation[1]]
input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
output_shape = Shapes.conv_pool_shape(
op.get_single_parent().output_shape_map[op.layer.bottom[0]],
filter_shape, paddings, strides, dilations, math.floor,
input_format)
width = output_shape[0] * ((output_shape[1] + 1) / 2) * ((
output_shape[2] + 1) / 2)
if self.winograd and dilations[0] == 1 and \
(dilations[0] == dilations[1]) and \
(strides[0] == 1) and (strides[0] == strides[1]):
if self.device == 'gpu':
return filter_shape[0] == 3 and \
(filter_shape[0] == filter_shape[1]) and \
(16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \
(16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \
(width < OPENCL_IMAGE_MAX_SIZE)
elif self.device == 'neon':
return filter_shape[2] == 3 and (
filter_shape[2] == filter_shape[3])
return False
def convert_winograd_conv(self, op):
# Add filter
weight_tensor_name = op.name + '_weight:0'
self.add_tensor(weight_tensor_name, op.data[0])
buffer_type = "WINOGRAD_FILTER"
filter_name = self.add_buffer_to_image(weight_tensor_name, buffer_type)
param = op.layer.convolution_param
paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None)
filter_shape = np.asarray(op.data[0].shape)
if self.device != 'neon':
filter_shape = filter_shape[[2, 3, 0, 1]] # OIHW -> HWOI
input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
output_shape = Shapes.conv_pool_shape(
op.get_single_parent().output_shape_map[op.layer.bottom[0]],
filter_shape, paddings, strides, [1, 1], math.floor, input_format)
# Input transform
wt_op = mace_pb2.OperatorDef()
arg = wt_op.arg.add()
arg.name = 'T'
arg.i = self.dt
padding_arg = wt_op.arg.add()
padding_arg.name = 'padding_values'
padding_arg.ints.extend(paddings)
wt_op.name = op.name + '_input_transform'
wt_op.type = 'WinogradTransform'
wt_op.input.extend([name + ':0' for name in self.inputs_map[op.name]])
wt_output_name = wt_op.name + ":0"
wt_op.output.extend([wt_output_name])
wt_output_shape = mace_pb2.OutputShape()
if self.device != 'neon':
wt_output_width = output_shape[0] * ((
output_shape[1] + 1) / 2) * ((output_shape[2] + 1) / 2)
wt_output_shape.dims.extend(
[16, filter_shape[3], wt_output_width, 1])
else:
wt_output_width = output_shape[0] * ((
output_shape[2] + 1) / 2) * ((output_shape[3] + 1) / 2)
wt_output_shape.dims.extend(
[16, filter_shape[1], wt_output_width, 1])
wt_op.output_shape.extend([wt_output_shape])
# MatMul
matmul_op = mace_pb2.OperatorDef()
arg = matmul_op.arg.add()
arg.name = 'T'
arg.i = self.dt
matmul_op.name = op.name + '_matmul'
matmul_op.type = 'MatMul'
matmul_op.input.extend([filter_name, wt_output_name])
matmul_output_name = matmul_op.name + ":0"
matmul_op.output.extend([matmul_output_name])
matmul_output_shape = mace_pb2.OutputShape()
if self.device != 'neon':
matmul_output_shape.dims.extend(
[16, filter_shape[2], wt_output_width, 1])
else:
matmul_output_shape.dims.extend(
[16, filter_shape[0], wt_output_width, 1])
matmul_op.output_shape.extend([matmul_output_shape])
# Inverse transform
iwt_op = mace_pb2.OperatorDef()
arg = iwt_op.arg.add()
arg.name = 'T'
arg.i = self.dt
batch_arg = iwt_op.arg.add()
batch_arg.name = 'batch'
batch_arg.i = output_shape[0]
height_arg = iwt_op.arg.add()
height_arg.name = 'height'
height_arg.i = output_shape[
1] if self.device != 'neon' else output_shape[2]
width_arg = iwt_op.arg.add()
width_arg.name = 'width'
width_arg.i = output_shape[
2] if self.device != 'neon' else output_shape[3]
iwt_op.name = op.name + '_inverse_transform'
iwt_op.type = 'WinogradInverseTransform'
iwt_op.input.extend([matmul_output_name])
# Add Bias
if len(op.data) == 2:
bias_tensor_name = op.name + '_bias:0'
bias_data = op.data[1].reshape(-1)
self.add_tensor(bias_tensor_name, bias_data)
output_name = self.add_buffer_to_image(bias_tensor_name,
"ARGUMENT")
iwt_op.input.extend([output_name])
final_op = op
final_op.output_shape_map[final_op.layer.top[0]] = output_shape
self.resolved_ops.add(op.name)
elif op.type == 'Convolution':
if self.check_winograd_conv(op):
self.convert_winograd_conv(op)
if len(self.ops_map[final_op.name].children) == 1 and \
self.ops_map[final_op.name].children[0].type \
in activation_name_map:
activation_op = self.ops_map[final_op.name].children[0]
fused_act_arg = iwt_op.arg.add()
fused_act_arg.name = 'activation'
fused_act_arg.s = activation_name_map[activation_op.type]
final_op = activation_op
final_op.output_shape_map[final_op.layer.top[0]] = output_shape
self.resolved_ops.add(activation_op.name)
iwt_op.output.extend([final_op.name + ':0'])
self.add_output_shape(iwt_op, output_shape)
self.net_def.op.extend([wt_op, matmul_op, iwt_op])
def convert_batchnorm(self, op):
if len(op.children) != 1 or op.children[0].type != 'Scale':
raise Exception('Currently only BatchNorm+Scale is supported')
op_def = self.CommonConvert(op, 'FoldedBatchNorm')
scale_op = op.children[0]
epsilon_value = op.layer.batch_norm_param.eps
if op.data[2][0] != 0:
mean_value = (1. / op.data[2][0]) * op.data[0]
var_value = (1. / op.data[2][0]) * op.data[1]
else:
self.convert_conv2d(op)
elif op.type == 'BatchNorm':
self.convert_batchnorm(op)
elif op.type == 'InnerProduct':
self.convert_inner_product(op)
elif op.type == 'Pooling':
self.convert_pooling(op)
elif op.type == 'PReLU':
self.convert_prelu(op)
elif op.type in ['ReLU', 'Sigmoid', 'TanH']:
self.convert_activation(op)
elif op.type == 'Add':
self.convert_add(op)
elif op.type == 'Concat':
self.convert_concat(op)
elif op.type == 'Eltwise':
self.convert_eltwise(op)
elif op.type == 'Slice':
self.convert_slice(op)
elif op.type == 'Reshape':
self.convert_reshape(op)
elif op.type == 'Proposal':
self.convert_proposal_op(op)
elif op.type == 'PSROIAlign':
self.convert_psroi_align(op)
elif op.type in ['Softmax']:
self.convert_normal_op(op)
else:
raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type))
if self.device == 'gpu':
self.add_output_transform(output_nodes)
if self.device == 'cpu':
self.replace_in_out_name(input_nodes, output_nodes)
if self.device == 'neon':
self.add_neon_output_transform(output_nodes)
for op in self.ops:
if op.name not in self.resolved_ops:
print 'Unresolved Op: %s with type %s' % (op.name, op.type)
def convert_to_mace_pb(model_file, weight_file, input_node_str, input_shape_str,
output_node_str, data_type, device, winograd):
net_def = mace_pb2.NetDef()
dt = data_type_map[data_type]
caffe_net = caffe_pb2.NetParameter()
with open(model_file, "r") as f:
google.protobuf.text_format.Merge(str(f.read()), caffe_net)
weights = caffe_pb2.NetParameter()
with open(weight_file, "rb") as f:
weights.MergeFromString(f.read())
input_nodes = [x for x in input_node_str.split(',')]
input_shapes = []
if input_shape_str != "":
input_shape_strs = [x for x in input_shape_str.split(':')]
for shape_str in input_shape_strs:
input_shapes.extend([[int(x) for x in shape_str.split(',')]])
output_nodes = [x for x in output_node_str.split(',')]
assert len(input_nodes) == len(input_shapes)
converter = CaffeConverter(caffe_net, weights, net_def, dt, device, winograd)
converter.convert(input_nodes, input_shapes, output_nodes)
print "PB Converted."
if device == 'gpu':
print "start optimize memory."
mem_optimizer = memory_optimizer.MemoryOptimizer(net_def)
mem_optimizer.optimize()
print "Memory optimization done."
return net_def
raise RuntimeError('scalar is zero.')
gamma_value = scale_op.data[0]
beta_value = np.zeros_like(mean_value)
if len(scale_op.data) == 2:
beta_value = scale_op.data[1]
scale_value = ((
1.0 / np.vectorize(math.sqrt)(var_value + epsilon_value)) *
gamma_value).reshape(-1)
offset_value = ((-mean_value * scale_value) + beta_value).reshape(-1)
input_names = [op.name + '_scale:0', op.name + '_offset:0']
self.add_tensor(input_names[0], scale_value)
self.add_tensor(input_names[1], offset_value)
if self.device == 'gpu':
for name in input_names:
output_name = self.add_buffer_to_image(name, "ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([name for name in input_names])
self.resolved_ops.add(op.name)
self.resolved_ops.add(scale_op.name)
final_op = scale_op
output_shape = op.get_single_parent().output_shape_map[op.layer.bottom[
0]]
if len(self.ops_map[final_op.name].children) == 1 \
and self.ops_map[final_op.name].children[0].type \
in activation_name_map:
activation_op = self.ops_map[final_op.name].children[0]
fused_act_arg = op_def.arg.add()
fused_act_arg.name = 'activation'
fused_act_arg.s = activation_name_map[activation_op.type]
final_op = activation_op
final_op.output_shape_map[final_op.layer.top[0]] = output_shape
self.resolved_ops.add(activation_op.name)
op_def.output.extend([final_op.name + ':0'])
self.add_output_shape(op_def, output_shape)
self.net_def.op.extend([op_def])
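The scale/offset computed above fold BatchNorm and the following Scale layer into a single per-channel affine op. A minimal numpy sketch (all statistics below are made up for illustration, not taken from any model) that checks the algebra:

    # Sketch only: verify scale/offset folding equals BN followed by Scale.
    import math
    import numpy as np

    mean = np.array([0.5])    # made-up BN mean
    var = np.array([4.0])     # made-up BN variance
    gamma = np.array([2.0])   # Scale layer weight
    beta = np.array([0.1])    # Scale layer bias
    eps = 1e-5
    x = np.array([1.5])

    scale = (1.0 / np.vectorize(math.sqrt)(var + eps)) * gamma
    offset = -mean * scale + beta
    assert np.allclose(x * scale + offset,
                       gamma * (x - mean) / np.sqrt(var + eps) + beta)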
def convert_inner_product(self, op):
param = op.layer.inner_product_param
try:
if param.axis != 1 or param.transpose:
raise ValueError(
'Do not support non-default axis and transpose '
'case for inner product')
except AttributeError:
pass
op_def = self.CommonConvert(op, 'FC')
weight_tensor_name = op.name + '_weight:0'
if op.data[0].ndim not in [2, 4]:
raise ValueError('Unexpected weight ndim.')
if op.data[0].ndim == 4 and list(op.data[0].shape[:2]) != [1, 1]:
raise ValueError(
'Only support 4D weight with shape [1, 1, *, *]')
input_shape = op.get_single_parent().output_shape_map[op.layer.bottom[
0]]
weight_data = op.data[0].reshape(-1, op.data[0].shape[-1])
assert weight_data.shape[1] == (
input_shape[1] * input_shape[2] * input_shape[3])
if self.device != 'neon':
weight_data = weight_data.reshape(-1, input_shape[3],
input_shape[1], input_shape[2])
weight_data = weight_data.transpose((0, 2, 3, 1)).reshape(
weight_data.shape[0], -1)
self.add_tensor(weight_tensor_name, weight_data)
if self.device == 'gpu':
if (weight_data.shape[0] + 3) / 4 > OPENCL_IMAGE_MAX_SIZE and \
(weight_data.shape[1] + 3) / 4 > OPENCL_IMAGE_MAX_SIZE:
raise Exception(
'Mace gpu does not support FC with weight shape: ' +
str(weight_data.shape))
if input_shape[3] % 4 == 0:
buffer_type = "WEIGHT_WIDTH"
else:
buffer_type = "WEIGHT_HEIGHT"
weight_type_arg = op_def.arg.add()
weight_type_arg.name = 'weight_type'
weight_type_arg.i = buffer_type_map['WEIGHT_HEIGHT']
if buffer_type == "WEIGHT_HEIGHT" and \
(weight_data.shape[0] + 3) / 4 > OPENCL_IMAGE_MAX_SIZE:
raise Exception(
'Mace gpu does not support FC with weight shape: ' +
str(weight_data.shape))
output_name = self.add_buffer_to_image(weight_tensor_name,
buffer_type)
op_def.input.extend([output_name])
else:
op_def.input.extend([weight_tensor_name])
# Add Bias
if len(op.data) == 2:
bias_tensor_name = op.name + '_bias:0'
bias_data = op.data[1].reshape(-1)
self.add_tensor(bias_tensor_name, bias_data)
if self.device == 'gpu':
output_name = self.add_buffer_to_image(bias_tensor_name,
"ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([bias_tensor_name])
self.resolved_ops.add(op.name)
output_shape = Shapes.fully_connected_shape(input_shape,
weight_data.shape)
op.output_shape_map[op.layer.top[0]] = output_shape
final_op = op
if len(self.ops_map[final_op.name].children) == 1 \
and self.ops_map[final_op.name].children[0].type \
in activation_name_map:
activation_op = self.ops_map[final_op.name].children[0]
fused_act_arg = op_def.arg.add()
fused_act_arg.name = 'activation'
fused_act_arg.s = activation_name_map[activation_op.type]
final_op = activation_op
final_op.output_shape_map[final_op.layer.top[0]] = output_shape
self.resolved_ops.add(activation_op.name)
op_def.output.extend([final_op.name + ':0'])
self.add_output_shape(op_def, output_shape)
self.net_def.op.extend([op_def])
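For non-NEON devices the FC weight above is re-laid-out from Caffe's channel-first flattening (O, C*H*W) to NHWC order before being flattened again. A small sketch with made-up dimensions:

    # Sketch only: reorder FC weight rows from (O, C*H*W) to (O, H*W*C).
    import numpy as np

    o, c, h, w = 2, 3, 4, 5  # made-up output dim and input C/H/W
    weight = np.arange(o * c * h * w).reshape(o, c * h * w)
    reordered = weight.reshape(o, c, h, w).transpose(
        (0, 2, 3, 1)).reshape(o, -1)
    assert reordered.shape == (o, h * w * c)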
def convert_pooling(self, op):
op_def = self.CommonConvert(op, 'Pooling')
param = op.layer.pooling_param
paddings, strides, kernels = self.add_stride_pad_kernel_arg(
param, op_def)
if param.pool == caffe_pb2.PoolingParameter.MAX:
pooling_type = "MaxPool"
elif param.pool == caffe_pb2.PoolingParameter.AVE:
pooling_type = "AvgPool"
pooling_type_arg = op_def.arg.add()
pooling_type_arg.name = 'pooling_type'
pooling_type_arg.i = pooling_type_mode[pooling_type]
input_shape = op.get_single_parent().output_shape_map[op.layer.bottom[
0]]
if param.HasField('global_pooling') and param.global_pooling:
kernels = [input_shape[1], input_shape[2]]
kernel_arg = op_def.arg.add()
kernel_arg.name = 'kernels'
kernel_arg.ints.extend(kernels)
if self.device != 'neon':
filter_shape = [
kernels[0], kernels[1], input_shape[3], input_shape[3]
]
else:
filter_shape = [
input_shape[1], input_shape[1], kernels[0], kernels[1]
]
input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
output_shape = Shapes.conv_pool_shape(input_shape, filter_shape,
paddings, strides, [1, 1],
math.ceil, input_format)
op.output_shape_map[op.layer.top[0]] = output_shape
op_def.output.extend([op.name + ':0'])
self.add_output_shape(op_def, output_shape)
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_activation(self, op):
op_def = self.CommonConvert(op, 'Activation')
activation_arg = op_def.arg.add()
activation_arg.name = 'activation'
activation_arg.s = activation_name_map[op.type]
op_def.output.extend([op.name + ':0'])
output_shape = op.get_single_parent().output_shape_map[op.layer.bottom[
0]]
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_prelu(self, op):
op_def = self.CommonConvert(op, 'Activation')
activation_arg = op_def.arg.add()
activation_arg.name = 'activation'
activation_arg.s = 'PRELU'
alpha_tensor_name = op.name + '_alpha:0'
alpha_data = op.data[0].reshape(-1)
self.add_tensor(alpha_tensor_name, alpha_data)
if self.device == 'gpu':
output_name = self.add_buffer_to_image(alpha_tensor_name,
"ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([alpha_tensor_name])
op_def.output.extend([op.name + ':0'])
output_shape = op.get_single_parent().output_shape_map[op.layer.bottom[
0]]
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_add(self, op):
op_def = self.CommonConvert(op, 'AddN')
op_def.output.extend([op.name + ':0'])
output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_concat(self, op):
op_def = self.CommonConvert(op, 'Concat')
axis_arg = op_def.arg.add()
axis_arg.name = 'axis'
axis_arg.i = 3 if self.device != 'neon' else 1
try:
if op.layer.concat_param.HasField('axis'):
axis_arg.i = op.layer.concat_param.axis
elif op.layer.concat_param.HasField('concat_dim'):
axis_arg.i = op.layer.concat_param.concat_dim
except AttributeError:
pass
input_shapes = []
for i in range(len(op.parents)):
input_shapes.append(
op.parents[i].output_shape_map[op.layer.bottom[i]])
output_shape = Shapes.concat_shape(input_shapes, axis_arg.i)
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
op_def.output.extend([op.name + ':0'])
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_eltwise(self, op):
op_def = self.CommonConvert(op, 'Eltwise')
param = op.layer.eltwise_param
type_arg = op_def.arg.add()
type_arg.name = 'type'
type_arg.i = param.operation
if len(param.coeff) > 0:
coeff_arg = op_def.arg.add()
coeff_arg.name = 'coeff'
coeff_arg.ints.extend(list(param.coeff))
output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
op_def.output.extend([op.name + ':0'])
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_slice(self, op):
op_def = self.CommonConvert(op, 'Slice')
if op.layer.HasField('slice_param'):
param = op.layer.slice_param
if param.HasField('axis') and param.axis != 1:
raise Exception(
'Mace does not support slice with axis ' + str(param.axis))
if len(param.slice_point) > 0:
raise Exception('Mace does not support slice with slice_point')
axis_arg = op_def.arg.add()
axis_arg.name = 'axis'
axis_arg.i = 3 if self.device != 'neon' else 1
input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
num_outputs = len(op.layer.top)
input_channels = input_shape[axis_arg.i]
if (input_channels % num_outputs) != 0 or \
(self.device == 'gpu' and
((input_channels / num_outputs) % 4 != 0)):
raise Exception(
'Mace does not support slice with input shape ' +
str(input_shape) + ' and number of outputs ' + str(num_outputs))
input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
output_shape = Shapes.slice_shape(input_shape, num_outputs,
input_format)
for i in range(len(op.layer.top)):
op.output_shape_map[op.layer.top[i]] = output_shape
self.add_output_shape(op_def, output_shape)
op_def.output.extend([op.name + '_' + str(i) + ':0'])
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_normal_op(self, op):
op_def = self.CommonConvert(op, op.type)
output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
op_def.output.extend([op.name + ':0'])
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_reshape(self, op):
if self.device == 'neon':
op_def = self.CommonConvert(op, 'Reshape')
else:
op_def = self.CommonConvert(op, 'ReOrganize')
input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
output_shape = input_shape
shape_param = np.asarray(op.layer.reshape_param.shape.dim)
if self.device != 'neon':
shape_param = shape_param[[0, 3, 1, 2]]
for i in range(len(shape_param)):
if shape_param[i] != 0:
output_shape[i] = shape_param[i]
shape_arg = op_def.arg.add()
shape_arg.name = 'shape'
shape_arg.ints.extend(output_shape)
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
op_def.output.extend([op.name + ':0'])
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_proposal_op(self, op):
assert self.device == 'cpu'
op_def = self.CommonConvert(op, op.type)
if op.layer.HasField('proposal_param'):
proposal_param = op.layer.proposal_param
feat_stride_arg = op_def.arg.add()
feat_stride_arg.name = 'feat_stride'
feat_stride_arg.i = proposal_param.feat_stride
scales_arg = op_def.arg.add()
scales_arg.name = 'scales'
scales_arg.ints.extend(list(proposal_param.scales))
ratios_arg = op_def.arg.add()
ratios_arg.name = 'ratios'
ratios_arg.floats.extend(list(proposal_param.ratios))
output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
op_def.output.extend([op.name + ':0'])
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_psroi_align(self, op):
assert self.device == 'cpu'
op_def = self.CommonConvert(op, op.type)
if op.layer.HasField('psroi_align_param'):
psroi_align_param = op.layer.psroi_align_param
spatial_scale_arg = op_def.arg.add()
spatial_scale_arg.name = 'spatial_scale'
spatial_scale_arg.f = psroi_align_param.spatial_scale
output_dim_arg = op_def.arg.add()
output_dim_arg.name = 'output_dim'
output_dim_arg.i = psroi_align_param.output_dim
group_size_arg = op_def.arg.add()
group_size_arg.name = 'group_size'
group_size_arg.i = psroi_align_param.group_size
output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
op_def.output.extend([op.name + ':0'])
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def replace_in_out_name(self, input_names, output_names):
in_names = set([input_name + ":0" for input_name in input_names])
out_names = set([output_name + ":0" for output_name in output_names])
for op in self.net_def.op:
for i in range(len(op.input)):
if op.input[i] in in_names:
op.input[i] = MACE_INPUT_NODE_NAME + '_' + op.input[i]
if op.input[i] in out_names:
op.input[i] = MACE_OUTPUT_NODE_NAME + '_' + op.input[i]
for i in range(len(op.output)):
if op.output[i] in in_names:
op.output[i] = MACE_INPUT_NODE_NAME + '_' + op.output[i]
if op.output[i] in out_names:
op.output[i] = MACE_OUTPUT_NODE_NAME + '_' + op.output[i]
def add_input_op_shape(self, input_nodes, input_shapes):
assert len(input_nodes) == len(input_shapes)
for i in range(len(input_nodes)):
input_op = self.ops_map[input_nodes[i]]
input_shape = input_shapes[i] if self.device != 'neon' else \
[input_shapes[i][0], input_shapes[i][3],
input_shapes[i][1], input_shapes[i][2]]
if input_op.layer is not None:
input_op.output_shape_map[input_op.layer.top[0]] = input_shape
else:
input_op.output_shape_map[input_op.name] = input_shape
def add_neon_input_transform(self, names):
for name in names:
new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
op_def.name = name
op_def.type = 'Transpose'
op_def.input.extend([new_input_name])
op_def.output.extend([name + ':0'])
dims_arg = op_def.arg.add()
dims_arg.name = 'dims'
dims_arg.ints.extend([0, 3, 1, 2]) # NHWC -> NCHW
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
def add_neon_output_transform(self, names):
for name in names:
output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
op_def.name = output_name[:-2]
op_def.type = 'Transpose'
op_def.input.extend([name + ':0'])
op_def.output.extend([output_name])
dims_arg = op_def.arg.add()
dims_arg.name = 'dims'
dims_arg.ints.extend([0, 2, 3, 1]) # NCHW -> NHWC
def convert(self, input_nodes, input_shapes, output_nodes):
if self.device == 'gpu':
self.add_input_transform(input_nodes)
if self.device == 'neon':
self.add_neon_input_transform(input_nodes)
assert self.ops[0].type == 'Input'
self.add_input_op_shape(input_nodes, input_shapes)
for op in self.ops:
if op.name in self.resolved_ops:
continue
if op.type == 'Input':
self.resolved_ops.add(op.name)
elif op.type == 'Convolution':
if self.check_winograd_conv(op):
self.convert_winograd_conv(op)
else:
self.convert_conv2d(op)
elif op.type == 'BatchNorm':
self.convert_batchnorm(op)
elif op.type == 'InnerProduct':
self.convert_inner_product(op)
elif op.type == 'Pooling':
self.convert_pooling(op)
elif op.type == 'PReLU':
self.convert_prelu(op)
elif op.type in ['ReLU', 'Sigmoid', 'TanH']:
self.convert_activation(op)
elif op.type == 'Add':
self.convert_add(op)
elif op.type == 'Concat':
self.convert_concat(op)
elif op.type == 'Eltwise':
self.convert_eltwise(op)
elif op.type == 'Slice':
self.convert_slice(op)
elif op.type == 'Reshape':
self.convert_reshape(op)
elif op.type == 'Proposal':
self.convert_proposal_op(op)
elif op.type == 'PSROIAlign':
self.convert_psroi_align(op)
elif op.type in ['Softmax']:
self.convert_normal_op(op)
else:
raise Exception('Unknown Op: %s, type: %s' % (op.name,
op.type))
if self.device == 'gpu':
self.add_output_transform(output_nodes)
if self.device == 'cpu':
self.replace_in_out_name(input_nodes, output_nodes)
if self.device == 'neon':
self.add_neon_output_transform(output_nodes)
for op in self.ops:
if op.name not in self.resolved_ops:
print 'Unresolved Op: %s with type %s' % (op.name, op.type)
def convert_to_mace_pb(model_file, weight_file, input_node_str,
input_shape_str, output_node_str, data_type, device,
winograd):
net_def = mace_pb2.NetDef()
dt = data_type_map[data_type]
caffe_net = caffe_pb2.NetParameter()
with open(model_file, "r") as f:
google.protobuf.text_format.Merge(str(f.read()), caffe_net)
weights = caffe_pb2.NetParameter()
with open(weight_file, "rb") as f:
weights.MergeFromString(f.read())
input_nodes = [x for x in input_node_str.split(',')]
input_shapes = []
if input_shape_str != "":
input_shape_strs = [x for x in input_shape_str.split(':')]
for shape_str in input_shape_strs:
input_shapes.extend([[int(x) for x in shape_str.split(',')]])
output_nodes = [x for x in output_node_str.split(',')]
assert len(input_nodes) == len(input_shapes)
converter = CaffeConverter(caffe_net, weights, net_def, dt, device,
winograd)
converter.convert(input_nodes, input_shapes, output_nodes)
print "PB Converted."
if device == 'gpu':
print "start optimize memory."
mem_optimizer = memory_optimizer.MemoryOptimizer(net_def)
mem_optimizer.optimize()
print "Memory optimization done."
return net_def
@@ -26,4 +26,3 @@ def tf_dtype_2_mace_dtype(tf_dtype):
if not mace_dtype:
raise Exception("Not supported tensorflow dtype: " + tf_dtype)
return mace_dtype
@@ -4,176 +4,166 @@ import hashlib
import os.path
from mace.python.tools import source_converter_lib
# ./bazel-bin/mace/python/tools/tf_converter --model_file quantized_test.pb --output quantized_test_dsp.pb --runtime dsp --input_dim input_node,1,28,28,3
# ./bazel-bin/mace/python/tools/tf_converter --model_file quantized_test.pb \
# --output quantized_test_dsp.pb \
# --runtime dsp \
# --input_dim input_node,1,28,28,3
FLAGS = None
def file_checksum(fname):
hash_func = hashlib.sha256()
with open(fname, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hash_func.update(chunk)
return hash_func.hexdigest()
hash_func = hashlib.sha256()
with open(fname, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hash_func.update(chunk)
return hash_func.hexdigest()
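Hashing in 4096-byte chunks keeps memory use constant even for large model files. A usage sketch (the path and digest below are hypothetical):

    # Sketch only: compare a model file against an expected sha256 digest.
    expected_sha256 = "0" * 64                         # hypothetical digest
    actual_sha256 = file_checksum("example_model.pb")  # hypothetical path
    if actual_sha256 != expected_sha256:
        print("Model checksum mismatch: %s != %s" % (actual_sha256,
                                                     expected_sha256))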
def main(unused_args):
if not os.path.isfile(FLAGS.model_file):
print("Input graph file '" + FLAGS.model_file + "' does not exist!")
sys.exit(-1)
model_checksum = file_checksum(FLAGS.model_file)
if FLAGS.model_checksum != "" and FLAGS.model_checksum != model_checksum:
print("Model checksum mismatch: %s != %s" % (model_checksum, FLAGS.model_checksum))
sys.exit(-1)
if FLAGS.platform == 'caffe':
if not os.path.isfile(FLAGS.weight_file):
print("Input weight file '" + FLAGS.weight_file + "' does not exist!")
sys.exit(-1)
weight_checksum = file_checksum(FLAGS.weight_file)
if FLAGS.weight_checksum != "" and FLAGS.weight_checksum != weight_checksum:
print("Weight checksum mismatch: %s != %s" % (weight_checksum, FLAGS.weight_checksum))
sys.exit(-1)
if FLAGS.runtime == 'dsp':
print("DSP not support caffe model yet.")
sys.exit(-1)
from mace.python.tools import caffe_converter_lib
output_graph_def = caffe_converter_lib.convert_to_mace_pb(
FLAGS.model_file, FLAGS.weight_file, FLAGS.input_node, FLAGS.input_shape, FLAGS.output_node,
FLAGS.data_type, FLAGS.runtime, FLAGS.winograd)
elif FLAGS.platform == 'tensorflow':
if FLAGS.runtime == 'dsp':
from mace.python.tools import tf_dsp_converter_lib
output_graph_def = tf_dsp_converter_lib.convert_to_mace_pb(
FLAGS.model_file, FLAGS.input_node, FLAGS.output_node, FLAGS.dsp_mode)
if not os.path.isfile(FLAGS.model_file):
print("Input graph file '" + FLAGS.model_file + "' does not exist!")
sys.exit(-1)
model_checksum = file_checksum(FLAGS.model_file)
if FLAGS.model_checksum != "" and FLAGS.model_checksum != model_checksum:
print("Model checksum mismatch: %s != %s" % (model_checksum,
FLAGS.model_checksum))
sys.exit(-1)
if FLAGS.platform == 'caffe':
if not os.path.isfile(FLAGS.weight_file):
print("Input weight file '" + FLAGS.weight_file +
"' does not exist!")
sys.exit(-1)
weight_checksum = file_checksum(FLAGS.weight_file)
if FLAGS.weight_checksum != "" and \
FLAGS.weight_checksum != weight_checksum:
print("Weight checksum mismatch: %s != %s" %
(weight_checksum, FLAGS.weight_checksum))
sys.exit(-1)
if FLAGS.runtime == 'dsp':
print("DSP not support caffe model yet.")
sys.exit(-1)
from mace.python.tools import caffe_converter_lib
output_graph_def = caffe_converter_lib.convert_to_mace_pb(
FLAGS.model_file, FLAGS.weight_file, FLAGS.input_node,
FLAGS.input_shape, FLAGS.output_node, FLAGS.data_type,
FLAGS.runtime, FLAGS.winograd)
elif FLAGS.platform == 'tensorflow':
if FLAGS.runtime == 'dsp':
from mace.python.tools import tf_dsp_converter_lib
output_graph_def = tf_dsp_converter_lib.convert_to_mace_pb(
FLAGS.model_file, FLAGS.input_node, FLAGS.output_node,
FLAGS.dsp_mode)
else:
from mace.python.tools import tf_converter_lib
output_graph_def = tf_converter_lib.convert_to_mace_pb(
FLAGS.model_file, FLAGS.input_node, FLAGS.input_shape,
FLAGS.output_node, FLAGS.data_type, FLAGS.runtime,
FLAGS.winograd)
if FLAGS.output_type == 'source':
source_converter_lib.convert_to_source(
output_graph_def, model_checksum, FLAGS.template, FLAGS.obfuscate,
FLAGS.model_tag, FLAGS.output, FLAGS.runtime,
FLAGS.embed_model_data)
else:
from mace.python.tools import tf_converter_lib
output_graph_def = tf_converter_lib.convert_to_mace_pb(
FLAGS.model_file, FLAGS.input_node, FLAGS.input_shape, FLAGS.output_node,
FLAGS.data_type, FLAGS.runtime, FLAGS.winograd)
if FLAGS.output_type == 'source':
source_converter_lib.convert_to_source(output_graph_def, model_checksum, FLAGS.template, FLAGS.obfuscate,
FLAGS.model_tag, FLAGS.output, FLAGS.runtime, FLAGS.embed_model_data)
else:
with open(FLAGS.output, "wb") as f:
f.write(output_graph_def.SerializeToString())
with open(FLAGS.output + '_txt', "wb") as f:
# output_graph_def.ClearField('tensors')
f.write(str(output_graph_def))
print("Model conversion is completed.")
with open(FLAGS.output, "wb") as f:
f.write(output_graph_def.SerializeToString())
with open(FLAGS.output + '_txt', "wb") as f:
# output_graph_def.ClearField('tensors')
f.write(str(output_graph_def))
print("Model conversion is completed.")
def str2bool(v):
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise argparse.ArgumentTypeError('Boolean value expected.')
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise argparse.ArgumentTypeError('Boolean value expected.')
def parse_args():
"""Parses command line arguments."""
parser = argparse.ArgumentParser()
parser.register("type", "bool", lambda v: v.lower() == "true")
parser.add_argument(
"--model_file",
type=str,
default="",
help="TensorFlow \'GraphDef\' file to load, Caffe prototxt file to load.")
parser.add_argument(
"--weight_file",
type=str,
default="",
help="Caffe data file to load.")
parser.add_argument(
"--model_checksum",
type=str,
default="",
help="Model file sha256 checksum")
parser.add_argument(
"--weight_checksum",
type=str,
default="",
help="Weight file sha256 checksum")
parser.add_argument(
"--output",
type=str,
default="",
help="File to save the output graph to.")
parser.add_argument(
"--runtime",
type=str,
default="cpu",
help="Runtime: cpu/gpu/dsp")
parser.add_argument(
"--input_node",
type=str,
default="input_node",
help="e.g., input_node")
parser.add_argument(
"--output_node",
type=str,
default="softmax",
help="e.g., softmax")
parser.add_argument(
"--data_type",
type=str,
default='DT_FLOAT',
help="e.g., DT_HALF/DT_FLOAT")
parser.add_argument(
"--output_type",
type=str,
default="pb",
help="output type: source/pb")
parser.add_argument(
"--template",
type=str,
default="",
help="template path")
parser.add_argument(
"--obfuscate",
type=str2bool,
nargs='?',
const=False,
default=False,
help="obfuscate model names")
parser.add_argument(
"--model_tag",
type=str,
default="",
help="model tag for generated function and namespace")
parser.add_argument(
"--winograd",
type=str2bool,
nargs='?',
const=False,
default=False,
help="open winograd convolution or not")
parser.add_argument(
"--dsp_mode",
type=int,
default=0,
help="dsp run mode, defalut=0")
parser.add_argument(
"--input_shape",
type=str,
default="",
help="input shape.")
parser.add_argument(
"--platform",
type=str,
default="tensorflow",
help="tensorflow/caffe")
parser.add_argument(
"--embed_model_data",
type=str2bool,
default=True,
help="input shape.")
return parser.parse_known_args()
"""Parses command line arguments."""
parser = argparse.ArgumentParser()
parser.register("type", "bool", lambda v: v.lower() == "true")
parser.add_argument(
"--model_file",
type=str,
default="",
help="TensorFlow \'GraphDef\' file to load, "
"Caffe prototxt file to load.")
parser.add_argument(
"--weight_file", type=str, default="", help="Caffe data file to load.")
parser.add_argument(
"--model_checksum",
type=str,
default="",
help="Model file sha256 checksum")
parser.add_argument(
"--weight_checksum",
type=str,
default="",
help="Weight file sha256 checksum")
parser.add_argument(
"--output",
type=str,
default="",
help="File to save the output graph to.")
parser.add_argument(
"--runtime", type=str, default="cpu", help="Runtime: cpu/gpu/dsp")
parser.add_argument(
"--input_node",
type=str,
default="input_node",
help="e.g., input_node")
parser.add_argument(
"--output_node", type=str, default="softmax", help="e.g., softmax")
parser.add_argument(
"--data_type",
type=str,
default='DT_FLOAT',
help="e.g., DT_HALF/DT_FLOAT")
parser.add_argument(
"--output_type", type=str, default="pb", help="output type: source/pb")
parser.add_argument(
"--template", type=str, default="", help="template path")
parser.add_argument(
"--obfuscate",
type=str2bool,
nargs='?',
const=False,
default=False,
help="obfuscate model names")
parser.add_argument(
"--model_tag",
type=str,
default="",
help="model tag for generated function and namespace")
parser.add_argument(
"--winograd",
type=str2bool,
nargs='?',
const=False,
default=False,
help="open winograd convolution or not")
parser.add_argument(
"--dsp_mode", type=int, default=0, help="dsp run mode, defalut=0")
parser.add_argument(
"--input_shape", type=str, default="", help="input shape.")
parser.add_argument(
"--platform", type=str, default="tensorflow", help="tensorflow/caffe")
parser.add_argument(
"--embed_model_data", type=str2bool, default=True, help="input shape.")
return parser.parse_known_args()
if __name__ == '__main__':
FLAGS, unparsed = parse_args()
main(unused_args=[sys.argv[0]] + unparsed)
FLAGS, unparsed = parse_args()
main(unused_args=[sys.argv[0]] + unparsed)
class DspOps(object):
def __init__(self):
self.dsp_ops = {
'INPUT': 'INPUT"',
'OUTPUT': 'OUTPUT',
'NoOp': 'Nop',
'FLATTEN': 'Flatten',
'Identity': 'Nop',
'Placeholder': 'INPUT',
'Const': 'Const',
'QuantizedConv2D': 'QuantizedConv2d_8x8to32',
'QuantizedMatMul': 'QuantizedMatMul_8x8to32',
'QuantizeDownAndShrinkRange': 'QuantizeDownAndShrinkRange_32to8',
'QuantizedRelu': 'QuantizedRelu_8',
'QuantizedReluX': 'QuantizedReluX_8',
'QuantizedMaxPool': 'QuantizedMaxPool_8',
'QuantizedAvgPool': 'QuantizedAvgPool_8',
'QuantizedConcat': 'QuantizedConcat_8',
'QuantizedBiasAdd': 'QuantizedBiasAdd_8p8to32',
'QuantizedResizeBilinear' : 'QuantizedResizeBilinear_8',
'QuantizedSpaceToBatchND': 'QuantizedSpaceToBatchND_8',
'QuantizedBatchToSpaceND': 'QuantizedBatchToSpaceND_8',
'QuantizedSoftmax': 'QuantizedSoftmax_8',
'QuantizedTanh': 'QuantizedTanh_8',
'Min': 'Min_f',
'Max': 'Max_f',
'QuantizeV2': 'Quantize',
'Dequantize': 'Dequantize',
'Softmax': 'Softmax_f',
'Reshape': 'Reshape',
'QuantizedReshape': 'QuantizedReshape',
'Sigmoid': 'Sigmoid_f',
'Slice': 'Slice_f',
'Add': 'Add_f',
'Mul': 'Mul_f',
'Requantize': 'Requantize_32to8',
'RequantizationRange': 'RequantizationRange_32',
'Sub': 'Sub_f',
'Pack': 'Pack_int32',
'StridedSlice': 'StridedSlice_f',
'ExpandDims': 'ExpandDims_f',
'QuantizedMul': 'QuantizedMul_8x8to32',
'QuantizedAdd': 'QuantizedAdd_8p8to32',
'Pad': 'Pad_f',
'SpaceToBatchND': 'SpaceToBatchND_f',
'BatchToSpaceND': 'BatchToSpaceND_f',
'ResizeBilinear': 'ResizeBilinear_f',
'ConcatV2': 'ConcatV2_f',
'Conv2DBackpropInput': 'Deconv_f',
'Tanh': 'Tanh_f',
'Split': 'Split_f',
'Transpose': 'Transpose_f',
'Concat': 'Concat_f',
'AddN': 'AddN_f',
}
def has_op(self, tf_op):
return tf_op in self.dsp_ops
def map_nn_op(self, tf_op):
if tf_op not in self.dsp_ops:
raise Exception('Could not map nn op for: ', tf_op)
return self.dsp_ops[tf_op]
def __init__(self):
self.dsp_ops = {
'INPUT': 'INPUT"',
'OUTPUT': 'OUTPUT',
'NoOp': 'Nop',
'FLATTEN': 'Flatten',
'Identity': 'Nop',
'Placeholder': 'INPUT',
'Const': 'Const',
'QuantizedConv2D': 'QuantizedConv2d_8x8to32',
'QuantizedMatMul': 'QuantizedMatMul_8x8to32',
'QuantizeDownAndShrinkRange': 'QuantizeDownAndShrinkRange_32to8',
'QuantizedRelu': 'QuantizedRelu_8',
'QuantizedReluX': 'QuantizedReluX_8',
'QuantizedMaxPool': 'QuantizedMaxPool_8',
'QuantizedAvgPool': 'QuantizedAvgPool_8',
'QuantizedConcat': 'QuantizedConcat_8',
'QuantizedBiasAdd': 'QuantizedBiasAdd_8p8to32',
'QuantizedResizeBilinear': 'QuantizedResizeBilinear_8',
'QuantizedSpaceToBatchND': 'QuantizedSpaceToBatchND_8',
'QuantizedBatchToSpaceND': 'QuantizedBatchToSpaceND_8',
'QuantizedSoftmax': 'QuantizedSoftmax_8',
'QuantizedTanh': 'QuantizedTanh_8',
'Min': 'Min_f',
'Max': 'Max_f',
'QuantizeV2': 'Quantize',
'Dequantize': 'Dequantize',
'Softmax': 'Softmax_f',
'Reshape': 'Reshape',
'QuantizedReshape': 'QuantizedReshape',
'Sigmoid': 'Sigmoid_f',
'Slice': 'Slice_f',
'Add': 'Add_f',
'Mul': 'Mul_f',
'Requantize': 'Requantize_32to8',
'RequantizationRange': 'RequantizationRange_32',
'Sub': 'Sub_f',
'Pack': 'Pack_int32',
'StridedSlice': 'StridedSlice_f',
'ExpandDims': 'ExpandDims_f',
'QuantizedMul': 'QuantizedMul_8x8to32',
'QuantizedAdd': 'QuantizedAdd_8p8to32',
'Pad': 'Pad_f',
'SpaceToBatchND': 'SpaceToBatchND_f',
'BatchToSpaceND': 'BatchToSpaceND_f',
'ResizeBilinear': 'ResizeBilinear_f',
'ConcatV2': 'ConcatV2_f',
'Conv2DBackpropInput': 'Deconv_f',
'Tanh': 'Tanh_f',
'Split': 'Split_f',
'Transpose': 'Transpose_f',
'Concat': 'Concat_f',
'AddN': 'AddN_f',
}
def has_op(self, tf_op):
return tf_op in self.dsp_ops
def map_nn_op(self, tf_op):
if tf_op not in self.dsp_ops:
raise Exception('Could not map nn op for: ', tf_op)
return self.dsp_ops[tf_op]
@@ -4,77 +4,81 @@ import sys
import jinja2
# python encrypt_opencl_codegen.py --cl_kernel_dir=./mace/kernels/opencl/cl/ \
# --output_path=./mace/codegen/opencl_encrypt/opencl_encrypted_program.cc
FLAGS = None
encrypt_lookup_table = "Xiaomi-AI-Platform-Mace"
def encrypt_code(code_str):
encrypted_arr = []
for i in range(len(code_str)):
encrypted_char = hex(ord(code_str[i]) ^ ord(encrypt_lookup_table[i % len(encrypt_lookup_table)]))
encrypted_arr.append(encrypted_char)
return encrypted_arr
encrypted_arr = []
for i in range(len(code_str)):
encrypted_char = hex(
ord(code_str[i]) ^ ord(
encrypt_lookup_table[i % len(encrypt_lookup_table)]))
encrypted_arr.append(encrypted_char)
return encrypted_arr
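Because XOR with the lookup table is its own inverse, the same walk over the bytes decrypts; the actual decoding lives in the C++ runtime, not here. A hedged sketch of a hypothetical counterpart:

    # Sketch only: hypothetical inverse of encrypt_code, for illustration.
    def decrypt_code(encrypted_arr):
        chars = []
        for i in range(len(encrypted_arr)):
            key = ord(encrypt_lookup_table[i % len(encrypt_lookup_table)])
            chars.append(chr(int(encrypted_arr[i], 16) ^ key))
        return ''.join(chars)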
def main(unused_args):
if not os.path.exists(FLAGS.cl_kernel_dir):
print("Input cl_kernel_dir " + FLAGS.cl_kernel_dir + " doesn't exist!")
header_code = ""
for file_name in os.listdir(FLAGS.cl_kernel_dir):
file_path = os.path.join(FLAGS.cl_kernel_dir, file_name)
if file_path[-2:] == ".h":
f = open(file_path, "r")
header_code += f.read()
encrypted_code_maps = {}
for file_name in os.listdir(FLAGS.cl_kernel_dir):
file_path = os.path.join(FLAGS.cl_kernel_dir, file_name)
if file_path[-3:] == ".cl":
f = open(file_path, "r")
code_str = ""
for line in f.readlines():
if "#include <common.h>" in line:
code_str += header_code
else:
code_str += line
encrypted_code_arr = encrypt_code(code_str)
encrypted_code_maps[file_name[:-3]] = encrypted_code_arr
env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0]))
cpp_cl_encrypted_kernel = env.get_template('str2vec_maps.cc.jinja2').render(
maps=encrypted_code_maps,
data_type='unsigned char',
variable_name='kEncryptedProgramMap')
if os.path.isfile(FLAGS.output_path):
os.remove(FLAGS.output_path)
w_file = open(FLAGS.output_path, "w")
w_file.write(cpp_cl_encrypted_kernel)
w_file.close()
print("Generate encrypted opencl source done!")
if not os.path.exists(FLAGS.cl_kernel_dir):
print("Input cl_kernel_dir " + FLAGS.cl_kernel_dir + " doesn't exist!")
header_code = ""
for file_name in os.listdir(FLAGS.cl_kernel_dir):
file_path = os.path.join(FLAGS.cl_kernel_dir, file_name)
if file_path[-2:] == ".h":
f = open(file_path, "r")
header_code += f.read()
encrypted_code_maps = {}
for file_name in os.listdir(FLAGS.cl_kernel_dir):
file_path = os.path.join(FLAGS.cl_kernel_dir, file_name)
if file_path[-3:] == ".cl":
f = open(file_path, "r")
code_str = ""
for line in f.readlines():
if "#include <common.h>" in line:
code_str += header_code
else:
code_str += line
encrypted_code_arr = encrypt_code(code_str)
encrypted_code_maps[file_name[:-3]] = encrypted_code_arr
env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0]))
cpp_cl_encrypted_kernel = env.get_template(
'str2vec_maps.cc.jinja2').render(
maps=encrypted_code_maps,
data_type='unsigned char',
variable_name='kEncryptedProgramMap')
if os.path.isfile(FLAGS.output_path):
os.remove(FLAGS.output_path)
w_file = open(FLAGS.output_path, "w")
w_file.write(cpp_cl_encrypted_kernel)
w_file.close()
print("Generate encrypted opencl source done!")
def parse_args():
"""Parses command line arguments."""
parser = argparse.ArgumentParser()
parser.add_argument(
"--cl_kernel_dir",
type=str,
default="./mace/kernels/opencl/cl/",
help="The cl kernels directory.")
parser.add_argument(
"--output_path",
type=str,
default="./mace/examples/codegen/opencl/opencl_encrypted_program.cc",
help="The path of encrypted opencl kernels.")
return parser.parse_known_args()
"""Parses command line arguments."""
parser = argparse.ArgumentParser()
parser.add_argument(
"--cl_kernel_dir",
type=str,
default="./mace/kernels/opencl/cl/",
help="The cl kernels directory.")
parser.add_argument(
"--output_path",
type=str,
default="./mace/examples/codegen/opencl/opencl_encrypted_program.cc",
help="The path of encrypted opencl kernels.")
return parser.parse_known_args()
if __name__ == '__main__':
FLAGS, unparsed = parse_args()
main(unused_args=[sys.argv[0]] + unparsed)
FLAGS, unparsed = parse_args()
main(unused_args=[sys.argv[0]] + unparsed)
@@ -2,18 +2,21 @@ import tensorflow as tf
from mace.proto import mace_pb2
from collections import OrderedDict
def sort_tf_node(node, nodes_map, ordered_nodes_map):
if node.name not in ordered_nodes_map:
for input_tensor_name in node.input:
input_node_name = input_tensor_name.split(':')[
0] if ':' in input_tensor_name else input_tensor_name
if input_node_name not in nodes_map or input_node_name in ordered_nodes_map:
if input_node_name not in nodes_map or \
input_node_name in ordered_nodes_map:
continue
input_node = nodes_map[input_node_name]
sort_tf_node(input_node, nodes_map, ordered_nodes_map)
ordered_nodes_map[node.name] = node
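sort_tf_node is a depth-first walk that emits a node only after all of its producers, i.e. a topological sort of the graph. A sketch exercising the same ordering on stub nodes (hypothetical objects, not TensorFlow):

    # Sketch only: the DFS ordering on stub nodes with 'name' and 'input'.
    from collections import OrderedDict, namedtuple

    Node = namedtuple('Node', ['name', 'input'])
    nodes_map = {'a': Node('a', []),
                 'b': Node('b', ['a:0']),
                 'c': Node('c', ['b:0', 'a:0'])}
    ordered = OrderedDict()
    sort_tf_node(nodes_map['c'], nodes_map, ordered)
    assert list(ordered.keys()) == ['a', 'b', 'c']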
def sort_tf_graph(graph_def):
nodes_map = {}
ordered_nodes_map = OrderedDict()
@@ -31,13 +34,15 @@ def sort_mace_node(node, nodes_map, ordered_nodes_map):
for input_tensor_name in node.input:
input_node_name = input_tensor_name.split(':')[
0] if ':' in input_tensor_name else input_tensor_name
if input_node_name not in nodes_map or input_node_name in ordered_nodes_map:
if input_node_name not in nodes_map or \
input_node_name in ordered_nodes_map:
continue
input_node = nodes_map[input_node_name]
sort_mace_node(input_node, nodes_map, ordered_nodes_map)
ordered_nodes_map[node.name] = node
def sort_mace_graph(graph_def, output_name):
nodes_map = {}
ordered_nodes_map = OrderedDict()
@@ -2,120 +2,131 @@ import sys
import operator
from mace.proto import mace_pb2
class MemoryOptimizer(object):
def __init__(self, net_def):
self.net_def = net_def
self.idle_mem = set()
self.op_mem = {} # op_name->mem_id
self.mem_block = {} # mem_id->[x, y]
self.total_mem_count = 0
self.ref_counter = {}
consumers = {}
for op in net_def.op:
if self.is_buffer_image_op(op):
continue
for ipt in op.input:
if ipt not in consumers:
consumers[ipt] = []
consumers[ipt].append(op)
# only ref op's output tensor
for op in net_def.op:
if self.is_buffer_image_op(op):
continue
for output in op.output:
tensor_name = output
if tensor_name in consumers:
self.ref_counter[tensor_name] = len(consumers[tensor_name])
def __init__(self, net_def):
self.net_def = net_def
self.idle_mem = set()
self.op_mem = {} # op_name->mem_id
self.mem_block = {} # mem_id->[x, y]
self.total_mem_count = 0
self.ref_counter = {}
consumers = {}
for op in net_def.op:
if self.is_buffer_image_op(op):
continue
for ipt in op.input:
if ipt not in consumers:
consumers[ipt] = []
consumers[ipt].append(op)
# only ref op's output tensor
for op in net_def.op:
if self.is_buffer_image_op(op):
continue
for output in op.output:
tensor_name = output
if tensor_name in consumers:
self.ref_counter[tensor_name] = len(consumers[tensor_name])
else:
self.ref_counter[tensor_name] = 0
def is_buffer_image_op(self, op):
return op.type == 'BufferToImage' or op.type == 'ImageToBuffer'
def get_mem_size(self, op_type, output_shape):
mem_size = [0, 0]
if op_type == 'WinogradTransform' or op_type == 'MatMul':
mem_size[0] = output_shape[2] * output_shape[3]
mem_size[1] = output_shape[0] * int((output_shape[1] + 3) / 4)
else:
self.ref_counter[tensor_name] = 0
def is_buffer_image_op(self, op):
return op.type == 'BufferToImage' or op.type == 'ImageToBuffer'
def get_mem_size(self, op_type, output_shape):
mem_size = [0, 0]
if op_type == 'WinogradTransform' or op_type == 'MatMul':
mem_size[0] = output_shape[2] * output_shape[3]
mem_size[1] = output_shape[0] * int((output_shape[1]+3)/4)
else:
mem_size[0] = output_shape[2] * int((output_shape[3]+3)/4)
mem_size[1] = output_shape[0] * output_shape[1]
return mem_size
def mem_area(self, memory_size):
return memory_size[0] * memory_size[1]
def optimize(self):
for op in self.net_def.op:
if self.is_buffer_image_op(op):
continue
if not op.output_shape:
print('WARNING: There is no output shape information to do memory optimization.')
return
if len(op.output_shape) != len(op.output):
print('WARNING: the number of output shape is not equal to the number of output.')
return
for i in range(len(op.output)):
op_mem_size = self.get_mem_size(op.type, op.output_shape[i].dims)
mem_id = -1
if len(self.idle_mem) > 0:
best_mem_candidate_id = -1
best_mem_candidate_delta_area = sys.maxint
best_mem_candidate_shape = []
for mid in self.idle_mem:
reuse_mem_size = self.mem_block[mid]
resize_mem_size = [max(reuse_mem_size[0], op_mem_size[0]), max(reuse_mem_size[1], op_mem_size[1])]
delta_mem_area = self.mem_area(resize_mem_size) - self.mem_area(reuse_mem_size)
if delta_mem_area < best_mem_candidate_delta_area:
best_mem_candidate_id = mid
best_mem_candidate_delta_area = delta_mem_area
best_mem_candidate_shape = resize_mem_size
if best_mem_candidate_delta_area <= self.mem_area(op_mem_size):
# reuse
self.mem_block[best_mem_candidate_id] = best_mem_candidate_shape
mem_id = best_mem_candidate_id
self.idle_mem.remove(mem_id)
if mem_id == -1:
mem_id = self.total_mem_count
self.total_mem_count += 1
self.mem_block[mem_id] = op_mem_size
op.mem_id.extend([mem_id])
self.op_mem[op.output[i]] = mem_id
# de-ref input tensor mem
for ipt in op.input:
if ipt in self.ref_counter:
self.ref_counter[ipt] -= 1
if self.ref_counter[ipt] == 0:
self.idle_mem.add(self.op_mem[ipt])
elif self.ref_counter[ipt] < 0:
raise Exception('ref count is less than 0')
for mem in self.mem_block:
arena = self.net_def.mem_arena
block = arena.mem_block.add()
block.mem_id = mem
block.x = self.mem_block[mem][0]
block.y = self.mem_block[mem][1]
print('total op: %d' % len(self.net_def.op))
origin_mem_size = 0
optimized_mem_size = 0
for op in self.net_def.op:
if self.is_buffer_image_op(op):
continue
origin_mem_size += reduce(operator.mul, op.output_shape[0].dims, 1)
for mem in self.mem_block:
print mem, self.mem_block[mem]
optimized_mem_size += reduce(operator.mul, self.mem_block[mem], 4)
print('origin mem: %d, optimized mem: %d' % (origin_mem_size, optimized_mem_size))
mem_size[0] = output_shape[2] * int((output_shape[3] + 3) / 4)
mem_size[1] = output_shape[0] * output_shape[1]
return mem_size
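The sizes above are OpenCL image extents in pixels, with four channel values packed per pixel. A worked example with a made-up NHWC output shape:

    # Sketch only: image extent for a hypothetical NHWC output [1, 32, 32, 64].
    output_shape = [1, 32, 32, 64]
    width = output_shape[2] * int((output_shape[3] + 3) / 4)   # 32 * 16 = 512
    height = output_shape[0] * output_shape[1]                 # 1 * 32 = 32
    assert [width, height] == [512, 32]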
def mem_area(self, memory_size):
return memory_size[0] * memory_size[1]
def optimize(self):
for op in self.net_def.op:
if self.is_buffer_image_op(op):
continue
if not op.output_shape:
print('WARNING: There is no output shape information to '
'do memory optimization.')
return
if len(op.output_shape) != len(op.output):
print('WARNING: the number of output shape is not equal to '
'the number of output.')
return
for i in range(len(op.output)):
op_mem_size = self.get_mem_size(op.type,
op.output_shape[i].dims)
mem_id = -1
if len(self.idle_mem) > 0:
best_mem_candidate_id = -1
best_mem_candidate_delta_area = sys.maxint
best_mem_candidate_shape = []
for mid in self.idle_mem:
reuse_mem_size = self.mem_block[mid]
resize_mem_size = [
max(reuse_mem_size[0], op_mem_size[0]),
max(reuse_mem_size[1], op_mem_size[1])
]
delta_mem_area = self.mem_area(
resize_mem_size) - self.mem_area(reuse_mem_size)
if delta_mem_area < best_mem_candidate_delta_area:
best_mem_candidate_id = mid
best_mem_candidate_delta_area = delta_mem_area
best_mem_candidate_shape = resize_mem_size
if best_mem_candidate_delta_area <= self.mem_area(
op_mem_size):
# reuse
self.mem_block[
best_mem_candidate_id] = best_mem_candidate_shape
mem_id = best_mem_candidate_id
self.idle_mem.remove(mem_id)
if mem_id == -1:
mem_id = self.total_mem_count
self.total_mem_count += 1
self.mem_block[mem_id] = op_mem_size
op.mem_id.extend([mem_id])
self.op_mem[op.output[i]] = mem_id
# de-ref input tensor mem
for ipt in op.input:
if ipt in self.ref_counter:
self.ref_counter[ipt] -= 1
if self.ref_counter[ipt] == 0:
self.idle_mem.add(self.op_mem[ipt])
elif self.ref_counter[ipt] < 0:
raise Exception('ref count is less than 0')
for mem in self.mem_block:
arena = self.net_def.mem_arena
block = arena.mem_block.add()
block.mem_id = mem
block.x = self.mem_block[mem][0]
block.y = self.mem_block[mem][1]
print('total op: %d' % len(self.net_def.op))
origin_mem_size = 0
optimized_mem_size = 0
for op in self.net_def.op:
if self.is_buffer_image_op(op):
continue
origin_mem_size += reduce(operator.mul, op.output_shape[0].dims, 1)
for mem in self.mem_block:
print mem, self.mem_block[mem]
optimized_mem_size += reduce(operator.mul, self.mem_block[mem], 4)
print('origin mem: %d, optimized mem: %d' % (origin_mem_size,
optimized_mem_size))
def optimize_memory(net_def):
mem_optimizer = MemoryOptimizer(net_def)
mem_optimizer.optimize()
mem_optimizer = MemoryOptimizer(net_def)
mem_optimizer.optimize()
@@ -14,86 +14,89 @@ FLAGS = None
def generate_cpp_source():
maps = {}
platform_info = ''
binary_dirs = FLAGS.cl_binary_dirs.strip().split(",")
for binary_dir in binary_dirs:
binary_path = os.path.join(binary_dir, FLAGS.built_kernel_file_name)
if not os.path.exists(binary_path):
continue
print 'generate opencl code from', binary_path
with open(binary_path, "rb") as f:
binary_array = np.fromfile(f, dtype=np.uint8)
idx = 0
size, = struct.unpack("Q", binary_array[idx:idx+8])
idx += 8
for _ in xrange(size):
key_size, = struct.unpack("i", binary_array[idx:idx+4])
idx += 4
key, = struct.unpack(str(key_size) + "s", binary_array[idx:idx+key_size])
idx += key_size
value_size, = struct.unpack("i", binary_array[idx:idx+4])
idx += 4
maps[key] = []
value = struct.unpack(str(value_size) + "B",
binary_array[idx:idx+value_size])
idx += value_size
for ele in value:
maps[key].append(hex(ele))
cl_platform_info_path = os.path.join(binary_dir, FLAGS.platform_info_file_name)
with open(cl_platform_info_path, 'r') as f:
curr_platform_info = f.read()
if platform_info != "":
assert(curr_platform_info == platform_info)
platform_info = curr_platform_info
env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0]))
return env.get_template('opencl_compiled_kernel.cc.jinja2').render(
maps = maps,
data_type = 'unsigned char',
variable_name = 'kCompiledProgramMap',
platform_info = platform_info,
)
maps = {}
platform_info = ''
binary_dirs = FLAGS.cl_binary_dirs.strip().split(",")
for binary_dir in binary_dirs:
binary_path = os.path.join(binary_dir, FLAGS.built_kernel_file_name)
if not os.path.exists(binary_path):
continue
print 'generate opencl code from', binary_path
with open(binary_path, "rb") as f:
binary_array = np.fromfile(f, dtype=np.uint8)
idx = 0
size, = struct.unpack("Q", binary_array[idx:idx + 8])
idx += 8
for _ in xrange(size):
key_size, = struct.unpack("i", binary_array[idx:idx + 4])
idx += 4
key, = struct.unpack(
str(key_size) + "s", binary_array[idx:idx + key_size])
idx += key_size
value_size, = struct.unpack("i", binary_array[idx:idx + 4])
idx += 4
maps[key] = []
value = struct.unpack(
str(value_size) + "B", binary_array[idx:idx + value_size])
idx += value_size
for ele in value:
maps[key].append(hex(ele))
cl_platform_info_path = os.path.join(binary_dir,
FLAGS.platform_info_file_name)
with open(cl_platform_info_path, 'r') as f:
curr_platform_info = f.read()
if platform_info != "":
assert (curr_platform_info == platform_info)
platform_info = curr_platform_info
env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0]))
return env.get_template('opencl_compiled_kernel.cc.jinja2').render(
maps=maps,
data_type='unsigned char',
variable_name='kCompiledProgramMap',
platform_info=platform_info,
)
def main(unused_args):
cpp_cl_binary_source = generate_cpp_source()
if os.path.isfile(FLAGS.output_path):
os.remove(FLAGS.output_path)
w_file = open(FLAGS.output_path, "w")
w_file.write(cpp_cl_binary_source)
w_file.close()
cpp_cl_binary_source = generate_cpp_source()
if os.path.isfile(FLAGS.output_path):
os.remove(FLAGS.output_path)
w_file = open(FLAGS.output_path, "w")
w_file.write(cpp_cl_binary_source)
w_file.close()
def parse_args():
"""Parses command line arguments."""
parser = argparse.ArgumentParser()
parser.add_argument(
"--cl_binary_dirs",
type=str,
default="",
help="The cl binaries directories.")
parser.add_argument(
"--built_kernel_file_name",
type=str,
default="",
help="The cl binaries directories.")
parser.add_argument(
"--platform_info_file_name",
type=str,
default="",
help="The cl binaries directories.")
parser.add_argument(
"--output_path",
type=str,
default="./mace/examples/codegen/opencl/opencl_compiled_program.cc",
help="The path of generated C++ header file which contains cl binaries.")
return parser.parse_known_args()
"""Parses command line arguments."""
parser = argparse.ArgumentParser()
parser.add_argument(
"--cl_binary_dirs",
type=str,
default="",
help="The cl binaries directories.")
parser.add_argument(
"--built_kernel_file_name",
type=str,
default="",
help="The cl binaries directories.")
parser.add_argument(
"--platform_info_file_name",
type=str,
default="",
help="The cl binaries directories.")
parser.add_argument(
"--output_path",
type=str,
default="./mace/examples/codegen/opencl/opencl_compiled_program.cc",
help="The path of generated C++ header file for cl binaries.")
return parser.parse_known_args()
if __name__ == '__main__':
FLAGS, unparsed = parse_args()
main(unused_args=[sys.argv[0]] + unparsed)
FLAGS, unparsed = parse_args()
main(unused_args=[sys.argv[0]] + unparsed)
@@ -6,182 +6,196 @@ import hashlib
from mace.proto import mace_pb2
from jinja2 import Environment, FileSystemLoader
GENERATED_NAME = set()
def generate_obfuscated_name(namespace, name):
md5 = hashlib.md5()
md5.update(namespace)
md5.update(name)
md5_digest = md5.hexdigest()
name = md5_digest[:8]
while name in GENERATED_NAME:
name = md5_digest
assert name not in GENERATED_NAME
GENERATED_NAME.add(name)
return name
md5 = hashlib.md5()
md5.update(namespace)
md5.update(name)
md5_digest = md5.hexdigest()
name = md5_digest[:8]
while name in GENERATED_NAME:
name = md5_digest
assert name not in GENERATED_NAME
GENERATED_NAME.add(name)
return name
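The obfuscated name is the first eight hex characters of md5(namespace + name), falling back to the full digest on a collision. A quick sketch (the names are illustrative; Python 2 string semantics, as in this file):

    # Sketch only: derive an obfuscated id for an illustrative tensor name.
    import hashlib

    md5 = hashlib.md5()
    md5.update("tensor")
    md5.update("conv1/weights:0")
    print(md5.hexdigest()[:8])  # stable 8-char id for this (namespace, name)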
def generate_tensor_map(tensors):
tensor_map = {}
for t in tensors:
if not tensor_map.has_key(t.name):
tensor_map[t.name] = generate_obfuscated_name("tensor", t.name)
return tensor_map
tensor_map = {}
for t in tensors:
if t.name not in tensor_map:
tensor_map[t.name] = generate_obfuscated_name("tensor", t.name)
return tensor_map
def generate_in_out_map(ops, tensor_map):
in_out_map = {}
for op in ops:
op.name = generate_obfuscated_name("op", op.name)
for input_name in op.input:
if not in_out_map.has_key(input_name):
if tensor_map.has_key(input_name):
in_out_map[input_name] = tensor_map[input_name]
else:
in_out_map[input_name] = generate_obfuscated_name("in", input_name)
for output_name in op.output:
if not in_out_map.has_key(output_name):
if tensor_map.has_key(output_name):
in_out_map[output_name] = tensor_map[output_name]
else:
in_out_map[output_name] = generate_obfuscated_name("out", output_name)
return in_out_map
in_out_map = {}
for op in ops:
op.name = generate_obfuscated_name("op", op.name)
for input_name in op.input:
if input_name not in in_out_map:
if input_name in tensor_map:
in_out_map[input_name] = tensor_map[input_name]
else:
in_out_map[input_name] = generate_obfuscated_name(
"in", input_name)
for output_name in op.output:
if output_name not in in_out_map:
if output_name in tensor_map:
in_out_map[output_name] = tensor_map[output_name]
else:
in_out_map[output_name] = generate_obfuscated_name(
"out", output_name)
return in_out_map
def obfuscate_name(net_def):
input_node = "mace_input_node"
output_node = "mace_output_node"
tensor_map = generate_tensor_map(net_def.tensors)
in_out_map = generate_in_out_map(net_def.op, tensor_map)
for t in net_def.tensors:
if input_node not in t.name and output_node not in t.name:
t.name = tensor_map[t.name]
for op in net_def.op:
for i in range(len(op.input)):
if input_node not in op.input[i]:
op.input[i] = in_out_map[op.input[i]]
for i in range(len(op.output)):
if output_node not in op.output[i]:
op.output[i] = in_out_map[op.output[i]]
input_node = "mace_input_node"
output_node = "mace_output_node"
tensor_map = generate_tensor_map(net_def.tensors)
in_out_map = generate_in_out_map(net_def.op, tensor_map)
for t in net_def.tensors:
if input_node not in t.name and output_node not in t.name:
t.name = tensor_map[t.name]
for op in net_def.op:
for i in range(len(op.input)):
if input_node not in op.input[i]:
op.input[i] = in_out_map[op.input[i]]
for i in range(len(op.output)):
if output_node not in op.output[i]:
op.output[i] = in_out_map[op.output[i]]
def rename_tensor(net_def):
tensor_map = {}
for t in net_def.tensors:
if not tensor_map.has_key(t.name):
tensor_map[t.name] = "_" + t.name[:-2].replace("/", "_")
t.name = tensor_map[t.name]
for op in net_def.op:
for i in range(len(op.input)):
if tensor_map.has_key(op.input[i]):
op.input[i] = tensor_map[op.input[i]]
for i in range(len(op.output)):
if tensor_map.has_key(op.output[i]):
op.output[i] = tensor_map[op.output[i]]
tensor_map = {}
for t in net_def.tensors:
if t.name not in tensor_map:
tensor_map[t.name] = "_" + t.name[:-2].replace("/", "_")
t.name = tensor_map[t.name]
for op in net_def.op:
for i in range(len(op.input)):
if op.input[i] in tensor_map:
op.input[i] = tensor_map[op.input[i]]
for i in range(len(op.output)):
if op.output[i] in tensor_map:
op.output[i] = tensor_map[op.output[i]]
class TensorInfo:
def __init__(self, id, t, runtime):
self.id = id
self.data_type = mace_pb2.DataType.Name(t.data_type)
if t.data_type == mace_pb2.DT_FLOAT:
if runtime == 'gpu':
self.data_type = mace_pb2.DT_HALF
self.data = bytearray(np.array(t.float_data).astype(np.float16).tobytes())
else:
self.data_type = mace_pb2.DT_FLOAT
self.data = bytearray(np.array(t.float_data).astype(np.float32).tobytes())
elif t.data_type == mace_pb2.DT_INT32:
self.data = bytearray(np.array(t.int32_data).astype(np.int32).tobytes())
elif t.data_type == mace_pb2.DT_UINT8:
self.data = bytearray(np.array(t.int32_data).astype(np.uint8).tolist())
def __init__(self, id, t, runtime):
self.id = id
self.data_type = mace_pb2.DataType.Name(t.data_type)
if t.data_type == mace_pb2.DT_FLOAT:
if runtime == 'gpu':
self.data_type = mace_pb2.DT_HALF
self.data = bytearray(
np.array(t.float_data).astype(np.float16).tobytes())
else:
self.data_type = mace_pb2.DT_FLOAT
self.data = bytearray(
np.array(t.float_data).astype(np.float32).tobytes())
elif t.data_type == mace_pb2.DT_INT32:
self.data = bytearray(
np.array(t.int32_data).astype(np.int32).tobytes())
elif t.data_type == mace_pb2.DT_UINT8:
self.data = bytearray(
np.array(t.int32_data).astype(np.uint8).tolist())
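On GPU, float tensors are narrowed to half precision before being embedded, halving the payload. A sketch with made-up values:

    # Sketch only: the same values serialized as float16 vs. float32 bytes.
    import numpy as np

    vals = [0.5, 1.25, -3.0]
    half = bytearray(np.array(vals).astype(np.float16).tobytes())
    full = bytearray(np.array(vals).astype(np.float32).tobytes())
    assert len(half) * 2 == len(full)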
def stringfy(value):
return ', '.join('"{0}"'.format(w) for w in value)
def convert_to_source(net_def, mode_pb_checksum, template_dir, obfuscate, model_tag, output, runtime, embed_model_data):
if obfuscate:
obfuscate_name(net_def)
else:
rename_tensor(net_def)
# Capture our current directory
print template_dir
# Create the jinja2 environment.
j2_env = Environment(loader=FileSystemLoader(template_dir), trim_blocks=True)
j2_env.filters['stringfy'] = stringfy
output_dir = os.path.dirname(output) + '/'
# generate tensor source files
template_name = 'tensor_source.jinja2'
model_data = []
offset = 0
counter = 0
for t in net_def.tensors:
tensor_info = TensorInfo(counter, t, runtime)
# align
if tensor_info.data_type != 'DT_UINT8' and offset % 4 != 0:
padding = 4 - offset % 4
model_data.extend(bytearray([0] * padding))
offset += padding
return ', '.join('"{0}"'.format(w) for w in value)
def convert_to_source(net_def, mode_pb_checksum, template_dir, obfuscate,
model_tag, output, runtime, embed_model_data):
if obfuscate:
obfuscate_name(net_def)
else:
rename_tensor(net_def)
# Capture our current directory
print template_dir
# Create the jinja2 environment.
j2_env = Environment(
loader=FileSystemLoader(template_dir), trim_blocks=True)
j2_env.filters['stringfy'] = stringfy
output_dir = os.path.dirname(output) + '/'
# generate tensor source files
template_name = 'tensor_source.jinja2'
model_data = []
offset = 0
counter = 0
for t in net_def.tensors:
tensor_info = TensorInfo(counter, t, runtime)
# align
if tensor_info.data_type != 'DT_UINT8' and offset % 4 != 0:
padding = 4 - offset % 4
model_data.extend(bytearray([0] * padding))
offset += padding
source = j2_env.get_template(template_name).render(
tensor_info=tensor_info,
tensor=t,
tag=model_tag,
runtime=runtime,
offset=offset,
)
model_data.extend(tensor_info.data)
offset += len(tensor_info.data)
with open(output_dir + 'tensor' + str(counter) + '.cc', "wb") as f:
f.write(source)
counter += 1
# generate tensor data
template_name = 'tensor_data.jinja2'
source = j2_env.get_template(template_name).render(
tensor_info = tensor_info,
tensor = t,
tag = model_tag,
runtime = runtime,
offset = offset,
)
model_data.extend(tensor_info.data)
offset += len(tensor_info.data)
with open(output_dir + 'tensor' + str(counter) + '.cc', "wb") as f:
f.write(source)
counter += 1
# generate tensor data
template_name = 'tensor_data.jinja2'
source = j2_env.get_template(template_name).render(
tag = model_tag,
embed_model_data = embed_model_data,
model_data_size = offset,
model_data = model_data
)
with open(output_dir + 'tensor_data' + '.cc', "wb") as f:
f.write(source)
if not embed_model_data:
f = open(output_dir + model_tag + '.data', "wb")
f.write(bytearray(model_data))
f.close()
# generate op source files
template_name = 'operator.jinja2'
counter = 0
op_size = len(net_def.op)
for start in range(0, op_size, 10):
tag=model_tag,
embed_model_data=embed_model_data,
model_data_size=offset,
model_data=model_data)
with open(output_dir + 'tensor_data' + '.cc', "wb") as f:
f.write(source)
if not embed_model_data:
f = open(output_dir + model_tag + '.data', "wb")
f.write(bytearray(model_data))
f.close()
# generate op source files
template_name = 'operator.jinja2'
counter = 0
op_size = len(net_def.op)
for start in range(0, op_size, 10):
source = j2_env.get_template(template_name).render(
start=start,
end=min(start + 10, op_size),
net=net_def,
tag=model_tag,
runtime=runtime,
)
with open(output_dir + 'op' + str(counter) + '.cc', "wb") as f:
f.write(source)
counter += 1
# generate model source files
template_name = 'model.jinja2'
tensors = [
TensorInfo(i, net_def.tensors[i], runtime)
for i in range(len(net_def.tensors))
]
source = j2_env.get_template(template_name).render(
start = start,
end = min(start+10, op_size),
net = net_def,
tag = model_tag,
runtime = runtime,
)
with open(output_dir + 'op' + str(counter) + '.cc', "wb") as f:
f.write(source)
counter += 1
# generate model source files
template_name = 'model.jinja2'
tensors = [TensorInfo(i, net_def.tensors[i], runtime) for i in range(len(net_def.tensors))]
source = j2_env.get_template(template_name).render(
tensors = tensors,
net = net_def,
tag = model_tag,
runtime = runtime,
model_pb_checksum = mode_pb_checksum
)
with open(output, "wb") as f:
f.write(source)
# generate model header file
template_name = 'model_header.jinja2'
source = j2_env.get_template(template_name).render(
tag = model_tag,
)
with open(output_dir + model_tag + '.h', "wb") as f:
f.write(source)
tensors=tensors,
net=net_def,
tag=model_tag,
runtime=runtime,
model_pb_checksum=mode_pb_checksum)
with open(output, "wb") as f:
f.write(source)
# generate model header file
template_name = 'model_header.jinja2'
source = j2_env.get_template(template_name).render(tag=model_tag)
with open(output_dir + model_tag + '.h', "wb") as f:
f.write(source)
@@ -8,51 +8,41 @@ from mace.python.tools import memory_optimizer
from tensorflow.core.framework import graph_pb2
from tensorflow.core.framework import tensor_shape_pb2
padding_mode = {
'VALID': 0,
'SAME': 1,
'FULL': 2
}
pooling_type_mode = {
'AvgPool': 1,
'MaxPool': 2
}
padding_mode = {'VALID': 0, 'SAME': 1, 'FULL': 2}
pooling_type_mode = {'AvgPool': 1, 'MaxPool': 2}
# The order must match the eltwise types in mace/kernels/eltwise.h
# and the cwise types in mace/kernels/cwise.h, because these math ops
# have to stay compatible with both "EltWise" and "CWise".
math_type_mode = {
'MUL': 0,
'ADD': 1,
'MAX': 2,
'MIN': 3,
'SUB': 4,
'DIV': 5,
'NEG': 6,
'ABS': 7
}
buffer_type_map = {
'CONV2D_FILTER': 0,
'IN_OUT_CHANNEL': 1,
'ARGUMENT': 2,
'IN_OUT_HEIGHT': 3,
'IN_OUT_WIDTH': 4,
'WINOGRAD_FILTER': 5,
'DW_CONV2D_FILTER': 6,
}
data_type_map = {'DT_HALF': mace_pb2.DT_HALF, 'DT_FLOAT': mace_pb2.DT_FLOAT}
activation_name_map = {
'Relu': 'RELU',
'Sigmoid': 'SIGMOID',
'Tanh': 'TANH',
'Relu6': 'RELUX'
}
BATCH_NORM_ORDER = ["Add", "Rsqrt", "Mul", "Mul", "Mul", "Sub", "Add"]
@@ -62,1123 +52,1170 @@ MACE_OUTPUT_NODE_NAME = "mace_output_node"
OPENCL_IMAGE_MAX_SIZE = 16384
def get_input_tensor(op, index):
input_tensor = op.inputs[index]
if input_tensor.op.type == 'Reshape':
input_tensor = get_input_tensor(input_tensor.op, 0)
return input_tensor
class TFConverter(object):
def __init__(self, tf_ops, net_def, dt, device, winograd):
self.net_def = net_def
self.tf_ops = tf_ops
self.dt = dt
self.device = device
self.winograd = winograd
self.tf_graph = {}
self.tf_parents = {}
self.resolved_ops = {}
self.unused_tensor = set()
self.transpose_filter_tensor = {}
self.reshape_tensor = {}
self.ops = {}
for op in tf_ops:
self.ops[op.name] = op
for op in tf_ops:
self.resolved_ops[op.name] = 0
for input in op.inputs:
input_name = input.name[:-2]
if input_name not in self.tf_graph:
self.tf_graph[input_name] = []
self.tf_graph[input_name].append(op)
if op.name not in self.tf_parents:
self.tf_parents[op.name] = []
self.tf_parents[op.name].append(self.ops[input_name])
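# add_buffer_to_image appends a BufferToImage op so the named tensor is
# staged into OpenCL image memory; it returns the new tensor name.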
def add_buffer_to_image(self, input_name, input_type):
output_name = input_name[:-2] + "_b2i" + input_name[-2:]
op_def = self.net_def.op.add()
op_def.name = output_name[:-2]
op_def.type = 'BufferToImage'
op_def.input.extend([input_name])
op_def.output.extend([output_name])
arg = op_def.arg.add()
arg.name = 'buffer_type'
arg.i = buffer_type_map[input_type]
arg = op_def.arg.add()
arg.name = 'mode'
arg.i = 0
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
return output_name
def add_image_to_buffer(self, input_name, input_type):
output_name = input_name[:-2] + "_i2b" + input_name[-2:]
op_def = self.net_def.op.add()
op_def.name = output_name[:-2]
op_def.type = 'ImageToBuffer'
op_def.input.extend([input_name])
op_def.output.extend([output_name])
arg = op_def.arg.add()
arg.name = 'buffer_type'
arg.i = buffer_type_map[input_type]
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
return output_name
def add_gpu_input_transform(self, names):
for name in names:
new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
op_def.name = name
op_def.type = 'BufferToImage'
op_def.input.extend([new_input_name])
op_def.output.extend([name + ':0'])
epsilon_arg = op_def.arg.add()
epsilon_arg.name = 'buffer_type'
epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL']
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
def add_neon_input_transform(self, names):
for name in names:
new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
op_def.name = name
op_def.type = 'Transpose'
op_def.input.extend([new_input_name])
op_def.output.extend([name + ':0'])
dims_arg = op_def.arg.add()
dims_arg.name = 'dims'
dims_arg.ints.extend([0, 3, 1, 2])
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
def add_gpu_output_transform(self, names):
for name in names:
output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
op_def.name = output_name[:-2]
op_def.type = 'ImageToBuffer'
op_def.input.extend([name + ':0'])
op_def.output.extend([output_name])
epsilon_arg = op_def.arg.add()
epsilon_arg.name = 'buffer_type'
epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL']
def add_neon_output_transform(self, names):
for name in names:
output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
op_def.name = output_name[:-2]
op_def.type = 'Transpose'
op_def.input.extend([name + ':0'])
op_def.output.extend([output_name])
dims_arg = op_def.arg.add()
dims_arg.name = 'dims'
dims_arg.ints.extend([0, 2, 3, 1])
@staticmethod
def add_output_shape(outputs, op):
output_shapes = []
for output in outputs:
output_shape = mace_pb2.OutputShape()
if isinstance(output, list):
output_shape.dims.extend(output)
elif isinstance(output, tf.Tensor):
if output.shape.num_elements() is not None:
output_shape.dims.extend(output.shape.as_list())
else:
raise ValueError('output type not supported: ', type(output))
output_shapes.append(output_shape)
op.output_shape.extend(output_shapes)
def add_tensor(self, name, shape, tf_dt, value):
tensor = self.net_def.tensors.add()
tensor.name = name
shape = list(shape)
tensor.dims.extend(shape)
if tf_dt == tf.float32:
tensor.data_type = mace_pb2.DT_FLOAT
tensor.float_data.extend(value.flat)
elif tf_dt == tf.int32:
tensor.data_type = mace_pb2.DT_INT32
tensor.int32_data.extend(value.flat)
else:
raise Exception("Not supported tensor type: " + tf_dt.name)
def convert_reshape(self, op):
input_tensor = get_input_tensor(op, 0)
shape_tensor = get_input_tensor(op, 1)
shape_value = shape_tensor.eval().astype(np.int32)
self.unused_tensor.add(shape_tensor.name)
self.reshape_tensor[input_tensor.name] = shape_value
self.resolved_ops[op.name] = 1
def convert_tensor(self, op):
output_name = op.outputs[0].name
if output_name not in self.unused_tensor:
tensor = self.net_def.tensors.add()
tf_tensor = op.outputs[0].eval()
if output_name in self.transpose_filter_tensor:
tf_tensor = tf_tensor.transpose(
self.transpose_filter_tensor[output_name])
if output_name in self.reshape_tensor:
tf_tensor = tf_tensor.reshape(self.reshape_tensor[output_name])
tensor.name = op.outputs[0].name
shape = list(tf_tensor.shape)
tensor.dims.extend(shape)
tf_dt = op.get_attr('dtype')
if tf_dt == tf.float32:
tensor.data_type = mace_pb2.DT_FLOAT
tensor.float_data.extend(tf_tensor.astype(np.float32).flat)
elif tf_dt == tf.int32:
tensor.data_type = mace_pb2.DT_INT32
tensor.int32_data.extend(tf_tensor.astype(np.int32).flat)
else:
raise Exception("Not supported tensor type: " + tf_dt.name)
self.resolved_ops[op.name] = 1
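# Winograd is only used for 3x3, stride-1, non-depthwise convolutions on
# GPU whose transformed tiles fit within the OpenCL image size limit.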
def check_winograd_conv(self, op):
filter_shape = get_input_tensor(op, 1).shape.as_list()
strides = op.get_attr('strides')[1:3]
output_shape = op.outputs[0].shape.as_list()
if len(output_shape) == 0 or output_shape[0] is None:
return False
width = output_shape[0] * ((output_shape[1] + 1) / 2) * ((
output_shape[2] + 1) / 2)
return self.winograd and op.type != 'DepthwiseConv2dNative' and \
self.device == 'gpu' and filter_shape[0] == 3 and \
(filter_shape[0] == filter_shape[1]) and \
(strides[0] == 1) and (strides[0] == strides[1]) and \
(16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \
(16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \
(width < OPENCL_IMAGE_MAX_SIZE)
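# The Winograd path lowers Conv2D into
# WinogradTransform -> MatMul -> WinogradInverseTransform, fusing any
# trailing BiasAdd/activation into the inverse transform.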
def convert_winograd_conv(self, op):
filter_tensor = get_input_tensor(op, 1)
filter_shape = filter_tensor.shape.as_list()
output_shape = op.outputs[0].shape.as_list()
self.transpose_filter_tensor[filter_tensor.name] = (3, 2, 0, 1)
filter_name = self.add_buffer_to_image(op.inputs[1].name,
"WINOGRAD_FILTER")
# Input transform
wt_op = mace_pb2.OperatorDef()
arg = wt_op.arg.add()
arg.name = 'T'
arg.i = self.dt
padding_arg = wt_op.arg.add()
padding_arg.name = 'padding'
padding_arg.i = padding_mode[op.get_attr('padding')]
wt_op.name = op.name + '_input_transform'
wt_op.type = 'WinogradTransform'
wt_op.input.extend([op.inputs[0].name])
wt_output_name = wt_op.name + ":0"
wt_op.output.extend([wt_output_name])
wt_output_shape = mace_pb2.OutputShape()
wt_output_width = output_shape[0] * ((output_shape[1] + 1) / 2) * ((
output_shape[2] + 1) / 2)
wt_output_shape.dims.extend([16, filter_shape[2], wt_output_width, 1])
wt_op.output_shape.extend([wt_output_shape])
# MatMul
matmul_op = mace_pb2.OperatorDef()
arg = matmul_op.arg.add()
arg.name = 'T'
arg.i = self.dt
matmul_op.name = op.name + '_matmul'
matmul_op.type = 'MatMul'
matmul_op.input.extend([filter_name, wt_output_name])
matmul_output_name = matmul_op.name + ":0"
matmul_op.output.extend([matmul_output_name])
matmul_output_shape = mace_pb2.OutputShape()
matmul_output_shape.dims.extend(
[16, filter_shape[3], wt_output_width, 1])
matmul_op.output_shape.extend([matmul_output_shape])
# Inverse transform
iwt_op = mace_pb2.OperatorDef()
arg = iwt_op.arg.add()
arg.name = 'T'
arg.i = self.dt
batch_arg = iwt_op.arg.add()
batch_arg.name = 'batch'
batch_arg.i = output_shape[0]
height_arg = iwt_op.arg.add()
height_arg.name = 'height'
height_arg.i = output_shape[1]
width_arg = iwt_op.arg.add()
width_arg.name = 'width'
width_arg.i = output_shape[2]
iwt_op.name = op.name + '_inverse_transform'
iwt_op.type = 'WinogradInverseTransform'
iwt_op.input.extend([matmul_output_name])
final_op = op
self.resolved_ops[op.name] = 1
if len(self.tf_graph[op.name]
) == 1 and self.tf_graph[op.name][0].type == 'BiasAdd':
bias_add_op = self.tf_graph[op.name][0]
output_name = self.add_buffer_to_image(
get_input_tensor(bias_add_op, 1).name, "ARGUMENT")
iwt_op.input.extend([output_name])
final_op = bias_add_op
self.resolved_ops[bias_add_op.name] = 1
if len(self.tf_graph[final_op.name]) == 1 and \
self.tf_graph[final_op.name][0].type in activation_name_map:
activation_op = self.tf_graph[final_op.name][0]
fused_act_arg = iwt_op.arg.add()
fused_act_arg.name = 'activation'
fused_act_arg.s = activation_name_map[activation_op.type]
if activation_op.type == 'Relu6':
max_limit_arg = iwt_op.arg.add()
max_limit_arg.name = 'max_limit'
max_limit_arg.f = 6
final_op = activation_op
self.resolved_ops[activation_op.name] = 1
iwt_op.output.extend([output.name for output in final_op.outputs])
self.add_output_shape(final_op.outputs, iwt_op)
self.net_def.op.extend([wt_op, matmul_op, iwt_op])
def convert_conv2d(self, op):
op_def = mace_pb2.OperatorDef()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
if op.type == 'DepthwiseConv2dNative':
op_def.type = 'DepthwiseConv2d'
if self.device == 'neon':
self.transpose_filter_tensor[get_input_tensor(
op, 1).name] = (3, 2, 0, 1)
else:
op_def.type = op.type
if self.device == 'neon':
self.transpose_filter_tensor[get_input_tensor(
op, 1).name] = (3, 2, 0, 1)
else:
self.transpose_filter_tensor[get_input_tensor(
op, 1).name] = (0, 1, 3, 2)
if self.device == 'gpu':
op_def.input.extend([op.inputs[0].name])
if op_def.type == 'DepthwiseConv2d':
buffer_type = "DW_CONV2D_FILTER"
else:
buffer_type = "CONV2D_FILTER"
output_name = self.add_buffer_to_image(
get_input_tensor(op, 1).name, buffer_type)
op_def.input.extend([output_name])
else:
op_def.input.extend(
[get_input_tensor(op, i).name for i in range(len(op.inputs))])
padding_arg = op_def.arg.add()
padding_arg.name = 'padding'
padding_arg.i = padding_mode[op.get_attr('padding')]
strides_arg = op_def.arg.add()
strides_arg.name = 'strides'
strides_arg.ints.extend(op.get_attr('strides')[1:3])
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
if self.device == 'neon':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
final_op = op
self.resolved_ops[op.name] = 1
if len(self.tf_graph.get(op.name, [])) == 1 and \
self.tf_graph[op.name][0].type == 'BiasAdd':
bias_add_op = self.tf_graph[op.name][0]
if self.device == 'gpu':
output_name = self.add_buffer_to_image(
get_input_tensor(bias_add_op, 1).name, "ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([get_input_tensor(bias_add_op, 1).name])
final_op = bias_add_op
self.resolved_ops[bias_add_op.name] = 1
if len(self.tf_graph.get(final_op.name, [])) == 1 and \
self.tf_graph[final_op.name][0].type in activation_name_map:
activation_op = self.tf_graph[final_op.name][0]
if op_def.type == "Conv2D":
op_def.type = "FusedConv2D"
fused_act_arg = op_def.arg.add()
fused_act_arg.name = 'activation'
fused_act_arg.s = activation_name_map[activation_op.type]
if activation_op.type == 'Relu6':
max_limit_arg = op_def.arg.add()
max_limit_arg.name = 'max_limit'
max_limit_arg.f = 6
final_op = activation_op
self.resolved_ops[activation_op.name] = 1
op_def.output.extend([output.name for output in final_op.outputs])
self.add_output_shape(final_op.outputs, op_def)
self.net_def.op.extend([op_def])
def convert_fused_batchnorm(self, op):
op_def = mace_pb2.OperatorDef()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
if self.device == 'neon':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
op_def.name = op.name
op_def.type = 'FoldedBatchNorm'
gamma_tensor = get_input_tensor(op, 1)
for i in range(1, 5):
input_tensor = get_input_tensor(op, i)
assert input_tensor.shape == gamma_tensor.shape
self.unused_tensor.add(input_tensor.name)
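# fold batch norm into a linear transform:
# scale = gamma / sqrt(var + epsilon), offset = beta - mean * scale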
gamma_value = get_input_tensor(op, 1).eval().astype(np.float32)
beta_value = get_input_tensor(op, 2).eval().astype(np.float32)
mean_value = get_input_tensor(op, 3).eval().astype(np.float32)
var_value = get_input_tensor(op, 4).eval().astype(np.float32)
epsilon_value = op.get_attr('epsilon')
scale_value = ((1.0 / np.vectorize(math.sqrt)
(var_value + epsilon_value)) * gamma_value)
offset_value = (-mean_value * scale_value) + beta_value
idx = gamma_tensor.name.rfind('/')
name_prefix = gamma_tensor.name[:idx] + '/'
input_names = [name_prefix + 'scale:0', name_prefix + 'offset:0']
self.add_tensor(input_names[0], gamma_value.shape, gamma_tensor.dtype,
scale_value)
self.add_tensor(input_names[1], gamma_value.shape, gamma_tensor.dtype,
offset_value)
op_def.input.extend([op.inputs[0].name])
if self.device == 'gpu':
for name in input_names:
output_name = self.add_buffer_to_image(name, "ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([name for name in input_names])
self.resolved_ops[op.name] = 1
final_op = op
if len(self.tf_graph[op.name]) == 1 \
and self.tf_graph[op.name][0].type in activation_name_map:
activation_op = self.tf_graph[op.name][0]
fused_act_arg = op_def.arg.add()
fused_act_arg.name = 'activation'
fused_act_arg.s = activation_name_map[activation_op.type]
if activation_op.type == 'Relu6':
max_limit_arg = op_def.arg.add()
max_limit_arg.name = 'max_limit'
max_limit_arg.f = 6
final_op = activation_op
self.resolved_ops[activation_op.name] = 1
op_def.output.extend([final_op.outputs[0].name])
self.add_output_shape([final_op.outputs[0]], op_def)
self.net_def.op.extend([op_def])
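# an unfused TF batch norm appears as the op chain in BATCH_NORM_ORDER;
# walk that pattern and collapse it into a single BatchNorm op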
def convert_batchnorm(self, op):
bn_ops = []
bn_ops.append(op)
for i in range(1, 3):
if len(self.tf_graph[bn_ops[i-1].name]) == 1 and \
self.tf_graph[bn_ops[i-1].name][0].type == BATCH_NORM_ORDER[i]:
bn_ops.append(self.tf_graph[bn_ops[i - 1].name][0])
else:
raise Exception('Invalid BatchNorm Op')
if len(self.tf_graph[bn_ops[2].name]) == 2 and \
self.tf_graph[bn_ops[2].name][0].type == \
BATCH_NORM_ORDER[3] and \
self.tf_graph[bn_ops[2].name][1].type == BATCH_NORM_ORDER[4]:
bn_ops.append(self.tf_graph[bn_ops[2].name][0])
bn_ops.append(self.tf_graph[bn_ops[2].name][1])
else:
raise Exception('Invalid BatchNorm Op')
bn_ops.append(self.tf_graph[bn_ops[4].name][0])
bn_ops.append(self.tf_graph[bn_ops[3].name][0])
op_def = mace_pb2.OperatorDef()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
input_name = get_input_tensor(bn_ops[3], 0).name
gamma = get_input_tensor(bn_ops[2], 1).name
beta = get_input_tensor(bn_ops[5], 0).name
mean = get_input_tensor(bn_ops[4], 0).name
variance = get_input_tensor(bn_ops[0], 0).name
op_def.name = op.name[:-4] # remove /add
op_def.type = 'BatchNorm'
if self.device == 'gpu':
op_def.input.extend([input_name])
for tensor_name in [gamma, beta, mean, variance]:
output_name = self.add_buffer_to_image(tensor_name, "ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([input_name, gamma, beta, mean, variance])
op_def.output.extend([output.name for output in bn_ops[6].outputs])
self.add_output_shape(bn_ops[6].outputs, op_def)
epsilon_arg = op_def.arg.add()
epsilon_arg.name = 'epsilon'
epsilon_arg.f = get_input_tensor(op, 1).eval().astype(np.float)
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
if self.device == 'neon':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
self.unused_tensor.add(get_input_tensor(op, 1).name)
self.net_def.op.extend([op_def])
for i in range(0, 7):
self.resolved_ops[bn_ops[i].name] = 1
def convert_pooling(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = 'Pooling'
op_def.input.extend([input.name for input in op.inputs])
op_def.output.extend([output.name for output in op.outputs])
self.add_output_shape(op.outputs, op_def)
pooling_type_arg = op_def.arg.add()
pooling_type_arg.name = 'pooling_type'
pooling_type_arg.i = pooling_type_mode[op.type]
padding_arg = op_def.arg.add()
padding_arg.name = 'padding'
padding_arg.i = padding_mode[op.get_attr('padding')]
strides_arg = op_def.arg.add()
strides_arg.name = 'strides'
strides_arg.ints.extend(op.get_attr('strides')[1:3])
kernels_arg = op_def.arg.add()
kernels_arg.name = 'kernels'
kernels_arg.ints.extend(op.get_attr('ksize')[1:3])
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
if self.device == 'neon':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
self.resolved_ops[op.name] = 1
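# Mean over the spatial axes is lowered to a VALID AvgPool whose kernel
# spans the whole feature map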
def convert_global_avg_pooling(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = 'Pooling'
op_def.input.extend([op.inputs[0].name])
op_def.output.extend([output.name for output in op.outputs])
self.add_output_shape(op.outputs, op_def)
pooling_type_arg = op_def.arg.add()
pooling_type_arg.name = 'pooling_type'
pooling_type_arg.i = pooling_type_mode['AvgPool']
padding_arg = op_def.arg.add()
padding_arg.name = 'padding'
padding_arg.i = padding_mode['VALID']
strides_arg = op_def.arg.add()
strides_arg.name = 'strides'
strides_arg.ints.extend([1, 1])
kernels_arg = op_def.arg.add()
kernels_arg.name = 'kernels'
kernels_arg.ints.extend(op.inputs[0].shape.as_list()[1:3])
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
if self.device == 'neon':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
self.resolved_ops[op.name] = 1
def convert_activation(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = 'Activation'
activation_arg = op_def.arg.add()
activation_arg.name = 'activation'
activation_arg.s = activation_name_map[op.type]
op_def.input.extend([input.name for input in op.inputs])
op_def.output.extend([output.name for output in op.outputs])
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
def convert_relu6(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = 'Activation'
op_def.input.extend([input.name for input in op.inputs])
op_def.output.extend([output.name for output in op.outputs])
self.add_output_shape(op.outputs, op_def)
activation_arg = op_def.arg.add()
activation_arg.name = 'activation'
activation_arg.s = "RELUX"
max_limit_arg = op_def.arg.add()
max_limit_arg.name = 'max_limit'
max_limit_arg.f = 6
self.resolved_ops[op.name] = 1
def convert_add(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = "AddN"
op_def.input.extend([input.name for input in op.inputs])
op_def.output.extend([output.name for output in op.outputs])
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
def convert_concat(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = "Concat"
op_def.input.extend([input.name for input in op.inputs[:-1]])
op_def.output.extend([output.name for output in op.outputs])
axis_arg = op_def.arg.add()
axis_arg.name = 'axis'
axis = get_input_tensor(op, len(op.inputs) - 1).eval().astype(np.int32)
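# NEON runs in NCHW layout, so a channel concat (axis 3 in NHWC) maps
# to axis 1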
if self.device == 'neon' and axis == 3:
axis = 1
axis_arg.i = axis
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
self.unused_tensor.add(get_input_tensor(op, len(op.inputs) - 1).name)
def convert_resize_bilinear(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = "ResizeBilinear"
op_def.input.extend([op.inputs[0].name])
op_def.output.extend([output.name for output in op.outputs])
size_arg = op_def.arg.add()
size_arg.name = 'size'
size_arg.ints.extend(
get_input_tensor(op, 1).eval().astype(np.int32).flat)
size_arg = op_def.arg.add()
size_arg.name = 'align_corners'
size_arg.i = op.get_attr('align_corners')
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
self.unused_tensor.add(get_input_tensor(op, 1).name)
def convert_math(self, op, math_type):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
if len(op.inputs) == 1:
op_def.type = "CWise"
op_def.input.extend([input.name for input in op.inputs])
x_arg = op_def.arg.add()
x_arg.name = 'x'
x_arg.f = 0
elif len(op.inputs) >= 2:
input_tensor0 = get_input_tensor(op, 0)
input_tensor1 = get_input_tensor(op, 1)
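# same-shape operands become an element-wise op; otherwise the constant
# side is folded into the scalar 'x' argument of a CWise op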
if input_tensor0.shape == input_tensor1.shape:
op_def.type = "Eltwise"
op_def.input.extend([input.name for input in op.inputs])
else:
op_def.type = "CWise"
x_value = 0
if len(input_tensor1.shape) == 4:
op_def.input.extend([op.inputs[1].name])
x_value = get_input_tensor(op, 0).eval().astype(np.float32)
else:
op_def.input.extend([op.inputs[0].name])
x_value = get_input_tensor(op, 1).eval().astype(np.float32)
x_arg = op_def.arg.add()
x_arg.name = 'x'
x_arg.f = x_value
type_arg = op_def.arg.add()
type_arg.name = 'type'
type_arg.i = math_type_mode[math_type]
op_def.output.extend([output.name for output in op.outputs])
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
def convert_depth_to_space(self, op, d2s):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = op.type
op_def.input.extend([op.inputs[0].name])
op_def.output.extend([output.name for output in op.outputs])
size_arg = op_def.arg.add()
size_arg.name = 'block_size'
size_arg.i = op.get_attr('block_size')
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
def convert_bias_add(self, op):
op_def = mace_pb2.OperatorDef()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = "BiasAdd"
op_def.input.extend([op.inputs[0].name])
if self.device == 'gpu':
output_name = self.add_buffer_to_image(
get_input_tensor(op, 1).name, "ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([get_input_tensor(op, 1).name])
op_def.output.extend([output.name for output in op.outputs])
self.add_output_shape(op.outputs, op_def)
self.net_def.op.extend([op_def])
self.resolved_ops[op.name] = 1
def convert_space_to_batch(self, op, b2s):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = op.type
op_def.input.extend([op.inputs[0].name])
op_def.output.extend([output.name for output in op.outputs])
size_arg = op_def.arg.add()
size_arg.name = 'block_shape'
size_arg.ints.extend(
get_input_tensor(op, 1).eval().astype(np.int32).flat)
size_arg = op_def.arg.add()
if b2s:
size_arg.name = 'crops'
else:
size_arg.name = 'paddings'
size_arg.ints.extend(
get_input_tensor(op, 2).eval().astype(np.int32).flat)
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
self.unused_tensor.add(get_input_tensor(op, 1).name)
self.unused_tensor.add(get_input_tensor(op, 2).name)
def is_atrous_conv2d(self, op):
return op.type == 'SpaceToBatchND' and \
len(self.tf_graph[op.name]) == 1 and \
self.tf_graph[op.name][0].type == 'Conv2D'
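# TF expresses dilated convolution as
# SpaceToBatchND -> Conv2D -> BatchToSpaceND; the dilation rate is
# recovered from the block shape and the trio collapses into one Conv2D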
def convert_atrous_conv2d(self, op):
op_def = mace_pb2.OperatorDef()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
conv_op = self.tf_graph[op.name][0]
op_def.name = conv_op.name
op_def.type = conv_op.type
self.transpose_filter_tensor[get_input_tensor(conv_op,
1).name] = (0, 1, 3, 2)
if self.device == 'gpu':
op_def.input.extend([op.inputs[0].name])
output_name = self.add_buffer_to_image(
get_input_tensor(conv_op, 1).name, "CONV2D_FILTER")
op_def.input.extend([output_name])
else:
op_def.input.extend([get_input_tensor(op, 0).name])
op_def.input.extend([get_input_tensor(conv_op, 1).name])
dilation_arg = op_def.arg.add()
dilation_arg.name = 'dilations'
dilation_arg.ints.extend(
get_input_tensor(op, 1).eval().astype(np.int32).flat)
padding_arg = op_def.arg.add()
padding_arg.name = 'padding'
padding_values = get_input_tensor(op, 2).eval().astype(np.int32).flat
if len(padding_values) > 0 and padding_values[0] > 0:
padding_arg.i = padding_mode['SAME']
else:
padding_arg.i = padding_mode['VALID']
self.unused_tensor.add(get_input_tensor(op, 1).name)
self.unused_tensor.add(get_input_tensor(op, 2).name)
strides_arg = op_def.arg.add()
strides_arg.name = 'strides'
strides_arg.ints.extend([1, 1])
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
if self.device == 'neon':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
final_op = conv_op
self.resolved_ops[op.name] = 1
self.resolved_ops[conv_op.name] = 1
if len(self.tf_graph[final_op.name]
) == 1 and self.tf_graph[final_op.name][0].type == 'BiasAdd':
bias_add_op = self.tf_graph[final_op.name][0]
if self.device == 'gpu':
output_name = self.add_buffer_to_image(
get_input_tensor(bias_add_op, 1).name, "ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([get_input_tensor(bias_add_op, 1).name])
final_op = bias_add_op
self.resolved_ops[bias_add_op.name] = 1
if len(self.tf_graph[final_op.name]) == 1 and \
self.tf_graph[final_op.name][0].type == 'BatchToSpaceND':
final_op = self.tf_graph[final_op.name][0]
self.resolved_ops[final_op.name] = 1
self.unused_tensor.add(get_input_tensor(final_op, 1).name)
self.unused_tensor.add(get_input_tensor(final_op, 2).name)
else:
raise Exception('Convert atrous conv error: no BatchToSpaceND op')
if len(self.tf_graph[final_op.name]) == 1 and \
self.tf_graph[final_op.name][0].type == 'Relu':
relu_op = self.tf_graph[final_op.name][0]
op_def.type = "FusedConv2D"
fused_relu_arg = op_def.arg.add()
fused_relu_arg.name = 'activation'
fused_relu_arg.s = "RELU"
final_op = relu_op
self.resolved_ops[relu_op.name] = 1
op_def.output.extend([output.name for output in final_op.outputs])
self.add_output_shape(final_op.outputs, op_def)
self.net_def.op.extend([op_def])
def is_softmax(self, op):
return op.type == 'Softmax' and \
len(self.tf_parents[op.name]) == 1 and \
self.tf_parents[op.name][0].type == 'Reshape' and \
len(self.tf_graph[op.name]) == 1 and \
self.tf_graph[op.name][0].type == 'Reshape'
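# Softmax is matched together with its surrounding Reshape pair (and an
# optional Squeeze); the reshapes are dropped and the output shape is
# padded to 4-D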
def convert_softmax(self, softmax_op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
# deal with first Reshape op
parent_reshape_op = self.tf_parents[softmax_op.name][0]
self.unused_tensor.add(get_input_tensor(parent_reshape_op, 1).name)
self.resolved_ops[parent_reshape_op.name] = 1
# FIXME: hardcode for inception_v3
# remove squeeze if exist
squeeze_op = self.tf_parents[parent_reshape_op.name][0]
if squeeze_op.type == 'Squeeze':
op_def.input.extend([squeeze_op.inputs[0].name])
self.resolved_ops[squeeze_op.name] = 1
# remove shape if exist
children_ops = self.tf_graph[squeeze_op.name]
print children_ops
if len(children_ops) > 1 and children_ops[0].type == 'Shape':
self.unused_tensor.add(
get_input_tensor(children_ops[1], 0).name)
self.resolved_ops[children_ops[1].name] = 1
else:
op_def.input.extend([parent_reshape_op.inputs[0].name])
# deal with Softmax op
op_def.name = softmax_op.name
op_def.type = softmax_op.type
self.resolved_ops[softmax_op.name] = 1
# deal with last Reshape op
reshape_op = self.tf_graph[softmax_op.name][0]
self.unused_tensor.add(get_input_tensor(reshape_op, 1).name)
shape = [dim.value for dim in reshape_op.outputs[0].shape]
if len(shape) == 2:
shape = [1, 1, shape[0], shape[1]]
op_def.output.extend([output.name for output in reshape_op.outputs])
self.add_output_shape([shape], op_def)
self.resolved_ops[reshape_op.name] = 1
def convert_normal_op(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = op.type
op_def.input.extend([input.name for input in op.inputs])
op_def.output.extend([output.name for output in op.outputs])
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
def replace_in_out_name(self, input_names, output_names):
in_names = set([input_name + ":0" for input_name in input_names])
out_names = set([output_name + ":0" for output_name in output_names])
for op in self.net_def.op:
if op.input[0] in in_names:
op.input[0] = MACE_INPUT_NODE_NAME + '_' + op.input[0]
if op.output[0] in out_names:
op.output[0] = MACE_OUTPUT_NODE_NAME + '_' + op.output[0]
def convert(self, input_nodes, output_nodes):
if self.device == 'gpu':
self.add_gpu_input_transform(input_nodes)
if self.device == 'neon':
self.add_neon_input_transform(input_nodes)
for op in self.tf_ops:
if self.resolved_ops[op.name] == 1:
continue
if op.type in ['Placeholder', 'Identity']:
self.resolved_ops[op.name] = 1
pass
elif op.type == 'Const':
pass
elif op.type == 'Reshape':
self.convert_reshape(op)
elif self.is_atrous_conv2d(op):
self.convert_atrous_conv2d(op)
elif op.type == 'Conv2D' or op.type == 'DepthwiseConv2dNative':
if self.check_winograd_conv(op):
self.convert_winograd_conv(op)
else:
self.convert_conv2d(op)
elif op.type == 'FusedBatchNorm':
self.convert_fused_batchnorm(op)
elif op.type == 'Add' and op.name.endswith('batchnorm/add'):
self.convert_batchnorm(op)
elif op.type == 'AvgPool' or op.type == 'MaxPool':
self.convert_pooling(op)
elif op.type == 'Relu6':
self.convert_relu6(op)
elif op.type == 'Add':
self.convert_add(op)
elif op.type == 'ConcatV2':
self.convert_concat(op)
elif op.type == 'ResizeBilinear':
self.convert_resize_bilinear(op)
elif op.type == 'BiasAdd':
self.convert_bias_add(op)
elif op.type == 'SpaceToBatchND':
self.convert_space_to_batch(op, False)
elif op.type == 'BatchToSpaceND':
self.convert_space_to_batch(op, True)
elif op.type == 'DepthToSpace':
self.convert_depth_to_space(op, True)
elif op.type == 'SpaceToDepth':
self.convert_depth_to_space(op, False)
elif op.type in ['Neg', 'neg', 'Negative', 'negative']:
self.convert_math(op, 'NEG')
elif op.type == 'Mul':
self.convert_math(op, 'MUL')
elif op.type == 'Sub':
self.convert_math(op, 'SUB')
elif self.is_softmax(op):
self.convert_softmax(op)
elif op.type in ['Relu', 'Sigmoid', 'Tanh']:
self.convert_activation(op)
# FIXME: hardcode for inception_v3
elif op.type in ['Squeeze', 'Shape']:
self.resolved_ops[op.name] = 1
elif op.type == 'Mean':
# Global avg pooling
reduce_dims = op.inputs[1].eval()
if reduce_dims[0] == 1 and reduce_dims[1] == 2:
self.convert_global_avg_pooling(op)
self.unused_tensor.add(op.inputs[1].name)
else:
raise Exception('Unknown Op: %s, type: %s' % (op.name,
op.type))
# elif op.type in ['']:
# self.convert_normal_op(op)
else:
raise Exception('Unknown Op: %s, type: %s' % (op.name,
op.type))
for op in self.tf_ops:
if self.resolved_ops[op.name] == 1:
continue
elif op.type == 'Const':
self.convert_tensor(op)
else:
raise Exception('Unknown Op: %s, type: %s' % (op.name,
op.type))
if self.device == 'gpu':
self.add_gpu_output_transform(output_nodes)
if self.device == 'neon':
self.add_neon_output_transform(output_nodes)
if self.device == 'cpu':
self.replace_in_out_name(input_nodes, output_nodes)
for key in self.resolved_ops:
if self.resolved_ops[key] != 1:
                print 'Unresolved Op: %s' % key
class Optimizer:
def __init__(self, net_def, device):
self.net_def = net_def
self.device = device
self.mace_graph = {}
self.tensor_map = {}
for op in net_def.op:
for input_name in op.input:
if input_name not in self.mace_graph:
self.mace_graph[input_name] = []
self.mace_graph[input_name].append(op)
for tensor in net_def.tensors:
self.tensor_map[tensor.name] = tensor
def get_buffer_tensor_name(self, name):
if self.device == 'gpu':
return name[:-6] + name[-2:]
else:
return name
def fold_batch_norm(self):
unused_tensors = set()
new_tensors = []
new_net = mace_pb2.NetDef()
resolved_ops = set()
for op in self.net_def.op:
if op.name in resolved_ops:
pass
elif op.type == 'DepthwiseConv2d' and len(op.output) == 1 and \
self.mace_graph[op.output[0]][0].type == 'FoldedBatchNorm':
depthwise_conv2d_op = op
folded_bn_op = self.mace_graph[op.output[0]][0]
weight_buffer_name = self.get_buffer_tensor_name(
depthwise_conv2d_op.input[1])
weight_tensor = self.tensor_map[weight_buffer_name]
scale_buffer_name = self.get_buffer_tensor_name(
folded_bn_op.input[1])
offset_buffer_name = self.get_buffer_tensor_name(
folded_bn_op.input[2])
scale_tensor = self.tensor_map[scale_buffer_name]
weight_shape = weight_tensor.dims
idx = 0
if self.device == 'neon': # OIHW
for oc in range(weight_shape[0]):
for ic in range(weight_shape[1]):
for i in range(weight_shape[2]):
for j in range(weight_shape[3]):
weight_tensor.float_data[
idx] *= scale_tensor.float_data[
ic * weight_shape[0] + oc]
idx += 1
else: # HWIO
for i in range(weight_shape[0]):
for j in range(weight_shape[1]):
for ic in range(weight_shape[2]):
for oc in range(weight_shape[3]):
weight_tensor.float_data[
idx] *= scale_tensor.float_data[
ic * weight_shape[3] + oc]
idx += 1
new_tensors.append(weight_tensor)
unused_tensors.add(weight_tensor.name)
unused_tensors.add(scale_tensor.name)
if self.device == 'gpu':
scale_b2i_op = self.mace_graph[scale_buffer_name][0]
offset_b2i_op = self.mace_graph[offset_buffer_name][0]
resolved_ops.add(scale_b2i_op.name)
resolved_ops.add(offset_b2i_op.name)
new_net.op.extend([offset_b2i_op])
resolved_ops.add(depthwise_conv2d_op.name)
resolved_ops.add(folded_bn_op.name)
offset_tensor_name = folded_bn_op.input[2]
depthwise_conv2d_op.input.extend([offset_tensor_name])
for arg in folded_bn_op.arg:
if arg.name == 'activation':
act_arg = depthwise_conv2d_op.arg.add()
act_arg.name = arg.name
act_arg.s = arg.s
elif arg.name == 'max_limit':
act_arg = depthwise_conv2d_op.arg.add()
act_arg.name = arg.name
act_arg.f = arg.f
depthwise_conv2d_op.output[0] = folded_bn_op.output[0]
new_net.op.extend([depthwise_conv2d_op])
else:
new_net.op.extend([op])
for tensor in self.net_def.tensors:
if tensor.name in unused_tensors:
pass
else:
new_net.tensors.extend([tensor])
for tensor in new_tensors:
new_net.tensors.extend([tensor])
return new_net
def optimize(self):
new_net = self.fold_batch_norm()
return new_net
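# The fold above relies on FoldedBatchNorm computing y = x * scale + offset
# per channel, so scale is multiplied into the depthwise filter weights and
# offset is appended as an extra bias input. Illustrative usage, matching
# the call site in convert_to_mace_pb below:
#
#     optimizer = Optimizer(net_def, device)
#     net_def = optimizer.optimize()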
def add_shape_info(input_graph_def, input_nodes, input_shapes):
inputs_replaced_graph = graph_pb2.GraphDef()
for node in input_graph_def.node:
if node.name in input_nodes:
idx = input_nodes.index(node.name)
input_shape = input_shapes[idx]
placeholder_node = copy.deepcopy(node)
placeholder_node.attr.clear()
placeholder_node.attr['shape'].shape.dim.extend([
tensor_shape_pb2.TensorShapeProto.Dim(size=i)
for i in input_shape
])
placeholder_node.attr['dtype'].CopyFrom(node.attr['dtype'])
inputs_replaced_graph.node.extend([placeholder_node])
else:
inputs_replaced_graph.node.extend([copy.deepcopy(node)])
return inputs_replaced_graph
def convert_to_mace_pb(model_file, input_node, input_shape, output_node,
data_type, device, winograd):
net_def = mace_pb2.NetDef()
dt = data_type_map[data_type]
input_graph_def = tf.GraphDef()
with gfile.Open(model_file, "rb") as f:
data = f.read()
input_graph_def.ParseFromString(data)
input_nodes = [x for x in input_node.split(',')]
input_shapes = []
if input_shape != "":
input_shape_strs = [x for x in input_shape.split(':')]
for shape_str in input_shape_strs:
input_shapes.extend([[int(x) for x in shape_str.split(',')]])
output_nodes = [x for x in output_node.split(',')]
assert len(input_nodes) == len(input_shapes)
input_graph_def = add_shape_info(input_graph_def, input_nodes,
input_shapes)
with tf.Session() as session:
with session.graph.as_default() as graph:
tf.import_graph_def(input_graph_def, name="")
ops = graph.get_operations()
converter = TFConverter(ops, net_def, dt, device, winograd)
converter.convert(input_nodes, output_nodes)
optimizer = Optimizer(net_def, device)
net_def = optimizer.optimize()
print "Model Converted."
if device == 'gpu':
print "start optimize memory."
mem_optimizer = memory_optimizer.MemoryOptimizer(net_def)
mem_optimizer.optimize()
print "Memory optimization done."
return net_def
......@@ -6,452 +6,521 @@ from dsp_ops import DspOps
from mace.python.tools import graph_util
from mace.python.tools.convert_util import tf_dtype_2_mace_dtype
# converter --input ../libcv/quantized_model.pb \
# --output quantized_model_dsp.pb \
# --runtime dsp --input_node input_node \
# --output_node output_node
padding_mode = {
'NA': 0,
'SAME': 1,
'VALID': 2,
'MIRROR_REFLECT': 3,
'MIRROR_SYMMETRIC': 4,
'SAME_CAFFE': 5
}
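# A minimal sketch (the helper name is hypothetical, not part of this
# module) of how a TensorFlow padding attribute maps onto the enum above;
# get_attr('padding') may return bytes, hence the hedged decode.
def _padding_enum(tf_padding):
    if isinstance(tf_padding, bytes):
        tf_padding = tf_padding.decode('utf-8')
    return padding_mode.get(tf_padding, padding_mode['NA'])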
def get_tensor_name_from_op(op_name, port):
return op_name + ':' + str(port)
def get_node_from_map(op_map, op_or_tensor_name):
op_name = op_or_tensor_name.split(':')[0]
return op_map[op_name]
def get_op_and_port_from_tensor(tensor_name):
op, port = tensor_name.split(':')
port = int(port)
return op, port
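# Illustrative round trip: get_tensor_name_from_op('conv1/Relu', 0) yields
# 'conv1/Relu:0', and get_op_and_port_from_tensor('conv1/Relu:0') recovers
# ('conv1/Relu', 0); the two helpers are inverses of each other.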
def max_elem_size(tensor):
if len(tensor.shape.as_list()) == 0:
return tensor.dtype.size
else:
return reduce(mul, tensor.shape.as_list()) * tensor.dtype.size
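# Worked example: a float32 tensor of shape [1, 224, 224, 3] gives
# 1 * 224 * 224 * 3 elements * 4 bytes each = 602112 bytes.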
def find_dtype(tensor_dtype):
if tensor_dtype == tf.float32:
return mace_pb2.DT_FLOAT
elif tensor_dtype == tf.uint8 or tensor_dtype == tf.quint8:
return mace_pb2.DT_UINT8
elif tensor_dtype == tf.int32 or tensor_dtype == tf.qint32:
return mace_pb2.DT_INT32
else:
raise Exception('Unsupported data type: ', tensor_dtype)
def has_padding_and_strides(op):
return 'padding' in op.node_def.attr and 'strides' in op.node_def.attr
def is_node_flatten_reshape(op):
return op.type == 'Reshape' and len(op.outputs[0].shape) == 1
def get_input_tensor(op, index):
input_tensor = op.inputs[index]
if input_tensor.op.type == 'Reshape':
input_tensor = get_input_tensor(input_tensor.op, 0)
return input_tensor
def add_shape_const_node(net_def, op, values, name):
print('Add const node: ', op.name + '/' + name)
tensor = net_def.tensors.add()
node_name = op.name + '/' + name
tensor.name = node_name + ':0'
tensor.data_type = mace_pb2.DT_INT32
tensor.dims.extend(values)
return tensor.name
def convert_op_outputs(mace_op_def, tf_op):
mace_op_def.output_type.extend(
[tf_dtype_2_mace_dtype(output.dtype) for output in tf_op.outputs])
output_shapes = []
for output in tf_op.outputs:
output_shape = mace_pb2.OutputShape()
output_shape.dims.extend(output.shape.as_list())
output_shapes.append(output_shape)
mace_op_def.output_shape.extend(output_shapes)
def convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops):
first_op = unresolved_ops[0]
print('Op: ', first_op.name, first_op.type, first_op.outputs[0].shape)
if first_op.name in resolved_ops:
pass
elif first_op.type == 'Const':
print('Add const node: ', first_op.name)
tf_tensor = first_op.outputs[0].eval()
tensor = net_def.tensors.add()
tensor.name = first_op.outputs[0].name
tensor.data_type = find_dtype(first_op.outputs[0].dtype)
shape = list(tf_tensor.shape)
if len(shape) > 0:
tensor.dims.extend(shape)
if first_op.outputs[0].dtype == tf.float32:
tensor.float_data.extend(tf_tensor.astype(float).flat)
elif first_op.outputs[0].dtype == tf.int32 or \
first_op.outputs[0].dtype == tf.int8 or \
first_op.outputs[0].dtype == tf.int16 or \
first_op.outputs[0].dtype == tf.quint8 or \
first_op.outputs[0].dtype == tf.quint16:
tensor.int32_data.extend(tf_tensor.astype(int).flat)
else:
op_def = net_def.op.add()
op_def.name = first_op.name
op_def.type = dsp_ops.map_nn_op(first_op.type)
op_def.padding = padding_mode['NA']
if len(first_op.outputs) > 0 and first_op.type == 'Dequantize' \
and len(first_op.outputs[0].consumers()) > 0 \
and (first_op.outputs[0].consumers()[0].type == 'SpaceToBatchND' or
first_op.outputs[0].consumers()[0].type == 'BatchToSpaceND'):
input_tensor = first_op.inputs[0]
min_tensor = first_op.inputs[1]
max_tensor = first_op.inputs[2]
s2b_op = first_op.outputs[0].consumers()[0]
reshape_op = s2b_op.outputs[0].consumers()[0]
min_op = reshape_op.outputs[0].consumers()[0]
max_op = reshape_op.outputs[0].consumers()[1]
quantize_op = min_op.outputs[0].consumers()[0]
resolved_ops.add(s2b_op.name)
resolved_ops.add(reshape_op.name)
resolved_ops.add(min_op.name)
resolved_ops.add(max_op.name)
resolved_ops.add(quantize_op.name)
op_def.name = quantize_op.name
op_def.type = dsp_ops.map_nn_op('Quantized' + s2b_op.type)
op_def.input.append(input_tensor.name)
op_def.input.extend([t.name for t in s2b_op.inputs[1:]])
op_def.input.extend([min_tensor.name, max_tensor.name])
op_def.out_max_byte_size.extend(
[max_elem_size(out) for out in quantize_op.outputs])
convert_op_outputs(op_def, quantize_op)
elif len(first_op.outputs) > 0 and \
first_op.type == 'QuantizedReshape' and \
len(first_op.outputs[0].consumers()) > 0 and \
first_op.outputs[0].consumers()[0].type == 'Dequantize' and \
len(first_op.outputs[0].consumers()[0].outputs[0].consumers()) \
> 0 and \
first_op.outputs[0].consumers()[0].outputs[0].consumers()[0].type \
== 'Softmax':
input_tensor = first_op.inputs[0]
min_tensor = first_op.inputs[2]
max_tensor = first_op.inputs[3]
dequantize_op = first_op.outputs[0].consumers()[0]
softmax_op = dequantize_op.outputs[0].consumers()[0]
reshape_op = softmax_op.outputs[0].consumers()[0]
min_op = reshape_op.outputs[0].consumers()[0]
max_op = reshape_op.outputs[0].consumers()[1]
quantize_op = min_op.outputs[0].consumers()[0]
quantize_reshape_op = quantize_op.outputs[0].consumers()[0]
resolved_ops.add(dequantize_op.name)
resolved_ops.add(softmax_op.name)
resolved_ops.add(reshape_op.name)
resolved_ops.add(min_op.name)
resolved_ops.add(max_op.name)
resolved_ops.add(quantize_op.name)
resolved_ops.add(quantize_reshape_op.name)
op_def.name = quantize_reshape_op.name
op_def.type = dsp_ops.map_nn_op('QuantizedSoftmax')
op_def.input.extend(
[input_tensor.name, min_tensor.name, max_tensor.name])
op_def.out_max_byte_size.extend(
[max_elem_size(out) for out in quantize_reshape_op.outputs])
convert_op_outputs(op_def, quantize_reshape_op)
elif len(first_op.outputs) > 0 and first_op.type == 'Dequantize' and \
len(first_op.outputs[0].consumers()) > 0 and \
first_op.outputs[0].consumers()[0].type == 'Tanh':
input_tensor = first_op.inputs[0]
min_tensor = first_op.inputs[1]
max_tensor = first_op.inputs[2]
tanh_op = first_op.outputs[0].consumers()[0]
# if not last op
resolved_ops.add(tanh_op.name)
if tanh_op.outputs[0].consumers():
reshape_op = tanh_op.outputs[0].consumers()[0]
min_op = reshape_op.outputs[0].consumers()[0]
max_op = reshape_op.outputs[0].consumers()[1]
quantize_op = min_op.outputs[0].consumers()[0]
resolved_ops.add(reshape_op.name)
resolved_ops.add(min_op.name)
resolved_ops.add(max_op.name)
resolved_ops.add(quantize_op.name)
op_def.name = quantize_op.name
op_def.type = dsp_ops.map_nn_op('Quantized' + tanh_op.type)
op_def.input.extend(
[input_tensor.name, min_tensor.name, max_tensor.name])
op_def.out_max_byte_size.extend(
[max_elem_size(out) for out in quantize_op.outputs])
convert_op_outputs(op_def, quantize_op)
# tanh is last op
else:
op_def.name = tanh_op.name + '/QuantizedTanh'
op_def.type = dsp_ops.map_nn_op('Quantized' + tanh_op.type)
op_def.input.extend(
[input_tensor.name, min_tensor.name, max_tensor.name])
op_def.out_max_byte_size.extend([
max_elem_size(input_tensor),
max_elem_size(min_tensor),
max_elem_size(max_tensor)
])
op_def.output_type.extend(
[mace_pb2.DT_UINT8, mace_pb2.DT_FLOAT, mace_pb2.DT_FLOAT])
output_shapes = []
for output in first_op.inputs:
output_shape = mace_pb2.OutputShape()
output_shape.dims.extend(output.shape.as_list())
output_shapes.append(output_shape)
op_def.output_shape.extend(output_shapes)
new_tanh_op_def = net_def.op.add()
new_tanh_op_def.name = tanh_op.name
new_tanh_op_def.type = dsp_ops.map_nn_op('Dequantize')
new_tanh_op_def.input.extend([
get_tensor_name_from_op(op_def.name, 0),
get_tensor_name_from_op(op_def.name, 1),
get_tensor_name_from_op(op_def.name, 2)
])
new_tanh_op_def.out_max_byte_size.extend(
[max_elem_size(tanh_op.outputs[0])])
convert_op_outputs(new_tanh_op_def, tanh_op)
elif has_padding_and_strides(first_op):
op_def.padding = padding_mode[first_op.get_attr('padding')]
op_def.input.extend([t.name for t in first_op.inputs])
if 'ksize' in first_op.node_def.attr:
ksize = first_op.get_attr('ksize')
ksize_tensor = add_shape_const_node(net_def, first_op, ksize,
'ksize')
op_def.input.extend([ksize_tensor])
strides = first_op.get_attr('strides')
strides_tensor = add_shape_const_node(net_def, first_op, strides,
'strides')
op_def.input.extend([strides_tensor])
op_def.out_max_byte_size.extend(
[max_elem_size(out) for out in first_op.outputs])
convert_op_outputs(op_def, first_op)
elif is_node_flatten_reshape(first_op):
op_def.type = 'Flatten'
op_def.input.extend([t.name for t in first_op.inputs])
op_def.out_max_byte_size.extend(
[max_elem_size(out) for out in first_op.outputs])
convert_op_outputs(op_def, first_op)
elif dsp_ops.has_op(first_op.type):
op_def.input.extend([t.name for t in first_op.inputs])
op_def.out_max_byte_size.extend(
[max_elem_size(out) for out in first_op.outputs])
convert_op_outputs(op_def, first_op)
else:
raise Exception('Unsupported op: ', first_op)
resolved_ops.add(first_op.name)
del unresolved_ops[0]
def add_output_node(net_def, output_node):
op_def = net_def.op.add()
op_def.name = '__output__'
op_def.type = 'OUTPUT'
op_def.input.extend([get_tensor_name_from_op(output_node, 0)])
def reverse_batch_to_space_and_biasadd(net_def):
tensor_map = {}
for tensor in net_def.tensors:
tensor_map[tensor.name] = tensor
op_map = {}
for op in net_def.op:
op_map[op.name] = op
consumers = {}
for op in net_def.op:
for ipt in op.input:
if ipt not in consumers:
consumers[ipt] = []
consumers[ipt].append(op)
new_ops = []
skip_ops = set()
visited_ops = set()
for op in net_def.op:
if op.name in visited_ops:
pass
# pattern: QConv -> RR -> R -> QB2S -> QBiasAdd -> RR -> R
success = False
if op.type == 'Requantize_32to8':
biasadd_requantize_op = op
biasadd_op = get_node_from_map(op_map,
biasadd_requantize_op.input[0])
if biasadd_op.type == 'QuantizedBiasAdd_8p8to32':
b2s_op = get_node_from_map(op_map, biasadd_op.input[0])
if b2s_op.type == 'QuantizedBatchToSpaceND_8':
conv_requantize_op = get_node_from_map(
op_map, b2s_op.input[0])
conv_op = get_node_from_map(op_map,
conv_requantize_op.input[0])
if conv_op.type == 'QuantizedConv2d_8x8to32':
new_biasadd_op = mace_pb2.OperatorDef()
new_biasadd_op.CopyFrom(biasadd_op)
new_biasadd_op.input[0] = get_tensor_name_from_op(
conv_requantize_op.name, 0)
new_biasadd_op.input[2] = get_tensor_name_from_op(
conv_requantize_op.name, 1)
new_biasadd_op.input[3] = get_tensor_name_from_op(
conv_requantize_op.name, 2)
new_biasadd_op.out_max_byte_size[
0] = conv_requantize_op.out_max_byte_size[0] * 4
new_biasadd_requantize_op = mace_pb2.OperatorDef()
new_biasadd_requantize_op.CopyFrom(
biasadd_requantize_op)
new_biasadd_requantize_op.out_max_byte_size[
0] = new_biasadd_op.out_max_byte_size[0] / 4
new_b2s_op = mace_pb2.OperatorDef()
new_b2s_op.CopyFrom(b2s_op)
new_b2s_op.input[0] = get_tensor_name_from_op(
biasadd_requantize_op.name, 0)
new_b2s_op.input[3] = get_tensor_name_from_op(
biasadd_requantize_op.name, 1)
new_b2s_op.input[4] = get_tensor_name_from_op(
biasadd_requantize_op.name, 2)
new_ops.extend([
new_biasadd_op, new_biasadd_requantize_op,
new_b2s_op
])
skip_ops = skip_ops.union([
biasadd_op.name, biasadd_requantize_op.name,
b2s_op.name
])
visited_ops.add(op.name)
follow_ops = consumers[get_tensor_name_from_op(
biasadd_requantize_op.name, 0)]
for follow_op in follow_ops:
new_follow_op = mace_pb2.OperatorDef()
new_follow_op.CopyFrom(follow_op)
for i in xrange(len(follow_op.input)):
for k in xrange(3):
if new_follow_op.input[
i] == get_tensor_name_from_op(
biasadd_requantize_op.name, k):
new_follow_op.input[
i] = get_tensor_name_from_op(
b2s_op.name, k)
new_ops.append(new_follow_op)
skip_ops.add(follow_op.name)
visited_ops.add(follow_op.name)
visited_ops.add(op.name)
new_net_def = mace_pb2.NetDef()
new_net_def.tensors.extend(tensor_map.values())
new_net_def.op.extend([op for op in net_def.op if op.name not in skip_ops])
new_net_def.op.extend(new_ops)
return new_net_def
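# Editor's note on the rewrite above: the matched chain
#   QuantizedConv2d -> Requantize -> QuantizedBatchToSpaceND
#     -> QuantizedBiasAdd -> Requantize
# is reordered so that BiasAdd consumes the conv's requantized output
# directly and BatchToSpaceND runs last, with downstream consumers rewired
# from the BiasAdd requantize outputs to the new BatchToSpaceND outputs.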
def add_node_id(net_def):
node_id_counter = 0
node_id_map = {}
for tensor in net_def.tensors:
tensor.node_id = node_id_counter
node_id_counter += 1
tensor_op, port = get_op_and_port_from_tensor(tensor.name)
node_id_map[tensor_op] = tensor.node_id
for op in net_def.op:
op.node_id = node_id_counter
node_id_counter += 1
node_id_map[op.name] = op.node_id
for ipt in op.input:
op_name, port = get_op_and_port_from_tensor(ipt)
node_id = node_id_map[op_name]
node_input = op.node_input.add()
node_input.node_id = node_id
node_input.output_port = int(port)
return net_def
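# For example (illustrative names), a const tensor 'weights:0' is assigned
# the next free node_id, and an op consuming 'weights:0' records a
# node_input carrying that node_id and output_port 0, so the graph
# references producers by id rather than by tensor name.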
def add_input_output_info(net_def, input_node, output_node, graph, dtype):
input_tensor = graph.get_tensor_by_name(
get_tensor_name_from_op(input_node, 0))
output_tensor = graph.get_tensor_by_name(
get_tensor_name_from_op(output_node, 0))
input_info = net_def.input_info.add()
input_info.dims.extend(input_tensor.shape.as_list())
input_info.data_type = dtype
if dtype == mace_pb2.DT_UINT8:
for i in xrange(2):
input_info = net_def.input_info.add()
input_info.dims.extend([1, 1, 1, 1])
input_info.data_type = mace_pb2.DT_FLOAT
output_info = net_def.output_info.add()
output_info.dims.extend(output_tensor.shape.as_list())
output_info.data_type = dtype
if dtype == mace_pb2.DT_UINT8:
for i in xrange(2):
output_info = net_def.output_info.add()
output_info.dims.extend([1, 1, 1, 1])
output_info.data_type = mace_pb2.DT_FLOAT
return net_def
def fuse_quantize(net_def, input_node, output_node):
tensor_map = {}
for tensor in net_def.tensors:
tensor_map[tensor.name] = tensor
op_map = {}
for op in net_def.op:
op_map[op.name] = op
consumers = {}
for op in net_def.op:
for ipt in op.input:
if ipt not in consumers:
consumers[ipt] = []
consumers[ipt].append(op)
skip_ops = set()
new_ops = []
skip_tensors = set()
# INPUT->Flatten->Minf, Maxf->Quantize
for op in net_def.op:
if op.type == 'INPUT':
input_op = op
flatten_op = None
quantize_op = None
for o in consumers[get_tensor_name_from_op(input_op.name, 0)]:
if o.type == 'Flatten':
flatten_op = o
elif o.type == 'Quantize':
quantize_op = o
if quantize_op is not None:
minf_op, maxf_op = consumers[get_tensor_name_from_op(
flatten_op.name, 0)]
skip_ops = skip_ops.union(
[flatten_op.name, minf_op.name, maxf_op.name])
skip_tensors = skip_tensors.union(
[flatten_op.input[1], minf_op.input[1], maxf_op.input[1]])
quantize_op.type = 'AutoQuantize'
del quantize_op.input[1:]
new_net_def = mace_pb2.NetDef()
new_net_def.tensors.extend([
tensor for tensor in net_def.tensors if tensor.name not in skip_tensors
])
new_net_def.op.extend([op for op in net_def.op if op.name not in skip_ops])
new_net_def.op.extend(new_ops)
return new_net_def
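# Editor's note: the runtime range computation INPUT -> Flatten -> (Min,
# Max) -> Quantize is replaced above by a single 'AutoQuantize' op that
# keeps only the input tensor, letting the Flatten and Min/Max ops and
# their const inputs be dropped from the graph.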
def convert_to_mace_pb(model_file, input_node, output_node, dsp_mode):
"""
"""
nnlib does not have batch norm, so use tensorflow optimizer to fold
batch norm with convolution. The fold optimization reorders ops, so
we sort ops first by topology.
"""
input_graph_def = tf.GraphDef()
with gfile.Open(model_file, "rb") as f:
data = f.read()
input_graph_def.ParseFromString(data)
input_graph_def = graph_util.sort_tf_graph(input_graph_def)
net_def = mace_pb2.NetDef()
with tf.Session() as session:
with session.graph.as_default() as graph:
tf.import_graph_def(input_graph_def, name="")
ops = graph.get_operations()
dsp_ops = DspOps()
resolved_ops = set()
# convert const node
unresolved_ops = [op for op in ops if op.type == 'Const']
while len(unresolved_ops) > 0:
convert_ops(unresolved_ops, resolved_ops, net_def, output_node,
dsp_ops)
# convert op node
unresolved_ops = [op for op in ops if op.type != 'Const']
while len(unresolved_ops) > 0:
convert_ops(unresolved_ops, resolved_ops, net_def, output_node,
dsp_ops)
add_output_node(net_def, output_node)
net_def = reverse_batch_to_space_and_biasadd(net_def)
net_def = fuse_quantize(net_def, input_node, output_node)
sorted_net_def = graph_util.sort_mace_graph(net_def, '__output__')
net_def_with_node_id = add_node_id(sorted_net_def)
dtype = mace_pb2.DT_FLOAT
final_net_def = add_input_output_info(
net_def_with_node_id, input_node, output_node, graph, dtype)
arg = final_net_def.arg.add()
arg.name = 'dsp_mode'
arg.i = dsp_mode
return final_net_def
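# Illustrative usage, mirroring the converter comment at the top of this
# file (paths, node names and dsp_mode value are placeholders):
#
#     net_def = convert_to_mace_pb('quantized_model.pb', 'input_node',
#                                  'output_node', 0)
#     with open('quantized_model_dsp.pb', 'wb') as f:
#         f.write(net_def.SerializeToString())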
......@@ -10,148 +10,174 @@ from tensorflow import gfile
FLAGS = None
def hist_inc(hist, key):
if key in hist:
hist[key] += 1
else:
hist[key] = 1
def to_int_list(long_list):
int_list = []
for value in long_list:
int_list.append(int(value))
return int_list
def main(unused_args):
if not FLAGS.input or not gfile.Exists(FLAGS.input):
print('Input graph file ' + FLAGS.input + ' does not exist!')
return -1
input_graph_def = tf.GraphDef()
with gfile.Open(FLAGS.input, 'rb') as f:
data = f.read()
input_graph_def.ParseFromString(data)
with tf.Session() as session:
with session.graph.as_default() as graph:
tf.import_graph_def(input_graph_def, name='')
stats = {}
ops = graph.get_operations()
# extract kernel size for conv_2d
tensor_shapes = {}
tensor_values = {}
print("=========================consts============================")
for op in ops:
if op.type == 'Const':
for output in op.outputs:
tensor_name = output.name
tensor = output.eval()
tensor_shape = list(tensor.shape)
tensor_shapes[tensor_name] = tensor_shape
print("Const %s: %s, %d" %
(tensor_name, tensor_shape,
functools.reduce(operator.mul, tensor_shape, 1)))
if len(tensor_shape) == 1 and tensor_shape[0] < 10:
tensor_values[tensor_name] = list(tensor)
print("=========================ops============================")
for op in ops:
if op.type in ['Conv2D']:
padding = op.get_attr('padding')
strides = to_int_list(op.get_attr('strides'))
data_format = op.get_attr('data_format')
ksize = 'Unknown'
for input in op.inputs:
input_name = input.name
if input_name.endswith('weights/read:0'):
ksize = input.shape.as_list()
break
if input_name.endswith(
'weights:0') and input_name in tensor_shapes:
ksize = tensor_shapes[input_name]
break
print(
'%s(padding=%s, strides=%s, ksize=%s, format=%s) %s => %s'
% (op.type, padding, strides, ksize, data_format,
op.inputs[0].shape, op.outputs[0].shape))
key = '%s(padding=%s, strides=%s, ksize=%s, format=%s)' % (
op.type, padding, strides, ksize, data_format)
hist_inc(stats, key)
elif op.type in ['FusedResizeAndPadConv2D']:
padding = op.get_attr('padding')
strides = to_int_list(op.get_attr('strides'))
resize_align_corners = op.get_attr('resize_align_corners')
ksize = 'Unknown'
for input in op.inputs:
input_name = input.name
if input_name.endswith(
'weights:0') and input_name in tensor_shapes:
ksize = tensor_shapes[input_name]
break
key = '%s(padding=%s, strides=%s, ksize=%s, ' \
'resize_align_corners=%s)' % (op.type, padding, strides,
ksize, resize_align_corners)
hist_inc(stats, key)
elif op.type in ['ResizeBilinear']:
align_corners = op.get_attr('align_corners')
size = 'Unknown'
for input in op.inputs:
input_name = input.name
if input_name.endswith(
'size:0') and input_name in tensor_values:
size = tensor_values[input_name]
break
key = '%s(size=%s, align_corners=%s)' % (op.type, size,
align_corners)
print(key)
hist_inc(stats, key)
elif op.type in ['AvgPool', 'MaxPool']:
padding = op.get_attr('padding')
strides = to_int_list(op.get_attr('strides'))
ksize = to_int_list(op.get_attr('ksize'))
data_format = op.get_attr('data_format')
key = '%s(padding=%s, strides=%s, ksize=%s)' % (op.type,
padding,
strides, ksize)
hist_inc(stats, key)
elif op.type in ['SpaceToBatchND', 'BatchToSpaceND']:
block_shape = 'Unknown'
for input in op.inputs:
input_name = input.name
if input_name.endswith(
'block_shape:0') and input_name in tensor_values:
block_shape = tensor_values[input_name]
break
paddings = 'Unknown'
for input in op.inputs:
input_name = input.name
if input_name.endswith(
'paddings:0') and input_name in tensor_values:
paddings = tensor_values[input_name]
break
crops = 'Unknown'
for input in op.inputs:
input_name = input.name
if input_name.endswith(
'crops:0') and input_name in tensor_values:
                            crops = tensor_values[input_name]
break
if op.type == 'SpaceToBatchND':
key = '%s(block_shape=%s, paddings=%s)' % (op.type,
block_shape,
paddings)
else:
key = '%s(block_shape=%s, crops=%s)' % (op.type,
block_shape, crops)
print(key)
hist_inc(stats, key)
elif op.type == 'Pad':
paddings = 'Unknown'
for input in op.inputs:
input_name = input.name
if input_name.endswith(
'paddings:0') and input_name in tensor_values:
paddings = tensor_values[input_name]
break
key = '%s(paddings=%s)' % (op.type, paddings)
hist_inc(stats, key)
else:
hist_inc(stats, op.type)
print("=========================stats============================")
for key, value in sorted(six.iteritems(stats)):
print('%s: %d' % (key, value))
def parse_args():
'''Parses command line arguments.'''
parser = argparse.ArgumentParser()
parser.add_argument(
'--input',
type=str,
default='',
help='TensorFlow \'GraphDef\' file to load.')
return parser.parse_known_args()
if __name__ == '__main__':
FLAGS, unparsed = parse_args()
main(unused_args=[sys.argv[0]] + unparsed)
......@@ -7,7 +7,6 @@
# --target=//mace/ops:ops_test
# --stdout_processor=stdout_processor
import argparse
import random
import re
......@@ -15,104 +14,113 @@ import sys
import sh_commands
def stdout_processor(stdout, device_properties, abi):
pass
def ops_test_stdout_processor(stdout, device_properties, abi):
stdout_lines = stdout.split("\n")
for line in stdout_lines:
if "Aborted" in line or "FAILED" in line:
raise Exception("Command failed")
def ops_benchmark_stdout_processor(stdout, device_properties, abi):
stdout_lines = stdout.split("\n")
metrics = {}
for line in stdout_lines:
if "Aborted" in line:
raise Exception("Command failed")
line = line.strip()
parts = line.split()
if len(parts) == 5 and parts[0].startswith("BM_"):
metrics["%s.time_ms" % parts[0]] = str(float(parts[1]) / 1e6)
metrics["%s.input_mb_per_sec" % parts[0]] = parts[3]
metrics["%s.gmacc_per_sec" % parts[0]] = parts[4]
platform = device_properties["ro.board.platform"].replace(" ", "-")
model = device_properties["ro.product.model"].replace(" ", "-")
tags = {
"ro.board.platform": platform,
"ro.product.model": model,
"abi": abi
}
sh_commands.falcon_push_metrics(
metrics, tags=tags, endpoint="mace_ops_benchmark")
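# Example of a benchmark line this parser accepts (illustrative numbers):
#   BM_CONV_2D_TEST 2000000 350 125.4 20.1
# parts[1] is taken as per-iteration time in nanoseconds, so time_ms
# becomes '2.0'; parts[3] and parts[4] are forwarded as the MB/s and
# GMACC/s metrics.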
def parse_args():
"""Parses command line arguments."""
parser = argparse.ArgumentParser()
parser.add_argument(
"--target_abis",
type=str,
default="armeabi-v7a",
help="Target ABIs, comma seperated list")
parser.add_argument(
"--target_socs",
type=str,
default="all",
help="SoCs(ro.board.platform) to build, comma seperated list or all/random")
parser.add_argument(
"--target",
type=str,
default="//...",
help="Bazel target to build")
parser.add_argument(
"--run_target",
type=bool,
default=False,
help="Whether to run the target")
parser.add_argument(
"--args",
type=str,
default="",
help="Command args")
parser.add_argument(
"--stdout_processor",
type=str,
default="stdout_processor",
help="Stdout processing function, default: stdout_processor")
return parser.parse_known_args()
"""Parses command line arguments."""
parser = argparse.ArgumentParser()
parser.add_argument(
"--target_abis",
type=str,
default="armeabi-v7a",
help="Target ABIs, comma seperated list")
parser.add_argument(
"--target_socs",
type=str,
default="all",
help="SoCs (ro.board.platform from getprop) to build, "
"comma seperated list or all/random")
parser.add_argument(
"--target", type=str, default="//...", help="Bazel target to build")
parser.add_argument(
"--run_target",
type=bool,
default=False,
help="Whether to run the target")
parser.add_argument("--args", type=str, default="", help="Command args")
parser.add_argument(
"--stdout_processor",
type=str,
default="stdout_processor",
help="Stdout processing function, default: stdout_processor")
return parser.parse_known_args()
def main(unused_args):
target_socs = None
if FLAGS.target_socs != "all" and FLAGS.target_socs != "random":
target_socs = set(FLAGS.target_socs.split(','))
target_devices = sh_commands.adb_devices(target_socs=target_socs)
if FLAGS.target_socs == "random":
target_devices = [random.choice(target_devices)]
target = FLAGS.target
host_bin_path, bin_name = sh_commands.bazel_target_to_bin(target)
target_abis = FLAGS.target_abis.split(',')
# generate sources
sh_commands.gen_encrypted_opencl_source()
sh_commands.gen_compiled_opencl_source()
sh_commands.gen_mace_version()
for target_abi in target_abis:
sh_commands.bazel_build(target, abi=target_abi)
if FLAGS.run_target:
for serialno in target_devices:
if target_abi not in set(sh_commands.adb_supported_abis(serialno)):
print("Skip device %s which does not support ABI %s" % (serialno, target_abi))
continue
stdouts = sh_commands.adb_run(serialno, host_bin_path, bin_name,
args=FLAGS.args,
opencl_profiling=1,
vlog_level=0,
device_bin_path="/data/local/tmp/mace",
out_of_range_check=1)
device_properties = sh_commands.adb_getprop_by_serialno(serialno)
globals()[FLAGS.stdout_processor](stdouts, device_properties, target_abi)
target_socs = None
if FLAGS.target_socs != "all" and FLAGS.target_socs != "random":
target_socs = set(FLAGS.target_socs.split(','))
target_devices = sh_commands.adb_devices(target_socs=target_socs)
if FLAGS.target_socs == "random":
target_devices = [random.choice(target_devices)]
target = FLAGS.target
host_bin_path, bin_name = sh_commands.bazel_target_to_bin(target)
target_abis = FLAGS.target_abis.split(',')
# generate sources
sh_commands.gen_encrypted_opencl_source()
sh_commands.gen_compiled_opencl_source()
sh_commands.gen_mace_version()
for target_abi in target_abis:
sh_commands.bazel_build(target, abi=target_abi)
if FLAGS.run_target:
for serialno in target_devices:
if target_abi not in set(
sh_commands.adb_supported_abis(serialno)):
print("Skip device %s which does not support ABI %s" %
(serialno, target_abi))
continue
stdouts = sh_commands.adb_run(
serialno,
host_bin_path,
bin_name,
args=FLAGS.args,
opencl_profiling=1,
vlog_level=0,
device_bin_path="/data/local/tmp/mace",
out_of_range_check=1)
device_properties = sh_commands.adb_getprop_by_serialno(
serialno)
globals()[FLAGS.stdout_processor](stdouts, device_properties,
target_abi)
if __name__ == "__main__":
FLAGS, unparsed = parse_args()
main(unused_args=[sys.argv[0]] + unparsed)
FLAGS, unparsed = parse_args()
main(unused_args=[sys.argv[0]] + unparsed)
#-*- coding:utf8 -*-
import json
import socket
import itertools
import json, socket, itertools
class FalconCli(object):
def __init__(self, addr, debug=True, buf_size=1000):
self.socket_ = socket.create_connection(addr)
self.stream = self.socket_.makefile()
......@@ -16,16 +16,19 @@ class FalconCli(object):
self.stream.close()
@classmethod
def connect(cls, server="transfer.falcon.miliao.srv", port=8433, debug=True, buf_size=1000):
def connect(cls,
server="transfer.falcon.miliao.srv",
port=8433,
debug=True,
buf_size=1000):
try:
return FalconCli((server, port), debug, buf_size)
except socket.error, exc:
print "error: connect to %s:%s error: %s" %(server, port, exc)
print "error: connect to %s:%s error: %s" % (server, port, exc)
def call(self, name, *params):
request = dict(id=next(self.id_counter),
params=list(params),
method=name)
request = dict(
id=next(self.id_counter), params=list(params), method=name)
payload = json.dumps(request).encode()
if self.debug:
print "--> req:", payload
......@@ -49,7 +52,7 @@ class FalconCli(object):
resp = []
while True:
buf = lines[s:s+self.buf_size]
buf = lines[s:s + self.buf_size]
s = s + self.buf_size
if len(buf) == 0:
break
......@@ -57,4 +60,3 @@ class FalconCli(object):
resp.append(r)
return resp
......@@ -11,45 +11,40 @@ import re
# --input_file input_file
#
def generate_data(name, shape):
np.random.seed()
data = np.random.random(shape) * 2 - 1
input_file_name = FLAGS.input_file + "_" + re.sub('[^0-9a-zA-Z]+', '_', name)
print 'Generate input file: ', input_file_name
data.astype(np.float32).tofile(input_file_name)
np.random.seed()
data = np.random.random(shape) * 2 - 1
input_file_name = FLAGS.input_file + "_" + re.sub('[^0-9a-zA-Z]+', '_',
name)
print 'Generate input file: ', input_file_name
data.astype(np.float32).tofile(input_file_name)
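As a concrete illustration (the node name is hypothetical), an input node
called "input:0" combined with --input_file=model_input yields the file
model_input_input_0:

import re

name = "input:0"  # hypothetical node name
assert re.sub('[^0-9a-zA-Z]+', '_', name) == 'input_0'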
def main(unused_args):
input_names = [name for name in FLAGS.input_node.split(',')]
input_shapes = [shape for shape in FLAGS.input_shape.split(':')]
assert len(input_names) == len(input_shapes)
for i in range(len(input_names)):
shape = [int(x) for x in input_shapes[i].split(',')]
generate_data(input_names[i], shape)
print "Generate input file done."
input_names = [name for name in FLAGS.input_node.split(',')]
input_shapes = [shape for shape in FLAGS.input_shape.split(':')]
assert len(input_names) == len(input_shapes)
for i in range(len(input_names)):
shape = [int(x) for x in input_shapes[i].split(',')]
generate_data(input_names[i], shape)
print "Generate input file done."
def parse_args():
"""Parses command line arguments."""
parser = argparse.ArgumentParser()
parser.register("type", "bool", lambda v: v.lower() == "true")
parser.add_argument(
"--input_file",
type=str,
default="",
help="input file.")
parser.add_argument(
"--input_node",
type=str,
default="input_node",
help="input node")
parser.add_argument(
"--input_shape",
type=str,
default="1,64,64,3",
help="input shape.")
return parser.parse_known_args()
"""Parses command line arguments."""
parser = argparse.ArgumentParser()
parser.register("type", "bool", lambda v: v.lower() == "true")
parser.add_argument(
"--input_file", type=str, default="", help="input file.")
parser.add_argument(
"--input_node", type=str, default="input_node", help="input node")
parser.add_argument(
"--input_shape", type=str, default="1,64,64,3", help="input shape.")
return parser.parse_known_args()
if __name__ == '__main__':
FLAGS, unparsed = parse_args()
main(unused_args=[sys.argv[0]] + unparsed)
if __name__ == '__main__':
FLAGS, unparsed = parse_args()
main(unused_args=[sys.argv[0]] + unparsed)
......@@ -23,124 +23,135 @@ from ConfigParser import ConfigParser
def run_command(command):
print("Run command: {}".format(command))
result = subprocess.Popen(
command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
out, err = result.communicate()
print("Run command: {}".format(command))
result = subprocess.Popen(
command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
out, err = result.communicate()
if out:
print("Stdout msg:\n{}".format(out))
if err:
print("Stderr msg:\n{}".format(err))
if out:
print("Stdout msg:\n{}".format(out))
if err:
print("Stderr msg:\n{}".format(err))
if result.returncode != 0:
raise Exception("Exit not 0 from bash with code: {}, command: {}".format(
result.returncode, command))
if result.returncode != 0:
raise Exception(
"Exit not 0 from bash with code: {}, command: {}".format(
result.returncode, command))
def get_global_runtime(configs):
runtime_list = []
for model_name in configs["models"]:
model_runtime = configs["models"][model_name]["runtime"]
runtime_list.append(model_runtime.lower())
global_runtime = ""
if "dsp" in runtime_list:
global_runtime = "dsp"
elif "gpu" in runtime_list:
global_runtime = "gpu"
elif "cpu" in runtime_list:
global_runtime = "cpu"
elif "neon" in runtime_list:
global_runtime = "neon"
else:
raise Exception("Not found available RUNTIME in config files!")
return global_runtime
runtime_list = []
for model_name in configs["models"]:
model_runtime = configs["models"][model_name]["runtime"]
runtime_list.append(model_runtime.lower())
global_runtime = ""
if "dsp" in runtime_list:
global_runtime = "dsp"
elif "gpu" in runtime_list:
global_runtime = "gpu"
elif "cpu" in runtime_list:
global_runtime = "cpu"
elif "neon" in runtime_list:
global_runtime = "neon"
else:
raise Exception("Not found available RUNTIME in config files!")
return global_runtime
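get_global_runtime expects the parsed YAML config to carry a "models" mapping
with a per-model "runtime". A minimal hand-written sketch (model names and
runtime values are hypothetical) of the dsp > gpu > cpu > neon precedence
implemented above:

configs = {
    "models": {
        "mobilenet": {"runtime": "gpu"},   # hypothetical entry
        "speech_net": {"runtime": "dsp"},  # hypothetical entry
    }
}
# With both entries present, the chain above resolves to "dsp".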
def generate_version_code():
command = "bash tools/generate_version_code.sh"
run_command(command)
command = "bash tools/generate_version_code.sh"
run_command(command)
def generate_opencl_source_code():
command = "bash tools/generate_opencl_code.sh source"
run_command(command)
command = "bash tools/generate_opencl_code.sh source"
run_command(command)
def generate_opencl_binay_code(target_soc, model_output_dirs, pull_or_not):
cl_bin_dirs = []
for d in model_output_dirs:
cl_bin_dirs.append(os.path.join(d, "opencl_bin"))
cl_bin_dirs_str = ",".join(cl_bin_dirs)
if not cl_bin_dirs:
command = "bash tools/generate_opencl_code.sh binary"
else:
command = "bash tools/generate_opencl_code.sh {} {} {} {}".format(
'binary', target_soc, cl_bin_dirs_str, int(pull_or_not))
run_command(command)
cl_bin_dirs = []
for d in model_output_dirs:
cl_bin_dirs.append(os.path.join(d, "opencl_bin"))
cl_bin_dirs_str = ",".join(cl_bin_dirs)
if not cl_bin_dirs:
command = "bash tools/generate_opencl_code.sh binary"
else:
command = "bash tools/generate_opencl_code.sh {} {} {} {}".format(
'binary', target_soc, cl_bin_dirs_str, int(pull_or_not))
run_command(command)
def generate_tuning_param_code(target_soc, model_output_dirs, pull_or_not):
cl_bin_dirs = []
for d in model_output_dirs:
cl_bin_dirs.append(os.path.join(d, "opencl_bin"))
cl_bin_dirs_str = ",".join(cl_bin_dirs)
if not cl_bin_dirs:
command = "bash tools/generate_tuning_param_code.sh"
else:
command = "bash tools/generate_tuning_param_code.sh {} {} {}".format(
target_soc, cl_bin_dirs_str, int(pull_or_not))
run_command(command)
cl_bin_dirs = []
for d in model_output_dirs:
cl_bin_dirs.append(os.path.join(d, "opencl_bin"))
cl_bin_dirs_str = ",".join(cl_bin_dirs)
if not cl_bin_dirs:
command = "bash tools/generate_tuning_param_code.sh"
else:
command = "bash tools/generate_tuning_param_code.sh {} {} {}".format(
target_soc, cl_bin_dirs_str, int(pull_or_not))
run_command(command)
def generate_code(target_soc, model_output_dirs, pull_or_not):
generate_opencl_binay_code(target_soc, model_output_dirs, pull_or_not)
generate_tuning_param_code(target_soc, model_output_dirs, pull_or_not)
generate_opencl_binay_code(target_soc, model_output_dirs, pull_or_not)
generate_tuning_param_code(target_soc, model_output_dirs, pull_or_not)
def clear_env(target_soc):
command = "bash tools/clear_env.sh {}".format(target_soc)
run_command(command)
command = "bash tools/clear_env.sh {}".format(target_soc)
run_command(command)
def input_file_name(input_name):
return os.environ['INPUT_FILE_NAME'] + '_' + \
re.sub('[^0-9a-zA-Z]+', '_', input_name)
def generate_random_input(target_soc, model_output_dir,
input_names, input_files):
generate_data_or_not = True
command = "bash tools/validate_tools.sh {} {} {}".format(
target_soc, model_output_dir, int(generate_data_or_not))
run_command(command)
input_file_list = []
if isinstance(input_files, list):
input_file_list.extend(input_files)
else:
input_file_list.append(input_files)
if len(input_file_list) != 0:
input_name_list = []
if isinstance(input_names, list):
input_name_list.extend(input_names)
return os.environ['INPUT_FILE_NAME'] + '_' + \
re.sub('[^0-9a-zA-Z]+', '_', input_name)
def generate_random_input(target_soc, model_output_dir, input_names,
input_files):
generate_data_or_not = True
command = "bash tools/validate_tools.sh {} {} {}".format(
target_soc, model_output_dir, int(generate_data_or_not))
run_command(command)
input_file_list = []
if isinstance(input_files, list):
input_file_list.extend(input_files)
else:
input_name_list.append(input_names)
if len(input_file_list) != len(input_name_list):
      raise Exception('If input_files is set, the input files should match the input names.')
for i in range(len(input_file_list)):
if input_file_list[i] is not None:
dst_input_file = model_output_dir + '/' + input_file_name(input_name_list[i])
if input_file_list[i].startswith("http://") or \
input_file_list[i].startswith("https://"):
urllib.urlretrieve(input_file_list[i], dst_input_file)
input_file_list.append(input_files)
if len(input_file_list) != 0:
input_name_list = []
if isinstance(input_names, list):
input_name_list.extend(input_names)
else:
shutil.copy(input_file_list[i], dst_input_file)
input_name_list.append(input_names)
if len(input_file_list) != len(input_name_list):
            raise Exception('If input_files is set, the input files '
                            'should match the input names.')
for i in range(len(input_file_list)):
if input_file_list[i] is not None:
dst_input_file = model_output_dir + '/' + input_file_name(
input_name_list[i])
if input_file_list[i].startswith("http://") or \
input_file_list[i].startswith("https://"):
urllib.urlretrieve(input_file_list[i], dst_input_file)
else:
shutil.copy(input_file_list[i], dst_input_file)
def generate_model_code():
command = "bash tools/generate_model_code.sh"
run_command(command)
command = "bash tools/generate_model_code.sh"
run_command(command)
def build_mace_run(production_mode, model_output_dir, hexagon_mode):
command = "bash tools/build_mace_run.sh {} {} {}".format(
int(production_mode), model_output_dir, int(hexagon_mode))
run_command(command)
command = "bash tools/build_mace_run.sh {} {} {}".format(
int(production_mode), model_output_dir, int(hexagon_mode))
run_command(command)
def tuning_run(model_name,
......@@ -152,301 +163,328 @@ def tuning_run(model_name,
tuning,
restart_round,
option_args=''):
# TODO(yejianwu) refactoring the hackish code
stdout_buff = []
process_output = sh_commands.make_output_processor(stdout_buff)
p = sh.bash("tools/tuning_run.sh", target_soc, model_output_dir,
running_round, int(tuning),
restart_round, option_args, _out=process_output,
_bg=True, _err_to_out=True)
p.wait()
metrics = {}
for line in stdout_buff:
line = line.strip()
parts = line.split()
if len(parts) == 6 and parts[0].startswith("time"):
metrics["%s.create_net_ms" % model_name] = str(float(parts[1]))
metrics["%s.mace_engine_ctor_ms" % model_name] = str(float(parts[2]))
metrics["%s.init_ms" % model_name] = str(float(parts[3]))
metrics["%s.warmup_ms" % model_name] = str(float(parts[4]))
if float(parts[5]) > 0:
metrics["%s.avg_latency_ms" % model_name] = str(float(parts[5]))
tags = {"ro.board.platform": target_soc,
"abi": target_abi,
# "runtime": target_runtime, # TODO(yejianwu) Add the actual runtime
"round": running_round, # TODO(yejianwu) change this to source/binary
"tuning": tuning}
sh_commands.falcon_push_metrics(metrics, endpoint="mace_model_benchmark",
tags=tags)
# TODO(yejianwu) refactoring the hackish code
stdout_buff = []
process_output = sh_commands.make_output_processor(stdout_buff)
p = sh.bash(
"tools/tuning_run.sh",
target_soc,
model_output_dir,
running_round,
int(tuning),
restart_round,
option_args,
_out=process_output,
_bg=True,
_err_to_out=True)
p.wait()
metrics = {}
for line in stdout_buff:
line = line.strip()
parts = line.split()
if len(parts) == 6 and parts[0].startswith("time"):
metrics["%s.create_net_ms" % model_name] = str(float(parts[1]))
metrics["%s.mace_engine_ctor_ms" % model_name] = str(
float(parts[2]))
metrics["%s.init_ms" % model_name] = str(float(parts[3]))
metrics["%s.warmup_ms" % model_name] = str(float(parts[4]))
if float(parts[5]) > 0:
metrics["%s.avg_latency_ms" % model_name] = str(
float(parts[5]))
tags = {
"ro.board.platform": target_soc,
"abi": target_abi,
# "runtime": target_runtime, # TODO(yejianwu) Add the actual runtime
"round": running_round, # TODO(yejianwu) change this to source/binary
"tuning": tuning
}
sh_commands.falcon_push_metrics(
metrics, endpoint="mace_model_benchmark", tags=tags)
def benchmark_model(target_soc, model_output_dir, option_args=''):
command = "bash tools/benchmark.sh {} {} \"{}\"".format(
target_soc, model_output_dir, option_args)
run_command(command)
command = "bash tools/benchmark.sh {} {} \"{}\"".format(
target_soc, model_output_dir, option_args)
run_command(command)
def run_model(model_name, target_runtime, target_abi, target_soc,
model_output_dir, running_round, restart_round, option_args):
tuning_run(model_name, target_runtime, target_abi, target_soc,
model_output_dir, running_round, False,
restart_round, option_args)
tuning_run(model_name, target_runtime, target_abi, target_soc,
model_output_dir, running_round, False, restart_round,
option_args)
def generate_production_code(target_soc, model_output_dirs, pull_or_not):
cl_bin_dirs = []
for d in model_output_dirs:
cl_bin_dirs.append(os.path.join(d, "opencl_bin"))
cl_bin_dirs_str = ",".join(cl_bin_dirs)
command = "bash tools/generate_production_code.sh {} {} {}".format(
target_soc, cl_bin_dirs_str, int(pull_or_not))
run_command(command)
cl_bin_dirs = []
for d in model_output_dirs:
cl_bin_dirs.append(os.path.join(d, "opencl_bin"))
cl_bin_dirs_str = ",".join(cl_bin_dirs)
command = "bash tools/generate_production_code.sh {} {} {}".format(
target_soc, cl_bin_dirs_str, int(pull_or_not))
run_command(command)
def build_mace_run_prod(model_name, target_runtime, target_abi, target_soc,
model_output_dir, tuning):
if "dsp" == target_runtime:
hexagon_mode = True
else:
hexagon_mode = False
generate_code(target_soc, [], False)
production_or_not = False
build_mace_run(production_or_not, model_output_dir, hexagon_mode)
tuning_run(
model_name,
target_runtime,
target_abi,
target_soc,
model_output_dir,
running_round=0,
tuning=tuning,
restart_round=1)
generate_code(target_soc, [model_output_dir], True)
production_or_not = True
build_mace_run(production_or_not, model_output_dir, hexagon_mode)
if "dsp" == target_runtime:
hexagon_mode = True
else:
hexagon_mode = False
generate_code(target_soc, [], False)
production_or_not = False
build_mace_run(production_or_not, model_output_dir, hexagon_mode)
tuning_run(
model_name,
target_runtime,
target_abi,
target_soc,
model_output_dir,
running_round=0,
tuning=tuning,
restart_round=1)
generate_code(target_soc, [model_output_dir], True)
production_or_not = True
build_mace_run(production_or_not, model_output_dir, hexagon_mode)
def build_run_throughput_test(target_soc, run_seconds, merged_lib_file,
model_input_dir):
command = "bash tools/build_run_throughput_test.sh {} {} {} {}".format(
target_soc, run_seconds, merged_lib_file, model_input_dir)
run_command(command)
command = "bash tools/build_run_throughput_test.sh {} {} {} {}".format(
target_soc, run_seconds, merged_lib_file, model_input_dir)
run_command(command)
def validate_model(target_soc, model_output_dir):
generate_data_or_not = False
command = "bash tools/validate_tools.sh {} {} {}".format(
target_soc, model_output_dir, int(generate_data_or_not))
run_command(command)
generate_data_or_not = False
command = "bash tools/validate_tools.sh {} {} {}".format(
target_soc, model_output_dir, int(generate_data_or_not))
run_command(command)
def build_production_code():
command = "bash tools/build_production_code.sh"
run_command(command)
command = "bash tools/build_production_code.sh"
run_command(command)
def merge_libs_and_tuning_results(target_soc, output_dir, model_output_dirs):
generate_code(target_soc, model_output_dirs, False)
build_production_code()
generate_code(target_soc, model_output_dirs, False)
build_production_code()
model_output_dirs_str = ",".join(model_output_dirs)
command = "bash tools/merge_libs.sh {} {} {}".format(target_soc, output_dir,
model_output_dirs_str)
run_command(command)
model_output_dirs_str = ",".join(model_output_dirs)
command = "bash tools/merge_libs.sh {} {} {}".format(
target_soc, output_dir, model_output_dirs_str)
run_command(command)
def packaging_lib_file(output_dir):
command = "bash tools/packaging_lib.sh {}".format(output_dir)
run_command(command)
command = "bash tools/packaging_lib.sh {}".format(output_dir)
run_command(command)
def download_model_files(model_file_path,
model_output_dir,
weight_file_path=""):
if model_file_path.startswith("http://") or \
model_file_path.startswith("https://"):
os.environ["MODEL_FILE_PATH"] = model_output_dir + "/model.pb"
urllib.urlretrieve(model_file_path, os.environ["MODEL_FILE_PATH"])
if weight_file_path.startswith("http://") or \
weight_file_path.startswith("https://"):
os.environ[
"WEIGHT_FILE_PATH"] = model_output_dir + "/model.caffemodel"
urllib.urlretrieve(weight_file_path,
os.environ["WEIGHT_FILE_PATH"])
if model_file_path.startswith("http://") or \
model_file_path.startswith("https://"):
os.environ["MODEL_FILE_PATH"] = model_output_dir + "/model.pb"
urllib.urlretrieve(model_file_path, os.environ["MODEL_FILE_PATH"])
if weight_file_path.startswith("http://") or \
weight_file_path.startswith("https://"):
os.environ["WEIGHT_FILE_PATH"] = model_output_dir + "/model.caffemodel"
urllib.urlretrieve(weight_file_path, os.environ["WEIGHT_FILE_PATH"])
def md5sum(str):
md5 = hashlib.md5()
md5.update(str)
return md5.hexdigest()
md5 = hashlib.md5()
md5.update(str)
return md5.hexdigest()
def parse_model_configs():
with open(FLAGS.config) as f:
configs = yaml.load(f)
return configs
with open(FLAGS.config) as f:
configs = yaml.load(f)
return configs
def parse_args():
"""Parses command line arguments."""
parser = argparse.ArgumentParser()
parser.register("type", "bool", lambda v: v.lower() == "true")
parser.add_argument(
"--config",
type=str,
default="./tool/config",
help="The global config file of models.")
parser.add_argument(
"--output_dir", type=str, default="build", help="The output dir.")
parser.add_argument(
"--round", type=int, default=1, help="The model running round.")
parser.add_argument(
"--run_seconds",
type=int,
default=10,
help="The model throughput test running seconds.")
parser.add_argument(
"--restart_round", type=int, default=1, help="The model restart round.")
parser.add_argument(
"--tuning", type="bool", default="true", help="Tune opencl params.")
parser.add_argument(
"--mode",
type=str,
default="all",
help="[build|run|validate|merge|all|throughput_test].")
parser.add_argument(
"--target_socs",
type=str,
default="all",
help="SoCs to build, comma seperated list (getprop ro.board.platform)")
return parser.parse_known_args()
"""Parses command line arguments."""
parser = argparse.ArgumentParser()
parser.register("type", "bool", lambda v: v.lower() == "true")
parser.add_argument(
"--config",
type=str,
default="./tool/config",
help="The global config file of models.")
parser.add_argument(
"--output_dir", type=str, default="build", help="The output dir.")
parser.add_argument(
"--round", type=int, default=1, help="The model running round.")
parser.add_argument(
"--run_seconds",
type=int,
default=10,
help="The model throughput test running seconds.")
parser.add_argument(
"--restart_round",
type=int,
default=1,
help="The model restart round.")
parser.add_argument(
"--tuning", type="bool", default="true", help="Tune opencl params.")
parser.add_argument(
"--mode",
type=str,
default="all",
help="[build|run|validate|merge|all|throughput_test].")
parser.add_argument(
"--target_socs",
type=str,
default="all",
help="SoCs to build, comma seperated list (getprop ro.board.platform)")
return parser.parse_known_args()
def set_environment(configs):
os.environ["EMBED_MODEL_DATA"] = str(configs["embed_model_data"])
os.environ["VLOG_LEVEL"] = str(configs["vlog_level"])
os.environ["PROJECT_NAME"] = os.path.splitext(os.path.basename(
FLAGS.config))[0]
os.environ['INPUT_FILE_NAME'] = "model_input"
os.environ['OUTPUT_FILE_NAME'] = "model_out"
os.environ["EMBED_MODEL_DATA"] = str(configs["embed_model_data"])
os.environ["VLOG_LEVEL"] = str(configs["vlog_level"])
os.environ["PROJECT_NAME"] = os.path.splitext(
os.path.basename(FLAGS.config))[0]
os.environ['INPUT_FILE_NAME'] = "model_input"
os.environ['OUTPUT_FILE_NAME'] = "model_out"
def main(unused_args):
configs = parse_model_configs()
if FLAGS.mode == "validate":
FLAGS.round = 1
FLAGS.restart_round = 1
set_environment(configs)
if FLAGS.mode == "build" or FLAGS.mode == "all":
# Remove previous output dirs
if not os.path.exists(FLAGS.output_dir):
os.makedirs(FLAGS.output_dir)
elif os.path.exists(os.path.join(FLAGS.output_dir, "libmace")):
shutil.rmtree(os.path.join(FLAGS.output_dir, os.environ["PROJECT_NAME"]))
os.makedirs(os.path.join(FLAGS.output_dir, os.environ["PROJECT_NAME"]))
generate_version_code()
generate_opencl_source_code()
option_args = ' '.join([arg for arg in unused_args if arg.startswith('--')])
available_socs = sh_commands.adb_get_all_socs()
target_socs = available_socs
if hasattr(configs, "target_socs"):
target_socs = set(configs["target_socs"])
target_socs = target_socs & available_socs
if FLAGS.target_socs != "all":
socs = set(FLAGS.target_socs.split(','))
target_socs = target_socs & socs
missing_socs = socs.difference(target_socs)
if len(missing_socs) > 0:
print("Error: devices with SoCs are not connected %s" % missing_socs)
exit(1)
for target_soc in target_socs:
for target_abi in configs["target_abis"]:
global_runtime = get_global_runtime(configs)
# Transfer params by environment
os.environ["TARGET_ABI"] = target_abi
model_output_dirs = []
for model_name in configs["models"]:
print '=======================', model_name, '======================='
# Transfer params by environment
os.environ["MODEL_TAG"] = model_name
model_config = configs["models"][model_name]
input_file_list = model_config.get("validation_inputs_data", [])
for key in model_config:
if key in ['input_nodes', 'output_nodes'] and isinstance(
model_config[key], list):
os.environ[key.upper()] = ",".join(model_config[key])
elif key in ['input_shapes', 'output_shapes'] and isinstance(
model_config[key], list):
os.environ[key.upper()] = ":".join(model_config[key])
else:
os.environ[key.upper()] = str(model_config[key])
# Create model build directory
model_path_digest = md5sum(model_config["model_file_path"])
model_output_dir = "%s/%s/%s/%s/%s/%s/%s" % (FLAGS.output_dir,
os.environ["PROJECT_NAME"],
"build", model_name,
model_path_digest,
target_soc, target_abi)
model_output_dirs.append(model_output_dir)
if FLAGS.mode == "build" or FLAGS.mode == "all":
if os.path.exists(model_output_dir):
shutil.rmtree(model_output_dir)
os.makedirs(model_output_dir)
clear_env(target_soc)
download_model_files(model_config["model_file_path"],
model_output_dir, model_config.get("weight_file_path", ""))
if FLAGS.mode == "build" or FLAGS.mode == "run" or FLAGS.mode == "validate"\
or FLAGS.mode == "benchmark" or FLAGS.mode == "all":
generate_random_input(target_soc, model_output_dir,
model_config['input_nodes'], input_file_list)
if FLAGS.mode == "build" or FLAGS.mode == "all":
generate_model_code()
build_mace_run_prod(model_name, global_runtime, target_abi,
target_soc, model_output_dir, FLAGS.tuning)
if FLAGS.mode == "run" or FLAGS.mode == "validate" or FLAGS.mode == "all":
run_model(model_name, global_runtime, target_abi, target_soc,
model_output_dir, FLAGS.round, FLAGS.restart_round,
option_args)
if FLAGS.mode == "benchmark":
benchmark_model(target_soc, model_output_dir, option_args)
if FLAGS.mode == "validate" or FLAGS.mode == "all":
validate_model(target_soc, model_output_dir)
if FLAGS.mode == "build" or FLAGS.mode == "merge" or FLAGS.mode == "all":
merge_libs_and_tuning_results(
target_soc, FLAGS.output_dir + "/" + os.environ["PROJECT_NAME"],
model_output_dirs)
if FLAGS.mode == "throughput_test":
merged_lib_file = FLAGS.output_dir + "/%s/%s/libmace_%s.%s.a" % \
(os.environ["PROJECT_NAME"], target_abi, os.environ["PROJECT_NAME"], target_soc)
generate_random_input(target_soc, FLAGS.output_dir, [], [])
for model_name in configs["models"]:
runtime = configs["models"][model_name]["runtime"]
os.environ["%s_MODEL_TAG" % runtime.upper()] = model_name
build_run_throughput_test(target_soc, FLAGS.run_seconds,
merged_lib_file, FLAGS.output_dir)
if FLAGS.mode == "build" or FLAGS.mode == "all":
packaging_lib_file(FLAGS.output_dir)
configs = parse_model_configs()
if FLAGS.mode == "validate":
FLAGS.round = 1
FLAGS.restart_round = 1
set_environment(configs)
if FLAGS.mode == "build" or FLAGS.mode == "all":
# Remove previous output dirs
if not os.path.exists(FLAGS.output_dir):
os.makedirs(FLAGS.output_dir)
elif os.path.exists(os.path.join(FLAGS.output_dir, "libmace")):
shutil.rmtree(
os.path.join(FLAGS.output_dir, os.environ["PROJECT_NAME"]))
os.makedirs(
os.path.join(FLAGS.output_dir, os.environ["PROJECT_NAME"]))
generate_version_code()
generate_opencl_source_code()
option_args = ' '.join(
[arg for arg in unused_args if arg.startswith('--')])
available_socs = sh_commands.adb_get_all_socs()
target_socs = available_socs
if hasattr(configs, "target_socs"):
target_socs = set(configs["target_socs"])
target_socs = target_socs & available_socs
if FLAGS.target_socs != "all":
socs = set(FLAGS.target_socs.split(','))
target_socs = target_socs & socs
missing_socs = socs.difference(target_socs)
if len(missing_socs) > 0:
            print(
                "Error: no connected devices found for SoCs %s" % missing_socs)
exit(1)
for target_soc in target_socs:
for target_abi in configs["target_abis"]:
global_runtime = get_global_runtime(configs)
# Transfer params by environment
os.environ["TARGET_ABI"] = target_abi
model_output_dirs = []
for model_name in configs["models"]:
print '===================', model_name, '==================='
# Transfer params by environment
os.environ["MODEL_TAG"] = model_name
model_config = configs["models"][model_name]
input_file_list = model_config.get("validation_inputs_data",
[])
for key in model_config:
if key in ['input_nodes', 'output_nodes'] and isinstance(
model_config[key], list):
os.environ[key.upper()] = ",".join(model_config[key])
elif key in ['input_shapes', 'output_shapes'
] and isinstance(model_config[key], list):
os.environ[key.upper()] = ":".join(model_config[key])
else:
os.environ[key.upper()] = str(model_config[key])
# Create model build directory
model_path_digest = md5sum(model_config["model_file_path"])
model_output_dir = "%s/%s/%s/%s/%s/%s/%s" % (
FLAGS.output_dir, os.environ["PROJECT_NAME"], "build",
model_name, model_path_digest, target_soc, target_abi)
model_output_dirs.append(model_output_dir)
if FLAGS.mode == "build" or FLAGS.mode == "all":
if os.path.exists(model_output_dir):
shutil.rmtree(model_output_dir)
os.makedirs(model_output_dir)
clear_env(target_soc)
download_model_files(model_config["model_file_path"],
model_output_dir,
model_config.get("weight_file_path", ""))
if FLAGS.mode == "build" or FLAGS.mode == "run" or \
FLAGS.mode == "validate" or \
FLAGS.mode == "benchmark" or FLAGS.mode == "all":
generate_random_input(target_soc, model_output_dir,
model_config['input_nodes'],
input_file_list)
if FLAGS.mode == "build" or FLAGS.mode == "all":
generate_model_code()
build_mace_run_prod(model_name, global_runtime, target_abi,
target_soc, model_output_dir,
FLAGS.tuning)
if FLAGS.mode == "run" or FLAGS.mode == "validate" or \
FLAGS.mode == "all":
run_model(model_name, global_runtime, target_abi,
target_soc, model_output_dir, FLAGS.round,
FLAGS.restart_round, option_args)
if FLAGS.mode == "benchmark":
benchmark_model(target_soc, model_output_dir, option_args)
if FLAGS.mode == "validate" or FLAGS.mode == "all":
validate_model(target_soc, model_output_dir)
if FLAGS.mode == "build" or FLAGS.mode == "merge" or \
FLAGS.mode == "all":
merge_libs_and_tuning_results(
target_soc,
FLAGS.output_dir + "/" + os.environ["PROJECT_NAME"],
model_output_dirs)
if FLAGS.mode == "throughput_test":
merged_lib_file = FLAGS.output_dir + \
"/%s/%s/libmace_%s.%s.a" % \
(os.environ["PROJECT_NAME"], target_abi,
os.environ["PROJECT_NAME"], target_soc)
generate_random_input(target_soc, FLAGS.output_dir, [], [])
for model_name in configs["models"]:
runtime = configs["models"][model_name]["runtime"]
os.environ["%s_MODEL_TAG" % runtime.upper()] = model_name
build_run_throughput_test(target_soc, FLAGS.run_seconds,
merged_lib_file, FLAGS.output_dir)
if FLAGS.mode == "build" or FLAGS.mode == "all":
packaging_lib_file(FLAGS.output_dir)
if __name__ == "__main__":
FLAGS, unparsed = parse_args()
main(unused_args=[sys.argv[0]] + unparsed)
FLAGS, unparsed = parse_args()
main(unused_args=[sys.argv[0]] + unparsed)
......@@ -3,172 +3,205 @@ import re
import time
import falcon_cli
################################
# common
################################
def strip_invalid_utf8(str):
return sh.iconv(str, "-c", "-t", "UTF-8")
return sh.iconv(str, "-c", "-t", "UTF-8")
def make_output_processor(buff):
def process_output(line):
print(line.strip())
buff.append(line)
return process_output
def process_output(line):
print(line.strip())
buff.append(line)
return process_output
################################
# adb commands
################################
def adb_split_stdout(stdout_str):
stdout_str = strip_invalid_utf8(stdout_str)
# Filter out last empty line
return [l.strip() for l in stdout_str.split('\n') if len(l.strip()) > 0]
stdout_str = strip_invalid_utf8(stdout_str)
# Filter out last empty line
return [l.strip() for l in stdout_str.split('\n') if len(l.strip()) > 0]
def adb_devices(target_socs=None):
outputs = sh.grep(sh.adb("devices"), "^[A-Za-z0-9]\+[[:space:]]\+device$")
raw_lists = sh.cut(outputs, "-f1")
device_ids = adb_split_stdout(raw_lists)
if target_socs != None:
target_socs_set = set(target_socs)
target_devices = []
for serialno in device_ids:
props = adb_getprop_by_serialno(serialno)
if props["ro.board.platform"] in target_socs_set:
target_devices.append(serialno)
return target_devices
else:
return device_ids
outputs = sh.grep(sh.adb("devices"), "^[A-Za-z0-9]\+[[:space:]]\+device$")
raw_lists = sh.cut(outputs, "-f1")
device_ids = adb_split_stdout(raw_lists)
if target_socs is not None:
target_socs_set = set(target_socs)
target_devices = []
for serialno in device_ids:
props = adb_getprop_by_serialno(serialno)
if props["ro.board.platform"] in target_socs_set:
target_devices.append(serialno)
return target_devices
else:
return device_ids
def adb_getprop_by_serialno(serialno):
outputs = sh.adb("-s", serialno, "shell", "getprop")
raw_props = adb_split_stdout(outputs)
props = {}
p = re.compile("\[(.+)\]: \[(.+)\]")
for raw_prop in raw_props:
m = p.match(raw_prop)
if m:
props[m.group(1)] = m.group(2)
return props
outputs = sh.adb("-s", serialno, "shell", "getprop")
raw_props = adb_split_stdout(outputs)
props = {}
p = re.compile("\[(.+)\]: \[(.+)\]")
for raw_prop in raw_props:
m = p.match(raw_prop)
if m:
props[m.group(1)] = m.group(2)
return props
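For context, Android getprop prints one bracketed key/value pair per line. A
hedged sketch of what the pattern above extracts (the property value is
illustrative):

import re

p = re.compile("\[(.+)\]: \[(.+)\]")
m = p.match("[ro.board.platform]: [msm8998]")  # hypothetical device line
assert m.group(1) == "ro.board.platform" and m.group(2) == "msm8998"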
def adb_supported_abis(serialno):
props = adb_getprop_by_serialno(serialno)
abilist_str = props["ro.product.cpu.abilist"]
abis = [abi.strip() for abi in abilist_str.split(',')]
return abis
props = adb_getprop_by_serialno(serialno)
abilist_str = props["ro.product.cpu.abilist"]
abis = [abi.strip() for abi in abilist_str.split(',')]
return abis
def adb_get_all_socs():
socs = []
for d in adb_devices():
props = adb_getprop_by_serialno(d)
socs.append(props["ro.board.platform"])
return set(socs)
socs = []
for d in adb_devices():
props = adb_getprop_by_serialno(d)
socs.append(props["ro.board.platform"])
return set(socs)
def adb_run(serialno, host_bin_path, bin_name,
def adb_run(serialno,
host_bin_path,
bin_name,
args="",
opencl_profiling=1,
vlog_level=0,
device_bin_path="/data/local/tmp/mace",
out_of_range_check=1):
host_bin_full_path = "%s/%s" % (host_bin_path, bin_name)
device_bin_full_path = "%s/%s" % (device_bin_path, bin_name)
props = adb_getprop_by_serialno(serialno)
print("=====================================================================")
print("Run on device: %s, %s, %s" % (serialno, props["ro.board.platform"],
props["ro.product.model"]))
sh.adb("-s", serialno, "shell", "rm -rf %s" % device_bin_path)
sh.adb("-s", serialno, "shell", "mkdir -p %s" % device_bin_path)
print("Push %s to %s" % (host_bin_full_path, device_bin_full_path))
sh.adb("-s", serialno, "push", host_bin_full_path, device_bin_full_path)
print("Run %s" % device_bin_full_path)
stdout_buff=[]
process_output = make_output_processor(stdout_buff)
p = sh.adb("-s", serialno, "shell",
"MACE_OUT_OF_RANGE_CHECK=%d MACE_OPENCL_PROFILING=%d MACE_CPP_MIN_VLOG_LEVEL=%d %s %s" %
(out_of_range_check, opencl_profiling, vlog_level, device_bin_full_path, args),
_out=process_output, _bg=True, _err_to_out=True)
p.wait()
return "".join(stdout_buff)
host_bin_full_path = "%s/%s" % (host_bin_path, bin_name)
device_bin_full_path = "%s/%s" % (device_bin_path, bin_name)
props = adb_getprop_by_serialno(serialno)
print(
"====================================================================="
)
print("Run on device: %s, %s, %s" % (serialno, props["ro.board.platform"],
props["ro.product.model"]))
sh.adb("-s", serialno, "shell", "rm -rf %s" % device_bin_path)
sh.adb("-s", serialno, "shell", "mkdir -p %s" % device_bin_path)
print("Push %s to %s" % (host_bin_full_path, device_bin_full_path))
sh.adb("-s", serialno, "push", host_bin_full_path, device_bin_full_path)
print("Run %s" % device_bin_full_path)
stdout_buff = []
process_output = make_output_processor(stdout_buff)
p = sh.adb(
"-s",
serialno,
"shell",
"MACE_OUT_OF_RANGE_CHECK=%d MACE_OPENCL_PROFILING=%d "
"MACE_CPP_MIN_VLOG_LEVEL=%d %s %s" %
(out_of_range_check, opencl_profiling, vlog_level,
device_bin_full_path, args),
_out=process_output,
_bg=True,
_err_to_out=True)
p.wait()
return "".join(stdout_buff)
################################
# bazel commands
################################
def bazel_build(target, strip="always", abi="armeabi-v7a"):
print("Build %s with ABI %s" % (target, abi))
stdout_buff=[]
process_output = make_output_processor(stdout_buff)
p= sh.bazel("build",
"-c", "opt",
"--strip", strip,
"--verbose_failures",
target,
"--crosstool_top=//external:android/crosstool",
"--host_crosstool_top=@bazel_tools//tools/cpp:toolchain",
"--cpu=%s" % abi,
"--copt=-std=c++11",
"--copt=-D_GLIBCXX_USE_C99_MATH_TR1",
"--copt=-DMACE_DISABLE_NO_TUNING_WARNING",
"--copt=-Werror=return-type",
"--copt=-O3",
"--define", "neon=true",
"--define", "openmp=true",
_out=process_output, _bg=True, _err_to_out=True)
p.wait()
return "".join(stdout_buff)
print("Build %s with ABI %s" % (target, abi))
stdout_buff = []
process_output = make_output_processor(stdout_buff)
p = sh.bazel(
"build",
"-c",
"opt",
"--strip",
strip,
"--verbose_failures",
target,
"--crosstool_top=//external:android/crosstool",
"--host_crosstool_top=@bazel_tools//tools/cpp:toolchain",
"--cpu=%s" % abi,
"--copt=-std=c++11",
"--copt=-D_GLIBCXX_USE_C99_MATH_TR1",
"--copt=-DMACE_DISABLE_NO_TUNING_WARNING",
"--copt=-Werror=return-type",
"--copt=-O3",
"--define",
"neon=true",
"--define",
"openmp=true",
_out=process_output,
_bg=True,
_err_to_out=True)
p.wait()
return "".join(stdout_buff)
def bazel_target_to_bin(target):
# change //mace/a/b:c to bazel-bin/mace/a/b/c
prefix, bin_name = target.split(':')
prefix = prefix.replace('//', '/')
if prefix.startswith('/'):
prefix = prefix[1:]
host_bin_path = "bazel-bin/%s" % prefix
return host_bin_path, bin_name
# change //mace/a/b:c to bazel-bin/mace/a/b/c
prefix, bin_name = target.split(':')
prefix = prefix.replace('//', '/')
if prefix.startswith('/'):
prefix = prefix[1:]
host_bin_path = "bazel-bin/%s" % prefix
return host_bin_path, bin_name
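A quick sanity check of the mapping, using the ops_test target already
referenced by this repo's tooling:

assert bazel_target_to_bin("//mace/ops:ops_test") == \
    ("bazel-bin/mace/ops", "ops_test")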
################################
# mace commands
################################
# TODO this should be refactored
def gen_encrypted_opencl_source(codegen_path="mace/codegen"):
sh.mkdir("-p", "%s/opencl" % codegen_path)
sh.python("mace/python/tools/encrypt_opencl_codegen.py",
"--cl_kernel_dir=./mace/kernels/opencl/cl/",
"--output_path=%s/opencl/opencl_encrypt_program.cc" % codegen_path)
sh.mkdir("-p", "%s/opencl" % codegen_path)
sh.python(
"mace/python/tools/encrypt_opencl_codegen.py",
"--cl_kernel_dir=./mace/kernels/opencl/cl/",
"--output_path=%s/opencl/opencl_encrypt_program.cc" % codegen_path)
def gen_mace_version(codegen_path="mace/codegen"):
sh.mkdir("-p", "%s/version" % codegen_path)
sh.bash("mace/tools/git/gen_version_source.sh",
"%s/version/version.cc" % codegen_path)
sh.mkdir("-p", "%s/version" % codegen_path)
sh.bash("mace/tools/git/gen_version_source.sh",
"%s/version/version.cc" % codegen_path)
def gen_compiled_opencl_source(codegen_path="mace/codegen"):
sh.mkdir("-p", "%s/opencl" % codegen_path)
sh.python("mace/python/tools/opencl_codegen.py",
"--output_path=%s/opencl/opencl_compiled_program.cc" % codegen_path)
sh.mkdir("-p", "%s/opencl" % codegen_path)
sh.python(
"mace/python/tools/opencl_codegen.py",
"--output_path=%s/opencl/opencl_compiled_program.cc" % codegen_path)
################################
# falcon
################################
def falcon_tags(tags_dict):
tags = ""
for k, v in tags_dict.iteritems():
if tags == "":
tags = "%s=%s" % (k, v)
else:
tags = tags + ",%s=%s" % (k, v)
return tags
tags = ""
for k, v in tags_dict.iteritems():
if tags == "":
tags = "%s=%s" % (k, v)
else:
tags = tags + ",%s=%s" % (k, v)
return tags
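falcon_tags flattens a dict into Falcon's "k=v,k=v" form. A single-key check
(the tag value is illustrative) that avoids relying on dict iteration order:

assert falcon_tags({"abi": "arm64-v8a"}) == "abi=arm64-v8a"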
def falcon_push_metrics(metrics, endpoint="mace_dev", tags={}):
cli = falcon_cli.FalconCli.connect(server="transfer.falcon.miliao.srv",
port=8433,
debug=False)
ts = int(time.time())
falcon_metrics = [{
"endpoint": endpoint,
"metric": key,
"tags": falcon_tags(tags),
"timestamp": ts,
"value": value,
"step": 86400,
"counterType": "GAUGE"
} for key, value in metrics.iteritems()]
cli.update(falcon_metrics)
def falcon_push_metrics(metrics, endpoint="mace_dev", tags={}):
cli = falcon_cli.FalconCli.connect(
server="transfer.falcon.miliao.srv", port=8433, debug=False)
ts = int(time.time())
falcon_metrics = [{
"endpoint": endpoint,
"metric": key,
"tags": falcon_tags(tags),
"timestamp": ts,
"value": value,
"step": 86400,
"counterType": "GAUGE"
} for key, value in metrics.iteritems()]
cli.update(falcon_metrics)
......@@ -20,175 +20,172 @@ from scipy import stats
# --input_shape 1,64,64,3 \
# --output_shape 1,64,64,2
def load_data(file):
if os.path.isfile(file):
return np.fromfile(file=file, dtype=np.float32)
else:
return np.empty([0])
if os.path.isfile(file):
return np.fromfile(file=file, dtype=np.float32)
else:
return np.empty([0])
def format_output_name(name):
return re.sub('[^0-9a-zA-Z]+', '_', name)
return re.sub('[^0-9a-zA-Z]+', '_', name)
def compare_output(output_name, mace_out_value, out_value):
if mace_out_value.size != 0:
out_value = out_value.reshape(-1)
mace_out_value = mace_out_value.reshape(-1)
assert len(out_value) == len(mace_out_value)
similarity = (1 - spatial.distance.cosine(out_value, mace_out_value))
print output_name, 'MACE VS', FLAGS.platform.upper(), 'similarity: ', similarity
if (FLAGS.mace_runtime == "cpu" and similarity > 0.999) or \
(FLAGS.mace_runtime == "neon" and similarity > 0.999) or \
(FLAGS.mace_runtime == "gpu" and similarity > 0.995) or \
(FLAGS.mace_runtime == "dsp" and similarity > 0.930):
print '=======================Similarity Test Passed======================'
if mace_out_value.size != 0:
out_value = out_value.reshape(-1)
mace_out_value = mace_out_value.reshape(-1)
assert len(out_value) == len(mace_out_value)
similarity = (1 - spatial.distance.cosine(out_value, mace_out_value))
        print output_name, 'MACE VS', FLAGS.platform.upper(), \
            'similarity: ', similarity
if (FLAGS.mace_runtime == "cpu" and similarity > 0.999) or \
(FLAGS.mace_runtime == "neon" and similarity > 0.999) or \
(FLAGS.mace_runtime == "gpu" and similarity > 0.995) or \
(FLAGS.mace_runtime == "dsp" and similarity > 0.930):
print '===================Similarity Test Passed=================='
else:
print '===================Similarity Test Failed=================='
sys.exit(-1)
else:
print '=======================Similarity Test Failed======================'
sys.exit(-1)
else:
print '=======================Skip empty node==================='
sys.exit(-1)
print '=======================Skip empty node==================='
sys.exit(-1)
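The check above scores outputs by cosine similarity. A hedged numeric sketch
(the vectors are made up) of how near-identical outputs clear the 0.999
cpu/neon bar:

import numpy as np
from scipy import spatial

a = np.array([1.0, 2.0, 3.0])
b = np.array([1.0, 2.0, 3.001])  # hypothetical near-identical outputs
assert 1 - spatial.distance.cosine(a, b) > 0.999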
def validate_tf_model(input_names, input_shapes, output_names):
import tensorflow as tf
if not os.path.isfile(FLAGS.model_file):
print("Input graph file '" + FLAGS.model_file + "' does not exist!")
sys.exit(-1)
input_graph_def = tf.GraphDef()
with open(FLAGS.model_file, "rb") as f:
data = f.read()
input_graph_def.ParseFromString(data)
tf.import_graph_def(input_graph_def, name="")
with tf.Session() as session:
with session.graph.as_default() as graph:
import tensorflow as tf
if not os.path.isfile(FLAGS.model_file):
print("Input graph file '" + FLAGS.model_file + "' does not exist!")
sys.exit(-1)
input_graph_def = tf.GraphDef()
with open(FLAGS.model_file, "rb") as f:
data = f.read()
input_graph_def.ParseFromString(data)
tf.import_graph_def(input_graph_def, name="")
input_dict = {}
for i in range(len(input_names)):
input_value = load_data(FLAGS.input_file + "_" + input_names[i])
input_value = input_value.reshape(input_shapes[i])
input_node = graph.get_tensor_by_name(input_names[i] + ':0')
input_dict[input_node] = input_value
output_nodes = []
for name in output_names:
output_nodes.extend([graph.get_tensor_by_name(name + ':0')])
output_values = session.run(output_nodes, feed_dict=input_dict)
for i in range(len(output_names)):
output_file_name = FLAGS.mace_out_file + "_" + format_output_name(output_names[i])
mace_out_value = load_data(output_file_name)
compare_output(output_names[i], mace_out_value, output_values[i])
def validate_caffe_model(input_names, input_shapes, output_names, output_shapes):
  os.environ['GLOG_minloglevel'] = '1' # suppress Caffe verbose prints
import caffe
if not os.path.isfile(FLAGS.model_file):
print("Input graph file '" + FLAGS.model_file + "' does not exist!")
sys.exit(-1)
if not os.path.isfile(FLAGS.weight_file):
print("Input weight file '" + FLAGS.weight_file + "' does not exist!")
sys.exit(-1)
caffe.set_mode_cpu()
net = caffe.Net(FLAGS.model_file, caffe.TEST, weights=FLAGS.weight_file)
for i in range(len(input_names)):
input_value = load_data(FLAGS.input_file + "_" + input_names[i])
input_value = input_value.reshape(input_shapes[i]).transpose((0, 3, 1, 2))
input_blob_name = input_names[i]
try:
if input_names[i] in net.top_names:
input_blob_name = net.top_names[input_names[i]][0]
except ValueError:
pass
net.blobs[input_blob_name].data[0] = input_value
net.forward()
for i in range(len(output_names)):
value = net.blobs[net.top_names[output_names[i]][0]].data
out_shape = output_shapes[i]
out_shape[1], out_shape[2], out_shape[3] = out_shape[3], out_shape[1], out_shape[2]
value = value.reshape(out_shape).transpose((0, 2, 3, 1))
output_file_name = FLAGS.mace_out_file + "_" + format_output_name(output_names[i])
mace_out_value = load_data(output_file_name)
compare_output(output_names[i], mace_out_value, value)
with tf.Session() as session:
with session.graph.as_default() as graph:
tf.import_graph_def(input_graph_def, name="")
input_dict = {}
for i in range(len(input_names)):
input_value = load_data(
FLAGS.input_file + "_" + input_names[i])
input_value = input_value.reshape(input_shapes[i])
input_node = graph.get_tensor_by_name(
input_names[i] + ':0')
input_dict[input_node] = input_value
output_nodes = []
for name in output_names:
output_nodes.extend(
[graph.get_tensor_by_name(name + ':0')])
output_values = session.run(output_nodes, feed_dict=input_dict)
for i in range(len(output_names)):
output_file_name = FLAGS.mace_out_file + "_" + \
format_output_name(output_names[i])
mace_out_value = load_data(output_file_name)
compare_output(output_names[i], mace_out_value,
output_values[i])
def validate_caffe_model(input_names, input_shapes, output_names,
output_shapes):
    os.environ['GLOG_minloglevel'] = '1'  # suppress Caffe verbose prints
import caffe
if not os.path.isfile(FLAGS.model_file):
print("Input graph file '" + FLAGS.model_file + "' does not exist!")
sys.exit(-1)
if not os.path.isfile(FLAGS.weight_file):
print("Input weight file '" + FLAGS.weight_file + "' does not exist!")
sys.exit(-1)
caffe.set_mode_cpu()
net = caffe.Net(FLAGS.model_file, caffe.TEST, weights=FLAGS.weight_file)
for i in range(len(input_names)):
input_value = load_data(FLAGS.input_file + "_" + input_names[i])
input_value = input_value.reshape(input_shapes[i]).transpose((0, 3, 1,
2))
input_blob_name = input_names[i]
try:
if input_names[i] in net.top_names:
input_blob_name = net.top_names[input_names[i]][0]
except ValueError:
pass
net.blobs[input_blob_name].data[0] = input_value
net.forward()
for i in range(len(output_names)):
value = net.blobs[net.top_names[output_names[i]][0]].data
out_shape = output_shapes[i]
out_shape[1], out_shape[2], out_shape[3] = out_shape[3], out_shape[
1], out_shape[2]
value = value.reshape(out_shape).transpose((0, 2, 3, 1))
output_file_name = FLAGS.mace_out_file + "_" + format_output_name(
output_names[i])
mace_out_value = load_data(output_file_name)
compare_output(output_names[i], mace_out_value, value)
def main(unused_args):
input_names = [name for name in FLAGS.input_node.split(',')]
input_shape_strs = [shape for shape in FLAGS.input_shape.split(':')]
input_shapes = [[int(x) for x in shape.split(',')] for shape in input_shape_strs]
output_names = [name for name in FLAGS.output_node.split(',')]
assert len(input_names) == len(input_shapes)
if FLAGS.platform == 'tensorflow':
validate_tf_model(input_names, input_shapes, output_names)
elif FLAGS.platform == 'caffe':
output_shape_strs = [shape for shape in FLAGS.output_shape.split(':')]
output_shapes = [[int(x) for x in shape.split(',')] for shape in output_shape_strs]
validate_caffe_model(input_names, input_shapes, output_names, output_shapes)
input_names = [name for name in FLAGS.input_node.split(',')]
input_shape_strs = [shape for shape in FLAGS.input_shape.split(':')]
input_shapes = [[int(x) for x in shape.split(',')]
for shape in input_shape_strs]
output_names = [name for name in FLAGS.output_node.split(',')]
assert len(input_names) == len(input_shapes)
if FLAGS.platform == 'tensorflow':
validate_tf_model(input_names, input_shapes, output_names)
elif FLAGS.platform == 'caffe':
output_shape_strs = [shape for shape in FLAGS.output_shape.split(':')]
output_shapes = [[int(x) for x in shape.split(',')]
for shape in output_shape_strs]
validate_caffe_model(input_names, input_shapes, output_names,
output_shapes)
def parse_args():
"""Parses command line arguments."""
parser = argparse.ArgumentParser()
parser.register("type", "bool", lambda v: v.lower() == "true")
parser.add_argument(
"--platform",
type=str,
default="",
help="Tensorflow or Caffe.")
parser.add_argument(
"--model_file",
type=str,
default="",
help="TensorFlow or Caffe \'GraphDef\' file to load.")
parser.add_argument(
"--weight_file",
type=str,
default="",
help="caffe model file to load.")
parser.add_argument(
"--input_file",
type=str,
default="",
help="input file.")
parser.add_argument(
"--mace_out_file",
type=str,
default="",
help="mace output file to load.")
parser.add_argument(
"--mace_runtime",
type=str,
default="gpu",
help="mace runtime device.")
parser.add_argument(
"--input_shape",
type=str,
default="1,64,64,3",
help="input shape.")
parser.add_argument(
"--output_shape",
type=str,
default="1,64,64,2",
help="output shape.")
parser.add_argument(
"--input_node",
type=str,
default="input_node",
help="input node")
parser.add_argument(
"--output_node",
type=str,
default="output_node",
help="output node")
return parser.parse_known_args()
"""Parses command line arguments."""
parser = argparse.ArgumentParser()
parser.register("type", "bool", lambda v: v.lower() == "true")
parser.add_argument(
"--platform", type=str, default="", help="Tensorflow or Caffe.")
parser.add_argument(
"--model_file",
type=str,
default="",
help="TensorFlow or Caffe \'GraphDef\' file to load.")
parser.add_argument(
"--weight_file",
type=str,
default="",
help="caffe model file to load.")
parser.add_argument(
"--input_file", type=str, default="", help="input file.")
parser.add_argument(
"--mace_out_file",
type=str,
default="",
help="mace output file to load.")
parser.add_argument(
"--mace_runtime", type=str, default="gpu", help="mace runtime device.")
parser.add_argument(
"--input_shape", type=str, default="1,64,64,3", help="input shape.")
parser.add_argument(
"--output_shape", type=str, default="1,64,64,2", help="output shape.")
parser.add_argument(
"--input_node", type=str, default="input_node", help="input node")
parser.add_argument(
"--output_node", type=str, default="output_node", help="output node")
return parser.parse_known_args()
if __name__ == '__main__':
FLAGS, unparsed = parse_args()
main(unused_args=[sys.argv[0]] + unparsed)
FLAGS, unparsed = parse_args()
main(unused_args=[sys.argv[0]] + unparsed)
......@@ -11,199 +11,195 @@ G_T = {}
# f(2, 3)
A_T[4] = np.array([[1, 1, 1, 0], [0, 1, -1, -1]]).astype(np.float32)
A[4] = np.transpose(A_T[4])
B_T[4] = np.array([
[1, 0, -1, 0],
[0, 1, 1, 0],
[0, -1, 1, 0],
[0, 1, 0, -1]
]).astype(np.float32)
B_T[4] = np.array([[1, 0, -1, 0], [0, 1, 1, 0], [0, -1, 1, 0],
[0, 1, 0, -1]]).astype(np.float32)
B[4] = np.transpose(B_T[4])
G[4] = np.array([
[1, 0, 0],
[0.5, 0.5, 0.5],
[0.5, -0.5, 0.5],
[0, 0, 1],
[1, 0, 0],
[0.5, 0.5, 0.5],
[0.5, -0.5, 0.5],
[0, 0, 1],
]).astype(np.float32)
G_T[4] = np.transpose(G[4])
# f(4, 3)
A_T[6] = np.array([
[1, 1, 1, 1, 1, 0],
[0, 1, -1, 2, -2, 0],
[0, 1, 1, 4, 4, 0],
[0, 1, -1, 8, -8, 1],
[1, 1, 1, 1, 1, 0],
[0, 1, -1, 2, -2, 0],
[0, 1, 1, 4, 4, 0],
[0, 1, -1, 8, -8, 1],
]).astype(np.float32)
A[6] = np.transpose(A_T[6])
B_T[6] = np.array([
[4, 0, -5, 0, 1, 0],
[0, -4, -4, 1, 1, 0],
[0, 4, -4, -1, 1, 0],
[0, -2, -1, 2, 1, 0],
[0, 2, -1, -2, 1, 0],
[0, 4, 0, -5, 0, 1],
[4, 0, -5, 0, 1, 0],
[0, -4, -4, 1, 1, 0],
[0, 4, -4, -1, 1, 0],
[0, -2, -1, 2, 1, 0],
[0, 2, -1, -2, 1, 0],
[0, 4, 0, -5, 0, 1],
]).astype(np.float32)
B[6] = np.transpose(B_T[6])
G[6] = np.array([
[1/4.0 , 0 , 0 ],
[-1/6.0, -1/6.0 , -1/6.0],
[-1/6.0, 1/6.0 , -1/6.0],
[1/24.0, 1/12.0 , 1/6.0 ],
[1/24.0, -1/12.0, 1/6.0 ],
[ 0 , 0 , 1 ],
[1 / 4.0, 0, 0],
[-1 / 6.0, -1 / 6.0, -1 / 6.0],
[-1 / 6.0, 1 / 6.0, -1 / 6.0],
[1 / 24.0, 1 / 12.0, 1 / 6.0],
[1 / 24.0, -1 / 12.0, 1 / 6.0],
[0, 0, 1],
]).astype(np.float32)
G_T[6] = np.transpose(G[6])
# f(6, 3)
A_T[8] = np.array([
[1, 1, 1 , 1 , 1 , 1 , 1 , 0],
[0, 1, -1, 2 , -2 , 1/2. , -1/2. , 0],
[0, 1, 1 , 4 , 4 , 1/4. , 1/4. , 0],
[0, 1, -1, 8 , -8 , 1/8. , -1/8. , 0],
[0, 1, 1 , 16, 16 , 1/16., 1/16. , 0],
[0, 1, -1, 32, -32, 1/32., -1/32., 1],
[1, 1, 1, 1, 1, 1, 1, 0],
[0, 1, -1, 2, -2, 1 / 2., -1 / 2., 0],
[0, 1, 1, 4, 4, 1 / 4., 1 / 4., 0],
[0, 1, -1, 8, -8, 1 / 8., -1 / 8., 0],
[0, 1, 1, 16, 16, 1 / 16., 1 / 16., 0],
[0, 1, -1, 32, -32, 1 / 32., -1 / 32., 1],
]).astype(np.float32)
A[8] = np.transpose(A_T[8])
B_T[8] = np.array([
[1, 0 , -21/4., 0 , 21/4., 0 , -1, 0],
[0, 1 , 1 , -17/4., -17/4., 1 , 1 , 0],
[0, -1 , 1 , 17/4. , -17/4., -1 , 1 , 0],
[0, 1/2. , 1/4. , -5/2. , -5/4., 2 , 1 , 0],
[0, -1/2., 1/4. , 5/2. , -5/4., -2 , 1 , 0],
[0, 2 , 4 , -5/2. , -5 , 1/2. , 1 , 0],
[0, -2 , 4 , 5/2. , -5 , -1/2. , 1 , 0],
[0, -1 , 0 , 21/4. , 0 , -21/4., 0 , 1],
[1, 0, -21 / 4., 0, 21 / 4., 0, -1, 0],
[0, 1, 1, -17 / 4., -17 / 4., 1, 1, 0],
[0, -1, 1, 17 / 4., -17 / 4., -1, 1, 0],
[0, 1 / 2., 1 / 4., -5 / 2., -5 / 4., 2, 1, 0],
[0, -1 / 2., 1 / 4., 5 / 2., -5 / 4., -2, 1, 0],
[0, 2, 4, -5 / 2., -5, 1 / 2., 1, 0],
[0, -2, 4, 5 / 2., -5, -1 / 2., 1, 0],
[0, -1, 0, 21 / 4., 0, -21 / 4., 0, 1],
]).astype(np.float32)
B[8] = np.transpose(B_T[8])
G[8] = np.array([
[ 1 , 0 , 0 ],
[-2/9. , -2/9. , -2/9.],
[-2/9. , 2/9. , -2/9.],
[1/90. , 1/45. , 2/45.],
[1/90. , -1/45. , 2/45.],
[32/45., 16/45. , 8/45.],
[32/45., -16/45., 8/45.],
[ 0 , 0 , 1 ],
[1, 0, 0],
[-2 / 9., -2 / 9., -2 / 9.],
[-2 / 9., 2 / 9., -2 / 9.],
[1 / 90., 1 / 45., 2 / 45.],
[1 / 90., -1 / 45., 2 / 45.],
[32 / 45., 16 / 45., 8 / 45.],
[32 / 45., -16 / 45., 8 / 45.],
[0, 0, 1],
]).astype(np.float32)
G_T[8] = np.transpose(G[8])
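The dictionaries above hold the F(m, r) transform matrices with
alpha = m + r - 1. A hedged self-check, not part of the original file, of the
1-D Winograd identity y = A_T . [(G g) * (B_T d)] for F(2, 3) against direct
correlation, reusing numpy as np and the matrices defined above:

g = np.random.random(3).astype(np.float32)  # r = 3 filter taps
d = np.random.random(4).astype(np.float32)  # alpha = 4 input tile
y = np.dot(A_T[4], np.dot(G[4], g) * np.dot(B_T[4], d))
direct = np.array([np.dot(d[i:i + 3], g) for i in range(2)])  # m = 2 outputs
assert np.allclose(y, direct, atol=1e-5)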
def output_shape(input_shape, filter_shape):
out_shape = np.zeros(4).astype(np.int32)
out_shape[0] = input_shape[0]
out_shape[1] = filter_shape[0]
out_shape[2] = input_shape[2] - 2
out_shape[3] = input_shape[3] - 2
return out_shape
def winograd_conv(m, r, input, filter):
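# F(m x m, r x r) Winograd convolution: transform the filters (U), transform
# overlapping input tiles (V), multiply per transform position (M), then
# inverse-transform each tile into an m x m output block.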
alpha = m + r - 1
print 'Winograd(m = %d, r = %d, tile size = %d)' % (m, r, alpha)
alpha_square = alpha * alpha
input_shape = input.shape
filter_shape = filter.shape
out_shape = output_shape(input_shape, filter_shape)
K = filter_shape[0]
C = input_shape[1]
U = np.zeros((K * alpha_square, C))
for k in range(K):
for c in range(C):
u = np.dot(np.dot(G[alpha], filter[k, c, :, :]), G_T[alpha])
for i in range(alpha):
for j in range(alpha) :
U[(i * alpha + j) * K + k, c] = u[i, j]
print 'filter out: ', U.shape
rounded_h = int(math.ceil(out_shape[2] / (m * 1.0)))
rounded_w = int(math.ceil(out_shape[3] / (m * 1.0)))
P = input_shape[0] * rounded_h * rounded_w
V = np.zeros((C * alpha_square, P))
for p in range(P):
for c in range(C):
n = p / (rounded_w * rounded_h)
t = p % (rounded_h * rounded_w)
h_idx = t / rounded_w
w_idx = t % rounded_w
h_start = h_idx * m
w_start = w_idx * m
h_end = min(h_start+alpha, input_shape[2])
w_end = min(w_start+alpha, input_shape[3])
d = np.zeros((alpha, alpha))
d[0:h_end-h_start, 0:w_end-w_start] = \
input[n, c, h_start:h_end, w_start:w_end]
v = np.dot(np.dot(B_T[alpha], d), B[alpha])
for i in range(alpha):
for j in range(alpha):
V[(i*alpha+j)*C + c, p] = v[i, j]
tmp = V.reshape(alpha_square, C, P, 1)
print 'input out: ', tmp.shape
tmp.astype(np.float32).tofile("C")
M = np.zeros((alpha_square * K, P))
for i in range(alpha_square):
u = U[i * K : (i+1) * K, :]
v = V[i * C : (i+1) * C, :]
M[i * K : (i+1) * K, :] = np.dot(u, v)
print 'M shape: ', M.shape
M.astype(np.float32).tofile("gemm")
res = np.zeros((out_shape[0], out_shape[2], out_shape[3], out_shape[1]))
for k in range(K):
for b in range(P):
tm = np.zeros((alpha, alpha))
for i in range(alpha):
for j in range(alpha):
tm[i][j] = M[(i*alpha+j) * K + k, b]
y = np.dot(np.dot(A_T[alpha], tm), A[alpha])
for i in range(m):
for j in range(m):
n = b / (rounded_h * rounded_w)
t = b % (rounded_h * rounded_w)
p = (t / rounded_w) * m + i
q = (t % rounded_w) * m + j
if p >= out_shape[2] or q >= out_shape[3]:
continue
res[n, p, q, k] = y[i, j]
print 'Res shape: ', res.shape
res.astype(np.float32).tofile("res")
return res
alpha = m + r - 1
print 'Winograd(m = %d, r = %d, tile size = %d)' % (m, r, alpha)
alpha_square = alpha * alpha
input_shape = input.shape
filter_shape = filter.shape
out_shape = output_shape(input_shape, filter_shape)
K = filter_shape[0]
C = input_shape[1]
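# Filter transform: u = G . g . G_T per (output channel k, input channel c);
# row (i * alpha + j) * K + k groups one transform position across all K
# output channels.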
U = np.zeros((K * alpha_square, C))
for k in range(K):
for c in range(C):
u = np.dot(np.dot(G[alpha], filter[k, c, :, :]), G_T[alpha])
for i in range(alpha):
for j in range(alpha):
U[(i * alpha + j) * K + k, c] = u[i, j]
print 'filter out: ', U.shape
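# Input transform: cut the input into P = N * ceil(H_out / m) * ceil(W_out / m)
# overlapping alpha x alpha tiles with stride m (zero-padded at the borders)
# and compute v = B_T . d . B for each tile.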
rounded_h = int(math.ceil(out_shape[2] / (m * 1.0)))
rounded_w = int(math.ceil(out_shape[3] / (m * 1.0)))
P = input_shape[0] * rounded_h * rounded_w
V = np.zeros((C * alpha_square, P))
for p in range(P):
for c in range(C):
n = p / (rounded_w * rounded_h)
t = p % (rounded_h * rounded_w)
h_idx = t / rounded_w
w_idx = t % rounded_w
h_start = h_idx * m
w_start = w_idx * m
h_end = min(h_start + alpha, input_shape[2])
w_end = min(w_start + alpha, input_shape[3])
d = np.zeros((alpha, alpha))
d[0:h_end-h_start, 0:w_end-w_start] = \
input[n, c, h_start:h_end, w_start:w_end]
v = np.dot(np.dot(B_T[alpha], d), B[alpha])
for i in range(alpha):
for j in range(alpha):
V[(i * alpha + j) * C + c, p] = v[i, j]
tmp = V.reshape(alpha_square, C, P, 1)
print 'input out: ', tmp.shape
tmp.astype(np.float32).tofile("C")
M = np.zeros((alpha_square * K, P))
for i in range(alpha_square):
u = U[i * K:(i + 1) * K, :]
v = V[i * C:(i + 1) * C, :]
M[i * K:(i + 1) * K, :] = np.dot(u, v)
print 'M shape: ', M.shape
M.astype(np.float32).tofile("gemm")
res = np.zeros((out_shape[0], out_shape[2], out_shape[3], out_shape[1]))
for k in range(K):
for b in range(P):
tm = np.zeros((alpha, alpha))
for i in range(alpha):
for j in range(alpha):
tm[i][j] = M[(i * alpha + j) * K + k, b]
y = np.dot(np.dot(A_T[alpha], tm), A[alpha])
for i in range(m):
for j in range(m):
n = b / (rounded_h * rounded_w)
t = b % (rounded_h * rounded_w)
p = (t / rounded_w) * m + i
q = (t % rounded_w) * m + j
if p >= out_shape[2] or q >= out_shape[3]:
continue
res[n, p, q, k] = y[i, j]
print 'Res shape: ', res.shape
res.astype(np.float32).tofile("res")
return res
def tf_conv(input, filter):
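# Reference result: TensorFlow NHWC VALID convolution with unit strides.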
conv_op = tf.nn.conv2d(input, filter, [1, 1, 1, 1], 'VALID')
with tf.Session() as sess:
res = sess.run(conv_op)
return res
def main():
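# Randomized end-to-end check against tf.nn.conv2d for F(2, 3), F(4, 3) and
# F(6, 3); the input is NHWC for TensorFlow and NCHW (filters OIHW) for
# winograd_conv.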
input = np.random.random([5, 23, 29, 15]).astype(np.float32)
# input = np.fromfile(file="A", dtype=np.float32)
# input = input.reshape(1, 3, 3, 5)
print 'input shape: ', input.shape
# input.tofile("A")
filter = np.random.random([3, 3, 15, 13]).astype(np.float32)
tf_out = tf_conv(input, filter)
input = input.transpose((0, 3, 1, 2))
filter = filter.transpose((3, 2, 0, 1))
print 'filter shape: ', filter.shape
# filter.tofile("filter_in")
for i in [2, 4, 6]:
print "==========f(%d,3)==========" % i
winograd_out = winograd_conv(i, 3, input, filter)
res = np.allclose(tf_out, winograd_out)
if res:
print "=========Pass========="
else:
print "=========Failed======="
print "TF: ", tf_out
print "Winograd: ", winograd_out
if __name__ == '__main__':
main()