Commit 3e82ad67 authored by 李寅

Refactor model converter and transformer

Parent 04f7a34a
......@@ -119,11 +119,11 @@ MaceEngine::Impl::Impl(const NetDef *net_def,
LOG(INFO) << "MACE version: " << MaceVersion();
// Set storage path for internal usage
for (auto input_name : input_nodes) {
ws_->CreateTensor(MakeString("mace_input_node_", input_name, ":0"),
ws_->CreateTensor(MakeString("mace_input_node_", input_name),
GetDeviceAllocator(device_type_), DT_FLOAT);
}
for (auto output_name : output_nodes) {
ws_->CreateTensor(MakeString("mace_output_node_", output_name, ":0"),
ws_->CreateTensor(MakeString("mace_output_node_", output_name),
GetDeviceAllocator(device_type_), DT_FLOAT);
}
#ifdef MACE_ENABLE_HEXAGON
......@@ -182,7 +182,7 @@ MaceStatus MaceEngine::Impl::Run(
"The Inputs' shape must be 4-dimension with NHWC format,"
" please use 1 to fill missing dimensions");
Tensor *input_tensor =
ws_->GetTensor(MakeString("mace_input_node_", input.first, ":0"));
ws_->GetTensor(MakeString("mace_input_node_", input.first));
input_tensor->Resize(input.second.shape());
{
Tensor::MappingGuard input_guard(input_tensor);
......@@ -199,7 +199,7 @@ MaceStatus MaceEngine::Impl::Run(
" please use 1 to fill missing dimensions");
}
Tensor *output_tensor =
ws_->GetTensor(MakeString("mace_output_node_", output.first + ":0"));
ws_->GetTensor(MakeString("mace_output_node_", output.first));
output_tensors.push_back(output_tensor);
}
#ifdef MACE_ENABLE_HEXAGON
......@@ -223,7 +223,7 @@ MaceStatus MaceEngine::Impl::Run(
#endif
for (auto &output : *outputs) {
Tensor *output_tensor =
ws_->GetTensor(MakeString("mace_output_node_", output.first + ":0"));
ws_->GetTensor(MakeString("mace_output_node_", output.first));
// save output
if (output_tensor != nullptr && output.second.data() != nullptr) {
Tensor::MappingGuard output_guard(output_tensor);
......
......@@ -18,20 +18,20 @@ namespace mace {
namespace ops {
void Register_FullyConnected(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FC")
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
FullyConnectedOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FC")
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected")
.Device(DeviceType::GPU)
.TypeConstraint<float>("T")
.Build(),
FullyConnectedOp<DeviceType::GPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FC")
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected")
.Device(DeviceType::GPU)
.TypeConstraint<half>("T")
.Build(),
......
......@@ -37,7 +37,7 @@ void FCBenchmark(
net.AddRandomInput<D, float>("Bias", {out_channel});
if (D == DeviceType::CPU) {
OpDefBuilder("FC", "FullyConnectedTest")
OpDefBuilder("FullyConnected", "FullyConnectedTest")
.Input("Input")
.Input("Weight")
.Input("Bias")
......@@ -52,7 +52,7 @@ void FCBenchmark(
BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("FC", "FullyConnectedTest")
OpDefBuilder("FullyConnected", "FullyConnectedTest")
.Input("InputImage")
.Input("WeightImage")
.Input("BiasImage")
......
......@@ -42,7 +42,7 @@ void Simple(const std::vector<index_t> &input_shape,
if (D == DeviceType::CPU) {
net.Transpose2D<D, float>("Weight", "WeightTranspose");
OpDefBuilder("FC", "FullyConnectedTest")
OpDefBuilder("FullyConnected", "FullyConnectedTest")
.Input("Input")
.Input("Weight")
.Input("Bias")
......@@ -59,7 +59,7 @@ void Simple(const std::vector<index_t> &input_shape,
BufferToImage<D, float>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("FC", "FullyConnectedTest")
OpDefBuilder("FullyConnected", "FullyConnectedTest")
.Input("InputImage")
.Input("WeightImage")
.Input("BiasImage")
......@@ -142,7 +142,7 @@ void Complex(const index_t batch,
"Weight", {out_channel, height * width * channels});
net.AddRandomInput<DeviceType::GPU, float>("Bias", {out_channel});
OpDefBuilder("FC", "FullyConnectedTest")
OpDefBuilder("FullyConnected", "FullyConnectedTest")
.Input("Input")
.Input("Weight")
.Input("Bias")
......@@ -166,7 +166,7 @@ void Complex(const index_t batch,
BufferToImage<DeviceType::GPU, float>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("FC", "FullyConnectedTest")
OpDefBuilder("FullyConnected", "FullyConnectedTest")
.Input("InputImage")
.Input("WeightImage")
.Input("BiasImage")
......@@ -231,7 +231,7 @@ void TestWXFormat(const index_t batch,
"Weight", {out_channel, height * width * channels});
net.AddRandomInput<DeviceType::GPU, float>("Bias", {out_channel});
OpDefBuilder("FC", "FullyConnectedTest")
OpDefBuilder("FullyConnected", "FullyConnectedTest")
.Input("Input")
.Input("Weight")
.Input("Bias")
......@@ -255,7 +255,7 @@ void TestWXFormat(const index_t batch,
BufferToImage<DeviceType::GPU, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("FC", "FullyConnectedTest")
OpDefBuilder("FullyConnected", "FullyConnectedTest")
.Input("InputImage")
.Input("WeightImage")
.Input("BiasImage")
......
......@@ -10,6 +10,7 @@ enum NetMode {
enum DeviceType {
CPU = 0; // By default, CPU is used.
GPU = 2;
HEXAGON = 3;
}
enum DataType {
......
py_library(
name = "tf_converter_lib",
name = "converter_lib",
srcs = [
"convert_util.py",
"graph_util.py",
"tf_converter_lib.py",
"tf_dsp_converter_lib.py",
"converter_tool/base_converter.py",
"converter_tool/shape_inference.py",
"converter_tool/tensorflow_converter.py",
"converter_tool/caffe_converter.py",
"converter_tool/transformer.py",
],
srcs_version = "PY2AND3",
deps = [
":memory_optimizer",
"//mace/proto:mace_py",
],
)
py_library(
name = "caffe_converter_lib",
srcs = [
"caffe_converter_lib.py",
],
srcs_version = "PY2AND3",
deps = [
":memory_optimizer",
"//mace/third_party/caffe:caffe_py",
],
)
......@@ -37,22 +30,21 @@ py_library(
)
py_binary(
name = "converter",
srcs = ["converter.py"],
name = "memory_optimizer",
srcs = ["memory_optimizer.py"],
srcs_version = "PY2AND3",
deps = [
":caffe_converter_lib",
":source_converter_lib",
":tf_converter_lib",
"@six_archive//:six",
"//mace/proto:mace_py",
],
)
py_binary(
name = "memory_optimizer",
srcs = ["memory_optimizer.py"],
name = "converter",
srcs = ["converter.py"],
srcs_version = "PY2AND3",
deps = [
"//mace/proto:mace_py",
":converter_lib",
":source_converter_lib",
"@six_archive//:six",
],
)
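# An illustrative invocation of the converter target (flag names follow
# converter.py further down in this change; values here are only examples):
#   ./bazel-bin/mace/python/tools/converter --platform=caffe \
#       --model_file=model.prototxt --weight_file=model.caffemodel \
#       --input_node=data --input_shape=1,224,224,3 --output_node=prob \
#       --runtime=gpu --data_type=DT_HALF --output_type=source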
# Copyright 2018 Xiaomi, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from mace.proto import mace_pb2
from mace.third_party.caffe import caffe_pb2
from mace.python.tools import memory_optimizer
import google.protobuf.text_format
import numpy as np
import math
pooling_type_mode = {'AvgPool': 1, 'MaxPool': 2}
buffer_type_map = {
'CONV2D_FILTER': 0,
'IN_OUT_CHANNEL': 1,
'ARGUMENT': 2,
'IN_OUT_HEIGHT': 3,
'IN_OUT_WIDTH': 4,
'WINOGRAD_FILTER': 5,
'DW_CONV2D_FILTER': 6,
'WEIGHT_HEIGHT': 7,
'WEIGHT_WIDTH': 8,
}
data_type_map = {'DT_HALF': mace_pb2.DT_HALF, 'DT_FLOAT': mace_pb2.DT_FLOAT}
activation_name_map = {
'ReLU': 'RELU',
'Sigmoid': 'SIGMOID',
'TanH': 'TANH',
}
math_type_mode = {
0: 2, # PROD
1: 0, # SUM
2: 5, # MAX
}
MACE_INPUT_NODE_NAME = "mace_input_node"
MACE_OUTPUT_NODE_NAME = "mace_output_node"
OPENCL_IMAGE_MAX_SIZE = 16384
class Operator(object):
def __init__(self, name, type, layer):
self.name = name
self.type = type
self.layer = layer
self.parents = []
self.children = []
self.data = []
self.output_shape_map = {}
def add_parent(self, parent_op):
self.parents.append(parent_op)
parent_op.children.append(self)
def get_single_parent(self):
if len(self.parents) != 1:
raise Exception('Operation %s expected single parent, but got %s' %
(self.name, len(self.parents)))
return self.parents[0]
def BlobToNPArray(blob):
if blob.num != 0:
return (np.asarray(blob.data, dtype=np.float32).reshape(
(blob.num, blob.channels, blob.height, blob.width)))
else:
return np.asarray(blob.data, dtype=np.float32).reshape(blob.shape.dim)
class Shapes(object):
@staticmethod
def conv_pool_shape(input_shape,
filter_shape,
paddings,
strides,
dilations,
round_func,
input_format='NHWC'):
output_shape = np.zeros_like(input_shape)
output_shape[0] = input_shape[0]
if input_format == 'NHWC':
# input format: NHWC, filter format: HWOI
output_shape[1] = int(
round_func((input_shape[1] + paddings[0] - filter_shape[0] -
(filter_shape[0] - 1) *
(dilations[0] - 1)) / float(strides[0]))) + 1
output_shape[2] = int(
round_func((input_shape[2] + paddings[1] - filter_shape[1] -
(filter_shape[1] - 1) *
(dilations[1] - 1)) / float(strides[1]))) + 1
output_shape[3] = filter_shape[2]
elif input_format == 'NCHW':
# input format: NCHW, filter format: OIHW
output_shape[1] = filter_shape[0]
output_shape[2] = int(
round_func((input_shape[2] + paddings[0] - filter_shape[2] -
(filter_shape[2] - 1) *
(dilations[0] - 1)) / float(strides[0]))) + 1
output_shape[3] = int(
round_func((input_shape[3] + paddings[1] - filter_shape[3] -
(filter_shape[3] - 1) *
(dilations[1] - 1)) / float(strides[1]))) + 1
else:
raise Exception("format %s is not supported" % input_format)
return output_shape
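# Illustrative example (NCHW): input_shape [1, 3, 224, 224], OIHW filter
# [64, 3, 3, 3], paddings [2, 2] (total), strides [2, 2], dilations [1, 1]:
# height = floor((224 + 2 - 3 - (3 - 1) * 0) / 2.0) + 1 = 112, width likewise,
# so output_shape = [1, 64, 112, 112].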
@staticmethod
def fully_connected_shape(input_shape, weight_shape, input_format='NHWC'):
if input_format == 'NHWC':
return [input_shape[0], 1, 1, weight_shape[0]]
elif input_format == 'NCHW':
return [input_shape[0], weight_shape[0], 1, 1]
else:
raise Exception("format %s is not supported" % input_format)
@staticmethod
def concat_shape(input_shapes, axis):
output_shape = None
for input_shape in input_shapes:
if output_shape is None:
output_shape = list(input_shape)
else:
output_shape[axis] += input_shape[axis]
return output_shape
@staticmethod
def slice_shape(input_shape, num_output, input_format='NHWC'):
if input_format == 'NHWC':
return [
input_shape[0], input_shape[1], input_shape[2],
input_shape[3] / num_output
]
elif input_format == 'NCHW':
return [
input_shape[0], input_shape[1] / num_output, input_shape[2],
input_shape[3]
]
else:
raise Exception("format %s is not supported" % input_format)
# Outputs of a multi-output layer are named [op.name + '_' + index]
class CaffeConverter(object):
def __init__(self, caffe_net, weights, net_def, dt, device, winograd):
self.net_def = net_def
self.caffe_net = caffe_net
self.weights = weights
self.dt = dt
self.device = device
self.winograd = winograd
self.resolved_ops = set()
self.ops = []
self.inputs_map = {} # caffe op name -> mace inputs' name
# Add Input operations
top_name_map = {}
inputs = caffe_net.input
for input in inputs:
self.ops.extend([Operator(input, 'Input', None)])
top_name_map[input] = input
layers = caffe_net.layer
# remove train layers and dropout
layers = self.remove_unused_layers(layers)
# Construct graph
# Only single-output layers are supported;
# a layer with a single output often reuses the same top name.
self.ops.extend(
[Operator(layer.name, layer.type, layer) for layer in layers])
self.ops_map = {op.name: op for op in self.ops}
output_op_map = {}
for layer in layers:
op = self.ops_map[layer.name]
for input_name in layer.bottom:
assert input_name != layer.name
parent_op = output_op_map.get(input_name)
if parent_op is None:
parent_op = self.ops_map[input_name]
op.add_parent(parent_op)
if op.name not in self.inputs_map:
self.inputs_map[op.name] = []
self.inputs_map[op.name].extend([top_name_map[input_name]])
for i in range(len(layer.top)):
output_name = layer.top[i]
if len(layer.top) == 1:
top_name_map[output_name] = op.name
else:
top_name_map[output_name] = op.name + '_' + str(i)
if output_name == layer.name:
continue
output_op_map[output_name] = op
# Load weights
weights_layers = weights.layer
for layer in weights_layers:
if not layer.blobs:
continue
if layer.name in self.ops_map:
op = self.ops_map[layer.name]
op.data = [BlobToNPArray(blob) for blob in layer.blobs]
# toposort ops
self.ops = self.toposort_ops()
def CommonConvert(self, op, mace_type):
op_def = mace_pb2.OperatorDef()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
if self.device == 'cpu':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
op_def.name = op.name
op_def.type = mace_type
op_def.input.extend([name + ':0' for name in self.inputs_map[op.name]])
return op_def
def remove_unused_layers(self, layers):
phase_map = {0: 'train', 1: 'test'}
test_layers_names = set()
test_layers = []
for layer in layers:
phase = 'test'
if len(layer.include):
phase = phase_map[layer.include[0].phase]
if len(layer.exclude):
phase = phase_map[layer.exclude[0].phase]
if phase == 'test' and layer.type != 'Dropout':
test_layers.append(layer)
assert layer.name not in test_layers_names
test_layers_names.add(layer.name)
return test_layers
def toposort_ops(self):
sorted_ops = []
temp_visited = set()
visited = set()
def search(op):
if op.name in temp_visited:
raise Exception("The model is not DAG")
if op.name in visited:
return
temp_visited.add(op.name)
for parent_op in op.parents:
search(parent_op)
temp_visited.remove(op.name)
sorted_ops.append(op)
visited.add(op.name)
for op in self.ops:
search(op)
return sorted_ops
def add_buffer_to_image(self, input_name, input_type):
output_name = input_name[:-2] + "_b2i" + input_name[-2:]
op_def = self.net_def.op.add()
op_def.name = output_name[:-2]
op_def.type = 'BufferToImage'
op_def.input.extend([input_name])
op_def.output.extend([output_name])
arg = op_def.arg.add()
arg.name = 'buffer_type'
arg.i = buffer_type_map[input_type]
arg = op_def.arg.add()
arg.name = 'mode'
arg.i = 0
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
return output_name
def add_image_to_buffer(self, input_name, input_type):
output_name = input_name[:-2] + "_i2b" + input_name[-2:]
op_def = self.net_def.op.add()
op_def.name = output_name[:-2]
op_def.type = 'ImageToBuffer'
op_def.input.extend([input_name])
op_def.output.extend([output_name])
arg = op_def.arg.add()
arg.name = 'buffer_type'
arg.i = buffer_type_map[input_type]
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
return output_name
def add_gpu_input_transform(self, names):
for name in names:
new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
op_def.name = name
op_def.type = 'BufferToImage'
op_def.input.extend([new_input_name])
op_def.output.extend([name + ':0'])
epsilon_arg = op_def.arg.add()
epsilon_arg.name = 'buffer_type'
epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL']
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
input_op = self.ops_map[name]
if input_op.layer is not None:
output_shape = input_op.output_shape_map[input_op.layer.top[0]]
else:
output_shape = input_op.output_shape_map[input_op.name]
self.add_output_shape(op_def, output_shape)
def add_gpu_output_transform(self, names):
for name in names:
output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
op_def.name = output_name[:-2]
op_def.type = 'ImageToBuffer'
op_def.input.extend([name + ':0'])
op_def.output.extend([output_name])
epsilon_arg = op_def.arg.add()
epsilon_arg.name = 'buffer_type'
epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL']
def add_tensor(self, name, value):
tensor = self.net_def.tensors.add()
tensor.name = name
shape = list(value.shape)
tensor.dims.extend(shape)
tensor.data_type = mace_pb2.DT_FLOAT
tensor.float_data.extend(value.flat)
@staticmethod
def add_output_shape(op_def, output_shape):
mace_output_shape = mace_pb2.OutputShape()
mace_output_shape.dims.extend(output_shape)
op_def.output_shape.extend([mace_output_shape])
def add_stride_pad_kernel_arg(self, param, op_def):
try:
if len(param.stride) > 1 or len(param.kernel_size) > 1 or len(
param.pad) > 1:
raise Exception(
'Mace does not support multiple stride/kernel_size/pad')
stride = [param.stride[0],
param.stride[0]] if len(param.stride) else [1, 1]
pad = [param.pad[0] * 2,
param.pad[0] * 2] if len(param.pad) else [0, 0]
kernel = [param.kernel_size[0], param.kernel_size[0]] if len(
param.kernel_size) else [0, 0]
except TypeError:
stride = [param.stride, param.stride]
pad = [param.pad * 2, param.pad * 2]
kernel = [param.kernel_size, param.kernel_size]
if param.HasField("stride_h") or param.HasField("stride_w"):
stride = [param.stride_h, param.stride_w]
# Pad
if param.HasField("pad_h") or param.HasField("pad_w"):
pad = [param.pad_h * 2, param.pad_w * 2]
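# Note: Caffe's per-side pad is stored doubled here, i.e. as the total
# padding over both sides, which Shapes.conv_pool_shape adds exactly once.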
if op_def is not None:
strides_arg = op_def.arg.add()
strides_arg.name = 'strides'
strides_arg.ints.extend(stride)
padding_arg = op_def.arg.add()
padding_arg.name = 'padding_values'
padding_arg.ints.extend(pad)
if op_def.type == 'Pooling':
if param.HasField("kernel_h") or param.HasField("kernel_w"):
kernel = [param.kernel_h, param.kernel_w]
return pad, stride, kernel
def convert_conv2d(self, op):
use_winograd = False
if self.device == 'cpu':
use_winograd = self.check_winograd_conv(op)
param = op.layer.convolution_param
is_depthwise = False
if param.HasField('group'):
if param.group == op.data[0].shape[0] and op.data[0].shape[1] == 1:
is_depthwise = True
else:
raise Exception("Mace do not support group convolution yet")
if is_depthwise:
op_def = self.CommonConvert(op, 'DepthwiseConv2d')
else:
op_def = self.CommonConvert(op, 'Conv2D')
# Add filter
weight_tensor_name = op.name + '_weight:0'
if self.device == 'cpu':
weight_data = op.data[0]
else:
# OIHW -> HWOI
weight_data = op.data[0].transpose((2, 3, 0, 1))
if use_winograd:
self.convert_winograd_conv_filter_cpu(op, op_def)
elif self.device == 'gpu':
self.add_tensor(weight_tensor_name, weight_data)
buffer_type = "DW_CONV2D_FILTER" \
if is_depthwise else "CONV2D_FILTER"
output_name = self.add_buffer_to_image(weight_tensor_name,
buffer_type)
op_def.input.extend([output_name])
else:
self.add_tensor(weight_tensor_name, weight_data)
op_def.input.extend([weight_tensor_name])
# Add Bias
if len(op.data) == 2:
bias_tensor_name = op.name + '_bias:0'
bias_data = op.data[1].reshape(-1)
self.add_tensor(bias_tensor_name, bias_data)
if self.device == 'gpu':
output_name = self.add_buffer_to_image(bias_tensor_name,
"ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([bias_tensor_name])
paddings, strides, _ = self.add_stride_pad_kernel_arg(param, op_def)
dilations = [1, 1]
if len(param.dilation) > 0:
dilation_arg = op_def.arg.add()
dilation_arg.name = 'dilations'
if len(param.dilation) == 1:
dilations = [param.dilation[0], param.dilation[0]]
elif len(param.dilation) == 2:
dilations = [param.dilation[0], param.dilation[1]]
dilation_arg.ints.extend(dilations)
final_op = op
self.resolved_ops.add(op.name)
input_format = 'NCHW' if self.device == 'cpu' else 'NHWC'
output_shape = Shapes.conv_pool_shape(
op.get_single_parent().output_shape_map[op.layer.bottom[0]],
weight_data.shape, paddings, strides, dilations, math.floor,
input_format)
op.output_shape_map[op.layer.top[0]] = output_shape
if len(self.ops_map[final_op.name].children) == 1 and \
self.ops_map[final_op.name].children[0].type \
in activation_name_map:
activation_op = self.ops_map[final_op.name].children[0]
fused_act_arg = op_def.arg.add()
fused_act_arg.name = 'activation'
fused_act_arg.s = activation_name_map[activation_op.type]
final_op = activation_op
final_op.output_shape_map[final_op.layer.top[0]] = output_shape
self.resolved_ops.add(activation_op.name)
op_def.output.extend([final_op.name + ':0'])
self.add_output_shape(op_def, output_shape)
self.net_def.op.extend([op_def])
def check_winograd_conv(self, op):
param = op.layer.convolution_param
filter_shape = np.asarray(op.data[0].shape)
if self.device != 'cpu':
filter_shape = filter_shape[[2, 3, 0, 1]] # OIHW -> HWOI
paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None)
if param.HasField('group'):
if param.group == op.data[0].shape[0] and op.data[0].shape[1] == 1:
return False # Winograd is not supported for depthwise conv
else:
raise Exception("Mace do not support group convolution yet")
dilations = [1, 1]
if len(param.dilation) > 0:
if len(param.dilation) == 1:
dilations = [param.dilation[0], param.dilation[0]]
elif len(param.dilation) == 2:
dilations = [param.dilation[0], param.dilation[1]]
input_format = 'NCHW' if self.device == 'cpu' else 'NHWC'
output_shape = Shapes.conv_pool_shape(
op.get_single_parent().output_shape_map[op.layer.bottom[0]],
filter_shape, paddings, strides, dilations, math.floor,
input_format)
if self.winograd and dilations[0] == 1 and \
(dilations[0] == dilations[1]) and \
(strides[0] == 1) and (strides[0] == strides[1]):
if self.device == 'gpu':
width = output_shape[0] * ((output_shape[1] + 1) / 2) * \
((output_shape[2] + 1) / 2)
return filter_shape[0] == 3 and \
filter_shape[0] == filter_shape[1] and \
(16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \
(16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \
(width < OPENCL_IMAGE_MAX_SIZE)
elif self.device == 'cpu':
return filter_shape[2] == 3 and \
filter_shape[2] == filter_shape[3] and \
filter_shape[0] >= 8 and filter_shape[1] >= 8
return False
def convert_winograd_conv_filter_cpu(self, op, op_def):
# Add filter
weight_tensor_name = op.name + '_weight:0'
weight_data = op.data[0] # OIHW
input_shape = op.get_single_parent().output_shape_map[
op.layer.bottom[0]]
if input_shape[2] > 16 and input_shape[3] > 16:
G = np.array([
[1.0, 0.0, 0.0],
[-2.0 / 9, -2.0 / 9, -2.0 / 9],
[-2.0 / 9, 2.0 / 9, -2.0 / 9],
[1.0 / 90, 1.0 / 45, 2.0 / 45],
[1.0 / 90, -1.0 / 45, 2.0 / 45],
[1.0 / 45, 1.0 / 90, 1.0 / 180],
[1.0 / 45, -1.0 / 90, 1.0 / 180],
[0.0, 0.0, 1.0]
], dtype=np.float32)
new_shape = [64, weight_data.shape[0], weight_data.shape[1]] # TOC
else:
G = np.array([
[1.0, 0.0, 0.0],
[0.5, 0.5, 0.5],
[0.5, -0.5, 0.5],
[0.0, 0.0, 1.0],
], dtype=np.float32)
new_shape = [16, weight_data.shape[0], weight_data.shape[1]] # TOC
new_weight_value = G.dot(weight_data).dot(G.T) # [8, O, I, 8]
new_weight_value = new_weight_value.transpose(0, 3, 1, 2)
new_weight_value = new_weight_value.reshape(new_shape)
self.add_tensor(weight_tensor_name, new_weight_value)
op_def.input.extend([weight_tensor_name])
winograd_transformed_arg = op_def.arg.add()
winograd_transformed_arg.name = 'is_filter_transformed'
winograd_transformed_arg.i = 1
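# Illustrative note: the 8x3 G above is the standard Winograd F(6x6, 3x3)
# filter-transform matrix (8 * 8 = 64 transformed elements per filter), the
# 4x3 G the F(2x2, 3x3) one (4 * 4 = 16); the result is stored as [T, O, I].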
def convert_winograd_conv_gpu(self, op):
# Add filter
weight_tensor_name = op.name + '_weight:0'
self.add_tensor(weight_tensor_name, op.data[0])
buffer_type = "WINOGRAD_FILTER"
filter_name = self.add_buffer_to_image(weight_tensor_name, buffer_type)
param = op.layer.convolution_param
paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None)
filter_shape = np.asarray(op.data[0].shape)
filter_shape = filter_shape[[2, 3, 0, 1]] # OIHW -> HWOI
input_format = 'NHWC'
output_shape = Shapes.conv_pool_shape(
op.get_single_parent().output_shape_map[op.layer.bottom[0]],
filter_shape, paddings, strides, [1, 1], math.floor, input_format)
# Input transform
wt_op = mace_pb2.OperatorDef()
arg = wt_op.arg.add()
arg.name = 'T'
arg.i = self.dt
padding_arg = wt_op.arg.add()
padding_arg.name = 'padding_values'
padding_arg.ints.extend(paddings)
wt_op.name = op.name + '_input_transform'
wt_op.type = 'WinogradTransform'
wt_op.input.extend([name + ':0' for name in self.inputs_map[op.name]])
wt_output_name = wt_op.name + ":0"
wt_op.output.extend([wt_output_name])
wt_output_shape = mace_pb2.OutputShape()
wt_output_width = output_shape[0] * ((
output_shape[1] + 1) / 2) * ((output_shape[2] + 1) / 2)
wt_output_shape.dims.extend(
[16, filter_shape[3], wt_output_width, 1])
wt_op.output_shape.extend([wt_output_shape])
# MatMul
matmul_op = mace_pb2.OperatorDef()
arg = matmul_op.arg.add()
arg.name = 'T'
arg.i = self.dt
matmul_op.name = op.name + '_matmul'
matmul_op.type = 'MatMul'
matmul_op.input.extend([filter_name, wt_output_name])
matmul_output_name = matmul_op.name + ":0"
matmul_op.output.extend([matmul_output_name])
matmul_output_shape = mace_pb2.OutputShape()
matmul_output_shape.dims.extend(
[16, filter_shape[2], wt_output_width, 1])
matmul_op.output_shape.extend([matmul_output_shape])
# Inverse transform
iwt_op = mace_pb2.OperatorDef()
arg = iwt_op.arg.add()
arg.name = 'T'
arg.i = self.dt
batch_arg = iwt_op.arg.add()
batch_arg.name = 'batch'
batch_arg.i = output_shape[0]
height_arg = iwt_op.arg.add()
height_arg.name = 'height'
height_arg.i = output_shape[1]
width_arg = iwt_op.arg.add()
width_arg.name = 'width'
width_arg.i = output_shape[2]
iwt_op.name = op.name + '_inverse_transform'
iwt_op.type = 'WinogradInverseTransform'
iwt_op.input.extend([matmul_output_name])
# Add Bias
if len(op.data) == 2:
bias_tensor_name = op.name + '_bias:0'
bias_data = op.data[1].reshape(-1)
self.add_tensor(bias_tensor_name, bias_data)
output_name = self.add_buffer_to_image(bias_tensor_name,
"ARGUMENT")
iwt_op.input.extend([output_name])
final_op = op
final_op.output_shape_map[final_op.layer.top[0]] = output_shape
self.resolved_ops.add(op.name)
if len(self.ops_map[final_op.name].children) == 1 and \
self.ops_map[final_op.name].children[0].type \
in activation_name_map:
activation_op = self.ops_map[final_op.name].children[0]
fused_act_arg = iwt_op.arg.add()
fused_act_arg.name = 'activation'
fused_act_arg.s = activation_name_map[activation_op.type]
final_op = activation_op
final_op.output_shape_map[final_op.layer.top[0]] = output_shape
self.resolved_ops.add(activation_op.name)
iwt_op.output.extend([final_op.name + ':0'])
self.add_output_shape(iwt_op, output_shape)
self.net_def.op.extend([wt_op, matmul_op, iwt_op])
def convert_batchnorm(self, op):
if len(op.children) != 1 or op.children[0].type != 'Scale':
raise Exception('Only BatchNorm+Scale is supported for now')
op_def = self.CommonConvert(op, 'FoldedBatchNorm')
scale_op = op.children[0]
epsilon_value = op.layer.batch_norm_param.eps
if op.data[2][0] != 0:
mean_value = (1. / op.data[2][0]) * op.data[0]
var_value = (1. / op.data[2][0]) * op.data[1]
else:
raise RuntimeError('scalar is zero.')
gamma_value = scale_op.data[0]
beta_value = np.zeros_like(mean_value)
if len(scale_op.data) == 2:
beta_value = scale_op.data[1]
scale_value = ((
1.0 / np.vectorize(math.sqrt)(var_value + epsilon_value)) *
gamma_value).reshape(-1)
offset_value = ((-mean_value * scale_value) + beta_value).reshape(-1)
input_names = [op.name + '_scale:0', op.name + '_offset:0']
self.add_tensor(input_names[0], scale_value)
self.add_tensor(input_names[1], offset_value)
if self.device == 'gpu':
for name in input_names:
output_name = self.add_buffer_to_image(name, "ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([name for name in input_names])
self.resolved_ops.add(op.name)
self.resolved_ops.add(scale_op.name)
final_op = scale_op
output_shape = op.get_single_parent().output_shape_map[op.layer.bottom[
0]]
if len(self.ops_map[final_op.name].children) == 1 and \
self.ops_map[final_op.name].children[0].type \
in activation_name_map:
activation_op = self.ops_map[final_op.name].children[0]
fused_act_arg = op_def.arg.add()
fused_act_arg.name = 'activation'
fused_act_arg.s = activation_name_map[activation_op.type]
final_op = activation_op
final_op.output_shape_map[final_op.layer.top[0]] = output_shape
self.resolved_ops.add(activation_op.name)
op_def.output.extend([final_op.name + ':0'])
self.add_output_shape(op_def, output_shape)
self.net_def.op.extend([op_def])
def convert_inner_product(self, op):
param = op.layer.inner_product_param
try:
if param.axis != 1 or param.transpose:
raise ValueError(
'Do not support non-default axis and transpose '
'case for inner product')
except AttributeError:
pass
op_def = self.CommonConvert(op, 'FC')
weight_tensor_name = op.name + '_weight:0'
if op.data[0].ndim not in [2, 4]:
raise ValueError('Unexpected weight ndim.')
if op.data[0].ndim == 4 and list(op.data[0].shape[:2]) != [1, 1]:
raise ValueError(
'Only support 4D weight with shape [1, 1, *, *]')
input_shape = op.get_single_parent().output_shape_map[op.layer.bottom[
0]]
weight_data = op.data[0].reshape(-1, op.data[0].shape[-1])
assert weight_data.shape[1] == (
input_shape[1] * input_shape[2] * input_shape[3])
if self.device != 'cpu':
weight_data = weight_data.reshape(-1, input_shape[3],
input_shape[1], input_shape[2])
weight_data = weight_data.transpose((0, 2, 3, 1)).reshape(
weight_data.shape[0], -1)
self.add_tensor(weight_tensor_name, weight_data)
if self.device == 'gpu':
if (weight_data.shape[0] + 3) / 4 > OPENCL_IMAGE_MAX_SIZE and \
(weight_data.shape[1] + 3) / 4 > OPENCL_IMAGE_MAX_SIZE:
raise Exception(
'Mace gpu does not support FC with weight shape: ' +
str(weight_data.shape))
if input_shape[3] % 4 == 0:
buffer_type = "WEIGHT_WIDTH"
else:
buffer_type = "WEIGHT_HEIGHT"
weight_type_arg = op_def.arg.add()
weight_type_arg.name = 'weight_type'
weight_type_arg.i = buffer_type_map['WEIGHT_HEIGHT']
if buffer_type == "WEIGHT_HEIGHT" and \
(weight_data.shape[0] + 3) / 4 > OPENCL_IMAGE_MAX_SIZE:
raise Exception(
'Mace gpu does not support FC with weight shape: ' +
str(weight_data.shape))
output_name = self.add_buffer_to_image(weight_tensor_name,
buffer_type)
op_def.input.extend([output_name])
else:
op_def.input.extend([weight_tensor_name])
# Add Bias
if len(op.data) == 2:
bias_tensor_name = op.name + '_bias:0'
bias_data = op.data[1].reshape(-1)
self.add_tensor(bias_tensor_name, bias_data)
if self.device == 'gpu':
output_name = self.add_buffer_to_image(bias_tensor_name,
"ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([bias_tensor_name])
self.resolved_ops.add(op.name)
input_format = 'NCHW' if self.device == 'cpu' else 'NHWC'
output_shape = Shapes.fully_connected_shape(input_shape,
weight_data.shape,
input_format)
op.output_shape_map[op.layer.top[0]] = output_shape
final_op = op
if len(self.ops_map[final_op.name].children) == 1 \
and self.ops_map[final_op.name].children[0].type \
in activation_name_map:
activation_op = self.ops_map[final_op.name].children[0]
fused_act_arg = op_def.arg.add()
fused_act_arg.name = 'activation'
fused_act_arg.s = activation_name_map[activation_op.type]
final_op = activation_op
final_op.output_shape_map[final_op.layer.top[0]] = output_shape
self.resolved_ops.add(activation_op.name)
op_def.output.extend([final_op.name + ':0'])
self.add_output_shape(op_def, output_shape)
self.net_def.op.extend([op_def])
def convert_pooling(self, op):
op_def = self.CommonConvert(op, 'Pooling')
param = op.layer.pooling_param
paddings, strides, kernels = self.add_stride_pad_kernel_arg(
param, op_def)
if param.pool == caffe_pb2.PoolingParameter.MAX:
pooling_type = "MaxPool"
elif param.pool == caffe_pb2.PoolingParameter.AVE:
pooling_type = "AvgPool"
pooling_type_arg = op_def.arg.add()
pooling_type_arg.name = 'pooling_type'
pooling_type_arg.i = pooling_type_mode[pooling_type]
input_shape = op.get_single_parent().output_shape_map[op.layer.bottom[
0]]
if param.HasField('global_pooling') and param.global_pooling:
kernels = [input_shape[2], input_shape[3]] \
if self.device == 'cpu' else \
[input_shape[1], input_shape[2]]
kernel_arg = op_def.arg.add()
kernel_arg.name = 'kernels'
kernel_arg.ints.extend(kernels)
if self.device != 'cpu':
filter_shape = [
kernels[0], kernels[1], input_shape[3], input_shape[3]
]
else:
filter_shape = [
input_shape[1], input_shape[1], kernels[0], kernels[1]
]
input_format = 'NCHW' if self.device == 'cpu' else 'NHWC'
output_shape = Shapes.conv_pool_shape(input_shape, filter_shape,
paddings, strides, [1, 1],
math.ceil, input_format)
op.output_shape_map[op.layer.top[0]] = output_shape
op_def.output.extend([op.name + ':0'])
self.add_output_shape(op_def, output_shape)
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_activation(self, op):
op_def = self.CommonConvert(op, 'Activation')
activation_arg = op_def.arg.add()
activation_arg.name = 'activation'
activation_arg.s = activation_name_map[op.type]
op_def.output.extend([op.name + ':0'])
output_shape = op.get_single_parent().output_shape_map[op.layer.bottom[
0]]
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_prelu(self, op):
op_def = self.CommonConvert(op, 'Activation')
activation_arg = op_def.arg.add()
activation_arg.name = 'activation'
activation_arg.s = 'PRELU'
alpha_tensor_name = op.name + '_alpha:0'
alpha_data = op.data[0].reshape(-1)
self.add_tensor(alpha_tensor_name, alpha_data)
if self.device == 'gpu':
output_name = self.add_buffer_to_image(alpha_tensor_name,
"ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([alpha_tensor_name])
op_def.output.extend([op.name + ':0'])
output_shape = op.get_single_parent().output_shape_map[op.layer.bottom[
0]]
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_add(self, op):
op_def = self.CommonConvert(op, 'AddN')
op_def.output.extend([op.name + ':0'])
output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_concat(self, op):
op_def = self.CommonConvert(op, 'Concat')
axis_arg = op_def.arg.add()
axis_arg.name = 'axis'
axis_arg.i = 3 if self.device != 'cpu' else 1
try:
if op.layer.concat_param.HasField('axis'):
axis_arg.i = op.layer.concat_param.axis
elif op.layer.concat_param.HasField('concat_dim'):
axis_arg.i = op.layer.concat_param.concat_dim
except AttributeError:
pass
input_shapes = []
for i in range(len(op.parents)):
input_shapes.append(
op.parents[i].output_shape_map[op.layer.bottom[i]])
output_shape = Shapes.concat_shape(input_shapes, axis_arg.i)
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
op_def.output.extend([op.name + ':0'])
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_eltwise(self, op):
op_def = self.CommonConvert(op, 'Eltwise')
param = op.layer.eltwise_param
type_arg = op_def.arg.add()
type_arg.name = 'type'
type_arg.i = math_type_mode[param.operation]
if len(param.coeff) > 0:
coeff_arg = op_def.arg.add()
coeff_arg.name = 'coeff'
coeff_arg.floats.extend(list(param.coeff))
output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
op_def.output.extend([op.name + ':0'])
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_slice(self, op):
op_def = self.CommonConvert(op, 'Slice')
if op.layer.HasField('slice_param'):
param = op.layer.slice_param
if param.HasField('axis') and param.axis != 1:
raise Exception(
'Mace does not support slice with axis ' + str(param.axis))
if len(param.slice_point) > 0:
raise Exception('Mace does not support slice with slice_point')
axis_arg = op_def.arg.add()
axis_arg.name = 'axis'
axis_arg.i = 3 if self.device != 'cpu' else 1
input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
num_outputs = len(op.layer.top)
input_channels = input_shape[axis_arg.i]
if (input_channels % num_outputs) != 0 or \
(self.device == 'gpu' and
((input_channels / num_outputs) % 4 != 0)):
raise Exception(
'Mace does not support slice with input shape ' +
str(input_shape) + ' and number of outputs ' + str(num_outputs))
input_format = 'NCHW' if self.device == 'cpu' else 'NHWC'
output_shape = Shapes.slice_shape(input_shape, num_outputs,
input_format)
for i in range(len(op.layer.top)):
op.output_shape_map[op.layer.top[i]] = output_shape
self.add_output_shape(op_def, output_shape)
op_def.output.extend([op.name + '_' + str(i) + ':0'])
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_normal_op(self, op):
op_def = self.CommonConvert(op, op.type)
output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
op_def.output.extend([op.name + ':0'])
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_reshape(self, op):
op_def = self.CommonConvert(op, 'Reshape')
input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
output_shape = input_shape
shape_param = np.asarray(op.layer.reshape_param.shape.dim)
for i in range(len(shape_param)):
if shape_param[i] != 0:
output_shape[i] = shape_param[i]
shape_arg = op_def.arg.add()
shape_arg.name = 'shape'
shape_arg.ints.extend(output_shape)
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
op_def.output.extend([op.name + ':0'])
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_proposal_op(self, op):
assert self.device == 'cpu'
op_def = self.CommonConvert(op, op.type)
if op.layer.HasField('proposal_param'):
proposal_param = op.layer.proposal_param
feat_stride_arg = op_def.arg.add()
feat_stride_arg.name = 'feat_stride'
feat_stride_arg.i = proposal_param.feat_stride
scales_arg = op_def.arg.add()
scales_arg.name = 'scales'
scales_arg.ints.extend(list(proposal_param.scales))
ratios_arg = op_def.arg.add()
ratios_arg.name = 'ratios'
ratios_arg.floats.extend(list(proposal_param.ratios))
output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
op_def.output.extend([op.name + ':0'])
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def convert_psroi_align(self, op):
assert self.device == 'cpu'
op_def = self.CommonConvert(op, op.type)
if op.layer.HasField('psroi_align_param'):
psroi_align_param = op.layer.psroi_align_param
spatial_scale_arg = op_def.arg.add()
spatial_scale_arg.name = 'spatial_scale'
spatial_scale_arg.f = psroi_align_param.spatial_scale
output_dim_arg = op_def.arg.add()
output_dim_arg.name = 'output_dim'
output_dim_arg.i = psroi_align_param.output_dim
group_size_arg = op_def.arg.add()
group_size_arg.name = 'group_size'
group_size_arg.i = psroi_align_param.group_size
output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
op.output_shape_map[op.layer.top[0]] = output_shape
self.add_output_shape(op_def, output_shape)
op_def.output.extend([op.name + ':0'])
self.net_def.op.extend([op_def])
self.resolved_ops.add(op.name)
def replace_in_out_name(self, input_names, output_names):
in_names = set([input_name + ":0" for input_name in input_names])
out_names = set([output_name + ":0" for output_name in output_names])
for op in self.net_def.op:
for i in range(len(op.input)):
if op.input[i] in in_names:
op.input[i] = MACE_INPUT_NODE_NAME + '_' + op.input[i]
if op.input[i] in out_names:
op.input[i] = MACE_OUTPUT_NODE_NAME + '_' + op.input[i]
for i in range(len(op.output)):
if op.output[i] in in_names:
op.output[i] = MACE_INPUT_NODE_NAME + '_' + op.output[i]
if op.output[i] in out_names:
op.output[i] = MACE_OUTPUT_NODE_NAME + '_' + op.output[i]
def add_input_op_shape(self, input_nodes, input_shapes):
assert len(input_nodes) == len(input_shapes)
for i in range(len(input_nodes)):
input_op = self.ops_map[input_nodes[i]]
input_shape = input_shapes[i] if self.device != 'cpu' else \
[input_shapes[i][0], input_shapes[i][3],
input_shapes[i][1], input_shapes[i][2]]
if input_op.layer is not None:
input_op.output_shape_map[input_op.layer.top[0]] = input_shape
else:
input_op.output_shape_map[input_op.name] = input_shape
def add_cpu_input_transform(self, names):
for name in names:
new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
op_def.name = name
op_def.type = 'Transpose'
op_def.input.extend([new_input_name])
op_def.output.extend([name + ':0'])
dims_arg = op_def.arg.add()
dims_arg.name = 'dims'
dims_arg.ints.extend([0, 3, 1, 2]) # NHWC -> NCHW
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
input_op = self.ops_map[name]
if input_op.layer is not None:
output_shape = input_op.output_shape_map[input_op.layer.top[0]]
else:
output_shape = input_op.output_shape_map[input_op.name]
self.add_output_shape(op_def, output_shape)
def add_cpu_output_transform(self, names):
for name in names:
output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
op_def.name = output_name[:-2]
op_def.type = 'Transpose'
op_def.input.extend([name + ':0'])
op_def.output.extend([output_name])
dims_arg = op_def.arg.add()
dims_arg.name = 'dims'
dims_arg.ints.extend([0, 2, 3, 1]) # NCHW -> NHWC
input_op = self.ops_map[name]
if input_op.layer is not None:
output_shape = input_op.output_shape_map[input_op.layer.top[0]]
else:
output_shape = input_op.output_shape_map[input_op.name]
self.add_output_shape(op_def,
[output_shape[0], output_shape[2],
output_shape[3], output_shape[1]])
def convert(self, input_nodes, input_shapes, output_nodes):
assert self.ops[0].type == 'Input'
self.add_input_op_shape(input_nodes, input_shapes)
if self.device == 'gpu':
self.add_gpu_input_transform(input_nodes)
if self.device == 'cpu':
self.add_cpu_input_transform(input_nodes)
for op in self.ops:
if op.name in self.resolved_ops:
continue
if op.type == 'Input':
self.resolved_ops.add(op.name)
elif op.type == 'Convolution':
if self.device == 'gpu' and self.check_winograd_conv(op):
self.convert_winograd_conv_gpu(op)
else:
self.convert_conv2d(op)
elif op.type == 'BatchNorm':
self.convert_batchnorm(op)
elif op.type == 'InnerProduct':
self.convert_inner_product(op)
elif op.type == 'Pooling':
self.convert_pooling(op)
elif op.type == 'PReLU':
self.convert_prelu(op)
elif op.type in ['ReLU', 'Sigmoid', 'TanH']:
self.convert_activation(op)
elif op.type == 'Add':
self.convert_add(op)
elif op.type == 'Concat':
self.convert_concat(op)
elif op.type == 'Eltwise':
self.convert_eltwise(op)
elif op.type == 'Slice':
self.convert_slice(op)
elif op.type == 'Reshape':
self.convert_reshape(op)
elif op.type == 'Proposal':
self.convert_proposal_op(op)
elif op.type == 'PSROIAlign':
self.convert_psroi_align(op)
elif op.type in ['Softmax']:
self.convert_normal_op(op)
else:
raise Exception('Unknown Op: %s, type: %s' % (op.name,
op.type))
if self.device == 'gpu':
self.add_gpu_output_transform(output_nodes)
if self.device == 'cpu':
self.add_cpu_output_transform(output_nodes)
for op in self.ops:
if op.name not in self.resolved_ops:
print 'Unresolved Op: %s with type %s' % (op.name, op.type)
def convert_to_mace_pb(model_file, weight_file, input_node_str,
input_shape_str, output_node_str, data_type, device,
winograd):
net_def = mace_pb2.NetDef()
dt = data_type_map[data_type]
caffe_net = caffe_pb2.NetParameter()
with open(model_file, "r") as f:
google.protobuf.text_format.Merge(str(f.read()), caffe_net)
weights = caffe_pb2.NetParameter()
with open(weight_file, "rb") as f:
weights.MergeFromString(f.read())
input_nodes = [x for x in input_node_str.split(',')]
input_shapes = []
if input_shape_str != "":
input_shape_strs = [x for x in input_shape_str.split(':')]
for shape_str in input_shape_strs:
input_shapes.extend([[int(x) for x in shape_str.split(',')]])
output_nodes = [x for x in output_node_str.split(',')]
assert len(input_nodes) == len(input_shapes)
converter = CaffeConverter(caffe_net, weights, net_def, dt, device,
winograd)
converter.convert(input_nodes, input_shapes, output_nodes)
print "PB Converted."
if device == 'gpu':
print "start optimize memory."
memory_optimizer.optimize_gpu_memory(net_def)
print "Memory optimization done."
elif device == 'cpu':
print "start optimize memory."
memory_optimizer.optimize_cpu_memory(net_def)
print "Memory optimization done."
return net_def
......@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import tensorflow as tf
from mace.proto import mace_pb2
......@@ -40,3 +41,8 @@ def tf_dtype_2_mace_dtype(tf_dtype):
if not mace_dtype:
raise Exception("Not supported tensorflow dtype: " + tf_dtype)
return mace_dtype
def mace_check(condition, msg):
if not condition:
raise Exception(msg)
......@@ -16,7 +16,16 @@ import argparse
import sys
import hashlib
import os.path
from mace.proto import mace_pb2
from mace.python.tools import tf_dsp_converter_lib
from mace.python.tools import memory_optimizer
from mace.python.tools import source_converter_lib
from mace.python.tools.converter_tool import base_converter as cvt
from mace.python.tools.converter_tool import tensorflow_converter
from mace.python.tools.converter_tool import caffe_converter
from mace.python.tools.converter_tool import transformer
# ./bazel-bin/mace/python/tools/tf_converter --model_file quantized_test.pb \
# --output quantized_test_dsp.pb \
......@@ -25,6 +34,12 @@ from mace.python.tools import source_converter_lib
FLAGS = None
data_type_map = {'DT_HALF': mace_pb2.DT_HALF,
'DT_FLOAT': mace_pb2.DT_FLOAT}
device_type_map = {'cpu': mace_pb2.CPU,
'gpu': mace_pb2.GPU,
'dsp': mace_pb2.HEXAGON}
def file_checksum(fname):
hash_func = hashlib.sha256()
......@@ -34,6 +49,10 @@ def file_checksum(fname):
return hash_func.hexdigest()
def parse_int_array_from_str(ints_str):
return [int(int_str) for int_str in ints_str.split(',')]
def main(unused_args):
if not os.path.isfile(FLAGS.model_file):
print("Input graph file '" + FLAGS.model_file + "' does not exist!")
......@@ -59,27 +78,64 @@ def main(unused_args):
(weight_checksum, FLAGS.weight_checksum))
sys.exit(-1)
if FLAGS.runtime == 'dsp':
print("DSP not support caffe model yet.")
sys.exit(-1)
if FLAGS.platform not in ['tensorflow', 'caffe']:
print ("platform %s is not supported." % FLAGS.platform)
sys.exit(-1)
if FLAGS.runtime not in ['cpu', 'gpu', 'dsp']:
print ("runtime %s is not supported." % FLAGS.runtime)
sys.exit(-1)
from mace.python.tools import caffe_converter_lib
output_graph_def = caffe_converter_lib.convert_to_mace_pb(
FLAGS.model_file, FLAGS.weight_file, FLAGS.input_node,
FLAGS.input_shape, FLAGS.output_node, FLAGS.data_type,
FLAGS.runtime, FLAGS.winograd)
elif FLAGS.platform == 'tensorflow':
if FLAGS.runtime == 'dsp':
from mace.python.tools import tf_dsp_converter_lib
if FLAGS.runtime == 'dsp':
if FLAGS.platform == 'tensorflow':
output_graph_def = tf_dsp_converter_lib.convert_to_mace_pb(
FLAGS.model_file, FLAGS.input_node, FLAGS.output_node,
FLAGS.dsp_mode)
else:
from mace.python.tools import tf_converter_lib
output_graph_def = tf_converter_lib.convert_to_mace_pb(
FLAGS.model_file, FLAGS.input_node, FLAGS.input_shape,
FLAGS.output_node, FLAGS.data_type, FLAGS.runtime,
FLAGS.winograd)
print("%s does not support dsp runtime yet." % FLAGS.platform)
sys.exit(-1)
else:
option = cvt.ConverterOption()
option.data_type = data_type_map[FLAGS.data_type]
option.device = device_type_map[FLAGS.runtime]
option.winograd_enabled = bool(FLAGS.winograd)
input_node_names = FLAGS.input_node.split(',')
input_node_shapes = FLAGS.input_shape.split(':')
if len(input_node_names) != len(input_node_shapes):
raise Exception('input node count and shape count do not match.')
for i in xrange(len(input_node_names)):
input_node = cvt.NodeInfo()
input_node.name = input_node_names[i]
input_node.shape = parse_int_array_from_str(input_node_shapes[i])
option.add_input_node(input_node)
output_node_names = FLAGS.output_node.split(',')
for i in xrange(len(output_node_names)):
output_node = cvt.NodeInfo()
output_node.name = output_node_names[i]
option.add_output_node(output_node)
print("Convert model to mace model.")
if FLAGS.platform == 'tensorflow':
converter = tensorflow_converter.TensorflowConverter(option,
FLAGS.model_file) # noqa
elif FLAGS.platform == 'caffe':
converter = caffe_converter.CaffeConverter(option,
FLAGS.model_file,
FLAGS.weight_file)
output_graph_def = converter.run()
print("Transform model to one that can better run on device.")
# TODO(liuqi/liyin): transform gpu/cpu and merge their ops
mace_transformer = transformer.Transformer(option, output_graph_def)
output_graph_def = mace_transformer.run()
print "start optimize memory."
if FLAGS.runtime == 'gpu':
memory_optimizer.optimize_gpu_memory(output_graph_def)
elif FLAGS.runtime == 'cpu':
memory_optimizer.optimize_cpu_memory(output_graph_def)
print "Memory optimization done."
if FLAGS.output_type == 'source':
source_converter_lib.convert_to_source(
......
from enum import Enum
from mace.proto import mace_pb2
class DataFormat(Enum):
NHWC = 0
NCHW = 1
class FilterFormat(Enum):
HWIO = 0
OIHW = 1
HWOI = 2
class PaddingMode(Enum):
VALID = 0
SAME = 1
FULL = 2
class PoolingType(Enum):
AVG = 1
MAX = 2
class ActivationType(Enum):
NOOP = 0
RELU = 1
RELUX = 2
PRELU = 3
TANH = 4
SIGMOID = 5
class EltwiseType(Enum):
SUM = 0
SUB = 1
PROD = 2
DIV = 3
MIN = 4
MAX = 5
NEG = 6
ABS = 7
SQR_DIFF = 8
POW = 9
MaceSupportedOps = [
'Activation',
'AddN',
'BatchNorm',
'BatchToSpaceND',
'BiasAdd',
'ChannelShuffle',
'Concat',
'Conv2D',
'Deconv2D',
'DepthToSpace',
'DepthwiseConv2d',
'Dequantize',
'Eltwise',
'FoldedBatchNorm',
'FullyConnected',
'LocalResponseNorm',
'MatMul',
'Pad',
'Pooling',
'Proposal',
'PSROIAlign',
'Quantize',
'Requantize',
'Reshape',
'ResizeBilinear',
'Slice',
'Softmax',
'SpaceToBatchND',
'SpaceToDepth',
'Transpose',
'WinogradInverseTransform',
'WinogradTransform',
]
MaceOp = Enum('MaceOp', [(op, op) for op in MaceSupportedOps], type=str)
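# Functional Enum API with a str mix-in: each member's name equals its value,
# e.g. MaceOp.Conv2D.name == MaceOp.Conv2D.value == 'Conv2D'.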
class MaceKeyword(object):
# node related str
mace_input_node_name = 'mace_input_node'
mace_output_node_name = 'mace_output_node'
mace_buffer_type = 'buffer_type'
mace_mode = 'mode'
mace_buffer_to_image = 'BufferToImage'
mace_image_to_buffer = 'ImageToBuffer'
# arg related str
mace_padding_str = 'padding'
mace_padding_values_str = 'padding_values'
mace_strides_str = 'strides'
mace_dilations_str = 'dilations'
mace_pooling_type_str = 'pooling_type'
mace_global_pooling_str = 'global_pooling'
mace_kernel_str = 'kernels'
mace_data_format_str = 'data_format'
mace_filter_format_str = 'filter_format'
mace_element_type_str = 'type'
mace_activation_type_str = 'activation'
mace_activation_max_limit_str = 'max_limit'
mace_resize_size_str = 'size'
mace_batch_to_space_crops_str = 'crops'
mace_paddings_str = 'paddings'
mace_align_corners_str = 'align_corners'
mace_space_batch_block_shape_str = 'block_shape'
mace_space_depth_block_size_str = 'block_size'
mace_constant_value_str = 'constant_value'
mace_dims_str = 'dims'
mace_axis_str = 'axis'
mace_shape_str = 'shape'
mace_winograd_filter_transformed = 'is_filter_transformed'
class ConverterInterface(object):
"""Base class for converting external models to mace models."""
def run(self):
raise NotImplementedError('run')
class NodeInfo(object):
"""A class for describing node information"""
def __init__(self):
self._name = None
self._shape = []
@property
def name(self):
return self._name
@property
def shape(self):
return self._shape
@name.setter
def name(self, name):
self._name = name
@shape.setter
def shape(self, shape):
self._shape = shape
def __str__(self):
return '%s %s' % (self._name, str(self._shape))
class ConverterOption(object):
"""A class for specifying options passed to converter tool"""
def __init__(self):
self._input_nodes = {}
self._output_nodes = {}
self._data_type = mace_pb2.DT_FLOAT
self._device = mace_pb2.CPU
self._winograd_enabled = False
@property
def input_nodes(self):
return self._input_nodes
@property
def output_nodes(self):
return self._output_nodes
@property
def data_type(self):
return self._data_type
@property
def device(self):
return self._device
@property
def winograd_enabled(self):
return self._winograd_enabled
@input_nodes.setter
def input_nodes(self, input_nodes):
for node in input_nodes:
self._input_nodes[node.name] = node
def add_input_node(self, input_node):
self._input_nodes[input_node.name] = input_node
@output_nodes.setter
def output_nodes(self, output_nodes):
for node in output_nodes:
self.output_nodes[node.name] = node
def add_output_node(self, output_node):
self._output_nodes[output_node.name] = output_node
@data_type.setter
def data_type(self, data_type):
self._data_type = data_type
@device.setter
def device(self, device):
self._device = device
@winograd_enabled.setter
def winograd_enabled(self, winograd_enabled):
self._winograd_enabled = winograd_enabled
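# Illustrative construction (mirrors how converter.py in this change builds
# the option; names and values here are only examples):
#   option = ConverterOption()
#   option.data_type = mace_pb2.DT_FLOAT
#   option.device = mace_pb2.GPU
#   option.winograd_enabled = True
#   node = NodeInfo()
#   node.name = 'input'
#   node.shape = [1, 224, 224, 3]
#   option.add_input_node(node)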
class ConverterUtil(object):
@staticmethod
def get_arg(op, arg_name):
for arg in op.arg:
if arg.name == arg_name:
return arg
return None
@staticmethod
def add_data_format_arg(op, data_format):
data_format_arg = op.arg.add()
data_format_arg.name = MaceKeyword.mace_data_format_str
data_format_arg.i = data_format.value
@staticmethod
def data_format(op):
arg = ConverterUtil.get_arg(op, MaceKeyword.mace_data_format_str)
if arg is None:
return None
elif arg.i == DataFormat.NHWC.value:
return DataFormat.NHWC
elif arg.i == DataFormat.NCHW.value:
return DataFormat.NCHW
else:
return None
@staticmethod
def set_filter_format(net, filter_format):
arg = net.arg.add()
arg.name = MaceKeyword.mace_filter_format_str
arg.i = filter_format.value
@staticmethod
def filter_format(net):
arg = ConverterUtil.get_arg(net, MaceKeyword.mace_filter_format_str)
if arg is None:
return None
elif arg.i == FilterFormat.HWIO.value:
return FilterFormat.HWIO
elif arg.i == FilterFormat.HWOI.value:
return FilterFormat.HWOI
elif arg.i == FilterFormat.OIHW.value:
return FilterFormat.OIHW
else:
return None
import math
import numpy as np
import google.protobuf.text_format
from mace.proto import mace_pb2
from mace.third_party.caffe import caffe_pb2
from mace.python.tools.converter_tool import base_converter
from mace.python.tools.converter_tool import shape_inference
from mace.python.tools.converter_tool.base_converter import PoolingType
from mace.python.tools.converter_tool.base_converter import ActivationType
from mace.python.tools.converter_tool.base_converter import EltwiseType
from mace.python.tools.converter_tool.base_converter import DataFormat
from mace.python.tools.converter_tool.base_converter import FilterFormat
from mace.python.tools.converter_tool.base_converter import MaceOp
from mace.python.tools.converter_tool.base_converter import MaceKeyword
from mace.python.tools.converter_tool.base_converter import ConverterUtil
from mace.python.tools.convert_util import mace_check
caffe_group_str = 'group'
caffe_kernel_h_str = 'kernel_h'
caffe_kernel_w_str = 'kernel_w'
caffe_stride_h_str = 'stride_h'
caffe_stride_w_str = 'stride_w'
caffe_pad_h_str = 'pad_h'
caffe_pad_w_str = 'pad_w'
class CaffeOperator(object):
"""CaffeOperator merges and provides both layer and weights information.
Layer records the caffe layer proto, while blobs record the weight data as
numpy ndarrays.
"""
def __init__(self):
self._layer = None
self._blobs = None
@property
def name(self):
return self._layer.name
@property
def type(self):
return self._layer.type
@property
def layer(self):
return self._layer
@property
def blobs(self):
return self._blobs
@layer.setter
def layer(self, layer):
self._layer = layer
@blobs.setter
def blobs(self, blobs):
self._blobs = [self.blob_to_nparray(blob) for blob in blobs]
def get_blob(self, index):
mace_check(index < len(self._blobs), "blob index out of range")
return self._blobs[index]
@staticmethod
def blob_to_nparray(blob):
if blob.num != 0:
return (np.asarray(blob.data, dtype=np.float32).reshape(
(blob.num, blob.channels, blob.height, blob.width)))
else:
return np.asarray(blob.data, dtype=np.float32).reshape(
blob.shape.dim)
class CaffeNet(object):
"""CaffeNet contains caffe operations. Output of each layer has unique
name as we replace duplicated output name with unique one, while keep
mace input/output name which user specifies unchanged."""
def __init__(self):
self._ops = {}
self._consumers = {}
# for an in-place op, its input name is the same as its output name,
# so we change the output name to an alias
self._alias_op_output_name = {}
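# Illustrative example: a Convolution with top 'conv1' followed by an
# in-place ReLU (bottom == top == 'conv1'): add_layer renames the conv
# output to 'conv1#0', the ReLU then reads 'conv1#0' and writes 'conv1#1',
# and later consumers of 'conv1' are rewired to 'conv1#1' via this map.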
self._used_op_output_name = set()
@property
def ops(self):
return self._ops.values()
def get_op(self, op_name):
return self._ops.get(op_name, None)
def get_consumers(self, tensor_name):
return self._consumers.get(tensor_name, [])
def add_layer(self, layer):
op = CaffeOperator()
op.layer = layer
self._ops[layer.name] = op
# change op output name if it is an in-place op
layer.bottom[:] = [self._alias_op_output_name.get(layer_input,
layer_input) for
layer_input in layer.bottom][:]
for i in xrange(len(layer.top)):
old_name = layer.top[i]
if layer.type == 'Input':
new_name = old_name
else:
idx = 0
new_name = old_name + '#' + str(idx)
while new_name in self._used_op_output_name:
idx += 1
new_name = old_name + '#' + str(idx)
layer.top[i] = new_name
self._alias_op_output_name[old_name] = new_name
self._used_op_output_name.update([new_name])
for input_tensor in layer.bottom:
if input_tensor not in self._consumers:
self._consumers[input_tensor] = []
self._consumers[input_tensor].append(op)
def add_blob(self, weight):
if weight.name in self._ops:
op = self._ops[weight.name]
op.blobs = list(weight.blobs)
class CaffeConverter(base_converter.ConverterInterface):
"""A class for convert caffe model to mace model."""
pooling_type_mode = {
caffe_pb2.PoolingParameter.AVE: PoolingType.AVG,
caffe_pb2.PoolingParameter.MAX: PoolingType.MAX
}
eltwise_type = {
caffe_pb2.EltwiseParameter.PROD: EltwiseType.PROD,
caffe_pb2.EltwiseParameter.SUM: EltwiseType.SUM,
caffe_pb2.EltwiseParameter.MAX: EltwiseType.MAX,
}
activation_type = {
'ReLU': ActivationType.RELU,
'PReLU': ActivationType.PRELU,
'TanH': ActivationType.TANH,
'Sigmoid': ActivationType.SIGMOID,
}
def __init__(self, option, src_model_file, src_weight_file):
self._op_converters = {
'Input': self.convert_nop,
'Convolution': self.convert_conv2d,
'Eltwise': self.convert_elementwise,
'Add': self.convert_add,
'ReLU': self.convert_activation,
'TanH': self.convert_activation,
'Sigmoid': self.convert_activation,
'PReLU': self.convert_activation,
'Pooling': self.convert_pooling,
'Concat': self.convert_concat,
'Slice': self.convert_slice,
'Softmax': self.convert_softmax,
'InnerProduct': self.convert_fully_connected,
'BatchNorm': self.convert_folded_batchnorm,
}
self._option = option
self._mace_net_def = mace_pb2.NetDef()
ConverterUtil.set_filter_format(self._mace_net_def, FilterFormat.OIHW)
self._caffe_net = CaffeNet()
self._caffe_layers = caffe_pb2.NetParameter()
caffe_weights = caffe_pb2.NetParameter()
# parse prototxt
with open(src_model_file, 'rb') as f:
google.protobuf.text_format.Merge(
str(f.read()), self._caffe_layers)
self.filter_test_layers(self._caffe_layers)
for layer in self._caffe_layers.layer:
self._caffe_net.add_layer(layer)
# parse model weight
with open(src_weight_file, 'rb') as f:
caffe_weights.ParseFromString(f.read())
self.filter_test_layers(caffe_weights)
for weight in caffe_weights.layer:
self._caffe_net.add_blob(weight)
self._skip_ops = []
def run(self):
self.convert_ops()
shape_inferer = shape_inference.ShapeInference(
self._mace_net_def,
self._option.input_nodes.values())
shape_inferer.run()
self.replace_output_tensor_name()
return self._mace_net_def
@staticmethod
def replace_input_name(ops, src_name, dst_name):
for op in ops:
for i in xrange(len(op.input)):
if op.input[i] == src_name:
op.input[i] = dst_name
def replace_output_tensor_name(self):
consumers = {}
for op in self._mace_net_def.op:
for input_name in op.input:
if input_name not in consumers:
consumers[input_name] = []
consumers[input_name].append(op)
# rename the output of the last op sharing a name prefix back to the original top name
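# e.g. for tops 'conv1#0' and 'conv1#1', walking the ops in reverse order
# renames the output of the last producer back to 'conv1', so downstream
# consumers and tools see the original Caffe blob name.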
ops = [op for op in self._mace_net_def.op]
ops.reverse()
visited = set()
for op in ops:
for i in xrange(len(op.output)):
original_output_name = op.output[i].split('#')[0]
if original_output_name not in visited:
self.replace_input_name(
consumers.get(op.output[i], []),
op.output[i],
original_output_name)
op.output[i] = original_output_name
visited.update([original_output_name])
# if user set op name as output node, replace it with op name
for op in self._mace_net_def.op:
if op.name in self._option.output_nodes:
if len(op.output) > 0:
self.replace_input_name(
consumers.get(op.output[0], []),
op.output[0],
op.name)
op.output[0] = op.name
@staticmethod
def filter_test_layers(layers):
phase_map = {0: 'train', 1: 'test'}
while True:
changed = False
for layer in layers.layer:
phase = 'test'
if len(layer.include):
phase = phase_map[layer.include[0].phase]
if len(layer.exclude):
phase = phase_map[layer.exclude[0].phase]
if phase != 'test' or layer.type == 'Dropout':
print ("Remove layer %s (%s)" % (layer.name, layer.type))
layers.layer.remove(layer)
changed = True
break
if not changed:
break
@staticmethod
def add_stride_pad_kernel_arg(param, op_def):
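# Note: the padding values recorded here are totals (left + right), i.e.
# Caffe's symmetric pad is doubled, which matches how shape inference adds
# them to the input size.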
try:
if len(param.stride) > 1 or len(param.kernel_size) > 1 or len(
param.pad) > 1:
raise Exception(
'Mace does not support multiple stride/kernel_size/pad')
stride = [param.stride[0],
param.stride[0]] if len(param.stride) else [1, 1]
pad = [param.pad[0] * 2,
param.pad[0] * 2] if len(param.pad) else [0, 0]
kernel = [param.kernel_size[0], param.kernel_size[0]] if len(
param.kernel_size) else [0, 0]
except TypeError:
stride = [param.stride, param.stride]
pad = [param.pad * 2, param.pad * 2]
kernel = [param.kernel_size, param.kernel_size]
if param.HasField(caffe_stride_h_str) or param.HasField(
caffe_stride_w_str):
stride = [param.stride_h, param.stride_w]
if param.HasField(caffe_pad_h_str) or param.HasField(caffe_pad_w_str):
pad = [param.pad_h * 2, param.pad_w * 2]
strides_arg = op_def.arg.add()
strides_arg.name = MaceKeyword.mace_strides_str
strides_arg.ints.extend(stride)
padding_arg = op_def.arg.add()
padding_arg.name = MaceKeyword.mace_padding_values_str
padding_arg.ints.extend(pad)
if op_def.type == MaceOp.Pooling.name:
if param.HasField(caffe_kernel_h_str) or param.HasField(
caffe_kernel_w_str):
kernel = [param.kernel_h, param.kernel_w]
kernels_arg = op_def.arg.add()
kernels_arg.name = MaceKeyword.mace_kernel_str
kernels_arg.ints.extend(kernel)
if param.HasField('global_pooling'):
global_pooling_arg = op_def.arg.add()
global_pooling_arg.name = MaceKeyword.mace_global_pooling_str
global_pooling_arg.i = 1
def convert_ops(self):
for layer in self._caffe_layers.layer:
caffe_op = self._caffe_net.get_op(layer.name)
if caffe_op not in self._skip_ops:
mace_check(layer.type in self._op_converters,
"Mace does not support caffe op type %s yet"
% layer.type)
self._op_converters[layer.type](caffe_op)
def add_tensor(self, name, shape, data_type, value):
tensor = self._mace_net_def.tensors.add()
tensor.name = name
tensor.dims.extend(list(shape))
tensor.data_type = data_type
tensor.float_data.extend(value.flat)
def convert_nop(self, layer):
pass
def convert_general_op(self, caffe_op):
op = self._mace_net_def.op.add()
op.name = caffe_op.name
op.type = caffe_op.type
op.input.extend(caffe_op.layer.bottom)
op.output.extend(caffe_op.layer.top)
data_type_arg = op.arg.add()
data_type_arg.name = 'T'
data_type_arg.i = self._option.data_type
ConverterUtil.add_data_format_arg(op, DataFormat.NCHW)
return op
def convert_conv2d(self, caffe_op):
op = self.convert_general_op(caffe_op)
param = caffe_op.layer.convolution_param
is_depthwise = False
if param.HasField(caffe_group_str):
mace_check(param.group == caffe_op.blobs[0].shape[0] and
caffe_op.blobs[0].shape[1] == 1,
"Mace does not support group convolution yet")
is_depthwise = True
if is_depthwise:
op.type = MaceOp.DepthwiseConv2d.name
else:
op.type = MaceOp.Conv2D.name
self.add_stride_pad_kernel_arg(param, op)
# dilation is specific to convolution in caffe
dilations = [1, 1]
if len(param.dilation) > 0:
dilation_arg = op.arg.add()
dilation_arg.name = MaceKeyword.mace_dilations_str
if len(param.dilation) == 1:
dilations = [param.dilation[0], param.dilation[0]]
elif len(param.dilation) == 2:
dilations = [param.dilation[0], param.dilation[1]]
dilation_arg.ints.extend(dilations)
filter_tensor_name = op.name + '_filter'
filter_data = caffe_op.blobs[0]
self.add_tensor(filter_tensor_name, filter_data.shape,
mace_pb2.DT_FLOAT, filter_data)
op.input.extend([filter_tensor_name])
if len(caffe_op.blobs) == 2:
bias_tensor_name = op.name + '_bias'
bias_data = caffe_op.blobs[1]
self.add_tensor(bias_tensor_name, bias_data.shape,
mace_pb2.DT_FLOAT,
bias_data)
op.input.extend([bias_tensor_name])
def convert_elementwise(self, caffe_op):
op = self.convert_general_op(caffe_op)
param = caffe_op.layer.eltwise_param
op.type = MaceOp.Eltwise.name
type_arg = op.arg.add()
type_arg.name = MaceKeyword.mace_element_type_str
type_arg.i = self.eltwise_type[param.operation].value
if len(param.coeff) > 0:
coeff_arg = op.arg.add()
coeff_arg.name = 'coeff'
coeff_arg.floats.extend(list(param.coeff))
def convert_add(self, caffe_op):
op = self.convert_general_op(caffe_op)
op.type = MaceOp.AddN.name
def convert_activation(self, caffe_op):
op = self.convert_general_op(caffe_op)
op.type = MaceOp.Activation.name
type_arg = op.arg.add()
type_arg.name = MaceKeyword.mace_activation_type_str
type_arg.s = self.activation_type[caffe_op.type].name
if caffe_op.type == 'PReLU':
alpha_tensor_name = caffe_op.name + '_alpha'
alpha_data = caffe_op.blobs[0]
self.add_tensor(alpha_tensor_name, alpha_data.shape,
mace_pb2.DT_FLOAT, alpha_data)
op.input.extend([alpha_tensor_name])
def convert_folded_batchnorm(self, caffe_op):
op = self.convert_general_op(caffe_op)
op.type = MaceOp.FoldedBatchNorm.name
scale_op = None
for consumer in self._caffe_net.get_consumers(caffe_op.layer.top[0]):
if consumer.type == 'Scale':
scale_op = consumer
mace_check(scale_op is not None, "batchnorm is not followed by scale")
self._skip_ops.append(scale_op)
epsilon_value = caffe_op.layer.batch_norm_param.eps
mace_check(caffe_op.blobs[2][0] != 0, "batchnorm scalar is zero")
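# Caffe's BatchNorm stores running sums scaled by a moving-average factor
# in blobs[2][0]; dividing by it recovers the actual mean/variance. The BN
# and the following Scale layer are then folded into one affine transform:
# scale = gamma / sqrt(var + eps), offset = beta - mean * scale.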
mean_value = (1. / caffe_op.blobs[2][0]) * caffe_op.blobs[0]
var_value = (1. / caffe_op.blobs[2][0]) * caffe_op.blobs[1]
gamma_value = scale_op.blobs[0]
beta_value = np.zeros_like(mean_value)
if len(scale_op.blobs) == 2:
beta_value = scale_op.blobs[1]
scale_value = (
(1.0 / np.vectorize(math.sqrt)(var_value + epsilon_value)) *
gamma_value).reshape(-1)
offset_value = ((-mean_value * scale_value) + beta_value).reshape(-1)
input_names = [op.name + '_scale', op.name + '_offset']
self.add_tensor(input_names[0], scale_value.shape, mace_pb2.DT_FLOAT,
scale_value)
self.add_tensor(input_names[1], offset_value.shape, mace_pb2.DT_FLOAT,
offset_value)
op.input.extend([name for name in input_names])
op.output[:] = scale_op.layer.top[:]
def convert_pooling(self, caffe_op):
op = self.convert_general_op(caffe_op)
param = caffe_op.layer.pooling_param
op.type = MaceOp.Pooling.name
self.add_stride_pad_kernel_arg(param, op)
pooling_type_arg = op.arg.add()
pooling_type_arg.name = MaceKeyword.mace_pooling_type_str
pooling_type_arg.i = self.pooling_type_mode[param.pool].value
def convert_softmax(self, caffe_op):
self.convert_general_op(caffe_op)
def convert_concat(self, caffe_op):
op = self.convert_general_op(caffe_op)
param = caffe_op.layer.concat_param
op.type = MaceOp.Concat.name
axis_arg = op.arg.add()
axis_arg.name = MaceKeyword.mace_axis_str
axis_arg.i = 1
if param.HasField('axis'):
axis_arg.i = param.axis
elif param.HasField('concat_dim'):
axis_arg.i = param.concat_dim
mace_check(axis_arg.i == 1, "only support concat at channel dimension")
def convert_slice(self, caffe_op):
op = self.convert_general_op(caffe_op)
op.type = MaceOp.Slice.name
if caffe_op.layer.HasField('slice_param'):
param = caffe_op.layer.slice_param
mace_check(not param.HasField('axis') or param.axis == 1,
"Mace do not support slice with axis %d" % param.axis)
mace_check(len(param.slice_point) == 0,
"Mace do not support slice with slice_point")
axis_arg = op.arg.add()
axis_arg.name = MaceKeyword.mace_axis_str
axis_arg.i = 1
def convert_fully_connected(self, caffe_op):
op = self.convert_general_op(caffe_op)
param = caffe_op.layer.inner_product_param
op.type = MaceOp.FullyConnected.name
mace_check(param.axis == 1 and not param.transpose,
"Do not support non-default axis and transpose")
mace_check(caffe_op.blobs[0].ndim in [2, 4],
"Unexpected fc weigth ndim.")
if caffe_op.blobs[0].ndim == 4:
mace_check(list(caffe_op.blobs[0].shape[:2]) == [1, 1],
"Do not support 4D weight with shape [1, 1, *, *]")
weight_tensor_name = op.name + '_weight'
weight_data = caffe_op.blobs[0].reshape(param.num_output, -1)
self.add_tensor(weight_tensor_name, weight_data.shape,
mace_pb2.DT_FLOAT,
weight_data)
op.input.extend([weight_tensor_name])
if len(caffe_op.blobs) == 2:
bias_tensor_name = op.name + '_bias'
bias_data = caffe_op.blobs[1]
self.add_tensor(bias_tensor_name, bias_data.shape,
mace_pb2.DT_FLOAT,
bias_data)
op.input.extend([bias_tensor_name])
import math
import numpy as np
from mace.python.tools.converter_tool.transformer import Transformer
from mace.python.tools.converter_tool.base_converter import DataFormat
from mace.python.tools.converter_tool.base_converter import FilterFormat
from mace.python.tools.converter_tool.base_converter import MaceOp
from mace.python.tools.converter_tool.base_converter import MaceKeyword
from mace.python.tools.converter_tool.base_converter import ConverterUtil
from mace.python.tools.convert_util import mace_check
class ShapeInference(object):
"""Currently we only use it to infer caffe shape, we use tensorflow engine
to infer tensorflow op shapes, since tensorflow has too many ops."""
def __init__(self, net, input_nodes):
self._op_shape_inference = {
MaceOp.Conv2D.name: self.infer_shape_conv_pool_shape,
MaceOp.Eltwise.name: self.infer_shape_general,
MaceOp.FoldedBatchNorm.name: self.infer_shape_general,
MaceOp.AddN.name: self.infer_shape_general,
MaceOp.Activation.name: self.infer_shape_general,
MaceOp.Pooling.name: self.infer_shape_conv_pool_shape,
MaceOp.Concat.name: self.infer_shape_concat,
MaceOp.Slice.name: self.infer_shape_slice,
MaceOp.Softmax.name: self.infer_shape_general,
MaceOp.FullyConnected.name: self.infer_shape_fully_connected,
}
self._net = net
self._output_shape_cache = {}
for input_node in input_nodes:
input_shape = input_node.shape[:]
# transpose input shape from NHWC to NCHW
Transformer.transpose_shape(input_shape, [0, 3, 1, 2])
self._output_shape_cache[input_node.name] = input_shape
for tensor in net.tensors:
self._output_shape_cache[tensor.name] = list(tensor.dims)
def run(self):
for op in self._net.op:
mace_check(op.type in self._op_shape_inference,
"Mace does not support caffe op type %s yet"
% op.type)
self._op_shape_inference[op.type](op)
def add_output_shape(self, op, shapes):
mace_check(len(op.output) == len(shapes),
"Op %s (%s) output count is different from "
"output shape count" % (
op.name, op.type))
for i in xrange(len(shapes)):
output_name = op.output[i]
output_shape = op.output_shape.add()
output_shape.dims.extend(shapes[i])
self._output_shape_cache[output_name] = shapes[i]
def infer_shape_general(self, op):
if len(op.input) > 0:
mace_check(op.input[0] in self._output_shape_cache,
"%s does not exist" % op.input[0])
input_shape = self._output_shape_cache[op.input[0]]
self.add_output_shape(op, [input_shape])
def infer_shape_conv_pool_shape(self, op):
input_shape = self._output_shape_cache[op.input[0]]
output_shape = np.zeros_like(input_shape)
if op.type == MaceOp.Pooling.name:
filter_shape = list(
ConverterUtil.get_arg(op, MaceKeyword.mace_kernel_str).ints)
if ConverterUtil.data_format(op) == DataFormat.NCHW:
filter_shape = [input_shape[1], input_shape[1]] + filter_shape
if ConverterUtil.get_arg(op,
MaceKeyword.mace_global_pooling_str) \
is not None:
filter_shape[2] = input_shape[2]
filter_shape[3] = input_shape[3]
else: # NHWC
filter_shape = filter_shape + [input_shape[1], input_shape[1]]
if ConverterUtil.get_arg(op,
MaceKeyword.mace_global_pooling_str) \
is not None:
filter_shape[0] = input_shape[1]
filter_shape[1] = input_shape[2]
else:
filter_shape = self._output_shape_cache[op.input[1]]
paddings = ConverterUtil.get_arg(op,
MaceKeyword.mace_padding_values_str).ints # noqa
strides = ConverterUtil.get_arg(op, MaceKeyword.mace_strides_str).ints
dilations_arg = ConverterUtil.get_arg(op,
MaceKeyword.mace_dilations_str)
if dilations_arg is not None:
dilations = dilations_arg.ints
else:
dilations = [1, 1]
if op.type == MaceOp.Pooling.name:
round_func = math.ceil
else:
round_func = math.floor
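# output spatial size per dimension:
#   round_func((in + pad_total - ((kernel - 1) * dilation + 1)) / stride) + 1
# using ceil for pooling and floor for convolution, matching Caffe.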
output_shape[0] = input_shape[0]
if ConverterUtil.data_format(op) == DataFormat.NCHW \
and ConverterUtil.filter_format(self._net) == FilterFormat.OIHW: # noqa
# filter format: OIHW
output_shape[1] = filter_shape[0]
output_shape[2] = int(
round_func((input_shape[2] + paddings[0] - filter_shape[2] -
(filter_shape[2] - 1) *
(dilations[0] - 1)) / float(strides[0]))) + 1
output_shape[3] = int(
round_func((input_shape[3] + paddings[1] - filter_shape[3] -
(filter_shape[3] - 1) *
(dilations[1] - 1)) / float(strides[1]))) + 1
else:
mace_check(False,
"Mace can only infer shape for"
" NCHW input and OIHW filter")
self.add_output_shape(op, [output_shape])
def infer_shape_concat(self, op):
output_shape = list(self._output_shape_cache[op.input[0]])
axis = ConverterUtil.get_arg(op, MaceKeyword.mace_axis_str).i
# start the concat axis from zero so input[0] is not counted twice
output_shape[axis] = 0
for input_node in op.input:
input_shape = self._output_shape_cache[input_node]
output_shape[axis] += input_shape[axis]
self.add_output_shape(op, [output_shape])
def infer_shape_slice(self, op):
# copy so the cached shape of input[0] is not mutated below
output_shape = list(self._output_shape_cache[op.input[0]])
axis = ConverterUtil.get_arg(op, MaceKeyword.mace_axis_str).i
output_shape[axis] /= len(op.output)
output_shapes = []
for _ in op.output:
output_shapes.append(output_shape)
self.add_output_shape(op, output_shapes)
def infer_shape_fully_connected(self, op):
input_shape = self._output_shape_cache[op.input[0]]
weight_shape = self._output_shape_cache[op.input[1]]
if ConverterUtil.data_format(op) == DataFormat.NCHW:
output_shape = [input_shape[0], weight_shape[0], 1, 1]
else:
mace_check(False, "format %s is not supported"
% ConverterUtil.data_format(op))
self.add_output_shape(op, [output_shape])
import math
import numpy as np
import tensorflow as tf
from mace.proto import mace_pb2
from mace.python.tools.converter_tool import base_converter
from mace.python.tools.converter_tool.base_converter import PoolingType
from mace.python.tools.converter_tool.base_converter import PaddingMode
from mace.python.tools.converter_tool.base_converter import ActivationType
from mace.python.tools.converter_tool.base_converter import EltwiseType
from mace.python.tools.converter_tool.base_converter import DataFormat
from mace.python.tools.converter_tool.base_converter import FilterFormat
from mace.python.tools.converter_tool.base_converter import MaceOp
from mace.python.tools.converter_tool.base_converter import MaceKeyword
from mace.python.tools.converter_tool.base_converter import ConverterUtil
from mace.python.tools.convert_util import mace_check
from tensorflow.core.framework import tensor_shape_pb2
tf_padding_str = 'padding'
tf_strides_str = 'strides'
tf_dilations_str = 'dilations'
tf_data_format_str = 'data_format'
tf_kernel_str = 'ksize'
tf_epsilon_str = 'epsilon'
tf_align_corners = 'align_corners'
tf_block_size = 'block_size'
class TensorflowConverter(base_converter.ConverterInterface):
"""A class for convert tensorflow frozen model to mace model.
We use tensorflow engine to infer op output shapes, since they are of
too many types."""
padding_mode = {
'VALID': PaddingMode.VALID,
'SAME': PaddingMode.SAME,
'FULL': PaddingMode.FULL
}
pooling_type_mode = {
'AvgPool': PoolingType.AVG,
'MaxPool': PoolingType.MAX
}
eltwise_type = {
'Add': EltwiseType.SUM,
'Sub': EltwiseType.SUB,
'Mul': EltwiseType.PROD,
'Div': EltwiseType.DIV,
'Min': EltwiseType.MIN,
'Max': EltwiseType.MAX,
'Neg': EltwiseType.NEG,
'Abs': EltwiseType.ABS,
'RealDiv': EltwiseType.DIV,
'SquaredDifference': EltwiseType.SQR_DIFF,
'Pow': EltwiseType.POW
}
activation_type = {
'Relu': ActivationType.RELU,
'Relu6': ActivationType.RELUX,
'Tanh': ActivationType.TANH,
'Sigmoid': ActivationType.SIGMOID
}
def __init__(self, option, src_model_file):
self._op_converters = {
'Conv2D': self.convert_conv2d,
'DepthwiseConv2dNative': self.convert_conv2d,
'Conv2DBackpropInput': self.convert_conv2d,
'BiasAdd': self.convert_biasadd,
'Add': self.convert_add,
'Sub': self.convert_elementwise,
'Mul': self.convert_elementwise,
'Div': self.convert_elementwise,
'Min': self.convert_elementwise,
'Max': self.convert_elementwise,
'Neg': self.convert_elementwise,
'Abs': self.convert_elementwise,
'RealDiv': self.convert_elementwise,
'SquaredDifference': self.convert_elementwise,
'Pow': self.convert_elementwise,
'Relu': self.convert_activation,
'Relu6': self.convert_activation,
'Tanh': self.convert_activation,
'Sigmoid': self.convert_activation,
'FusedBatchNorm': self.convert_fused_batchnorm,
'AvgPool': self.convert_pooling,
'MaxPool': self.convert_pooling,
'Squeeze': self.convert_identity,
'Reshape': self.convert_reshape,
'Shape': self.convert_nop,
'Softmax': self.convert_softmax,
'ResizeBilinear': self.convert_resize_bilinear,
'Placeholder': self.convert_nop,
'SpaceToBatchND': self.convert_space_batch,
'BatchToSpaceND': self.convert_space_batch,
'DepthToSpace': self.convert_space_depth,
'SpaceToDepth': self.convert_space_depth,
'Pad': self.convert_pad,
'ConcatV2': self.convert_concat,
'Mean': self.convert_mean,
# the Const converter should be placed at the end
'Const': self.convert_tensor,
}
self._option = option
self._mace_net_def = mace_pb2.NetDef()
ConverterUtil.set_filter_format(self._mace_net_def, FilterFormat.HWIO)
tf_graph_def = tf.GraphDef()
with tf.gfile.Open(src_model_file, 'rb') as f:
tf_graph_def.ParseFromString(f.read())
self.add_shape_info(tf_graph_def)
with tf.Session() as session:
with session.graph.as_default() as graph:
tf.import_graph_def(tf_graph_def, name='')
self._tf_graph = graph
self._skip_tensor = set()
def run(self):
with tf.Session() as session:
self.convert_ops()
self.replace_input_output_tensor_name()
return self._mace_net_def
def replace_input_output_tensor_name(self):
for op in self._mace_net_def.op:
for i in xrange(len(op.input)):
if op.input[i][-2:] == ':0':
op_name = op.input[i][:-2]
if op_name in self._option.input_nodes:
op.input[i] = op_name
for i in xrange(len(op.output)):
if op.output[i][-2:] == ':0':
op_name = op.output[i][:-2]
if op_name in self._option.output_nodes:
op.output[i] = op_name
def add_shape_info(self, tf_graph_def):
for node in tf_graph_def.node:
if node.name in self._option.input_nodes:
del node.attr['shape'].shape.dim[:]
node.attr['shape'].shape.dim.extend([
tensor_shape_pb2.TensorShapeProto.Dim(size=i) for i in
self._option.input_nodes[node.name].shape
])
@staticmethod
def get_scope(tensor_name):
idx = tensor_name.rfind('/')
if idx == -1:
return tensor_name
else:
return tensor_name[:idx]
def convert_ops(self):
for tf_op in self._tf_graph.get_operations():
mace_check(tf_op.type in self._op_converters,
"Mace does not support tensorflow op type %s yet"
% tf_op.type)
self._op_converters[tf_op.type](tf_op)
def convert_tensor(self, tf_op):
output_name = tf_op.outputs[0].name
if output_name not in self._skip_tensor:
tensor = self._mace_net_def.tensors.add()
tensor.name = tf_op.outputs[0].name
tf_tensor = tf_op.outputs[0].eval()
tensor.dims.extend(list(tf_tensor.shape))
tf_dt = tf_op.get_attr('dtype')
if tf_dt == tf.float32:
tensor.data_type = mace_pb2.DT_FLOAT
tensor.float_data.extend(tf_tensor.astype(np.float32).flat)
elif tf_dt == tf.int32:
tensor.data_type = mace_pb2.DT_INT32
tensor.int32_data.extend(tf_tensor.astype(np.int32).flat)
else:
mace_check(False, "Not supported tensor type: %s" % tf_dt.name)
def add_tensor(self, name, shape, data_type, value):
tensor = self._mace_net_def.tensors.add()
tensor.name = name
tensor.dims.extend(list(shape))
tensor.data_type = data_type
tensor.float_data.extend(value.flat)
def convert_nop(self, tf_op):
pass
def convert_general_op(self, tf_op):
op = self._mace_net_def.op.add()
op.name = tf_op.name
op.type = tf_op.type
op.input.extend([tf_input.name for tf_input in tf_op.inputs])
op.output.extend([tf_output.name for tf_output in tf_op.outputs])
for tf_output in tf_op.outputs:
output_shape = op.output_shape.add()
output_shape.dims.extend(tf_output.shape.as_list())
op.output_type.append(self._option.data_type)
data_type_arg = op.arg.add()
data_type_arg.name = 'T'
data_type_arg.i = self._option.data_type
ConverterUtil.add_data_format_arg(op, DataFormat.NHWC)
return op
def convert_identity(self, tf_op):
op = self.convert_general_op(tf_op)
op.type = 'Identity'
def convert_conv2d(self, tf_op):
op = self.convert_general_op(tf_op)
if tf_op.type == 'DepthwiseConv2dNative':
op.type = MaceOp.DepthwiseConv2d.name
elif tf_op.type == 'Conv2DBackpropInput':
op.type = MaceOp.Deconv2D.name
else:
op.type = MaceOp.Conv2D.name
padding_arg = op.arg.add()
padding_arg.name = MaceKeyword.mace_padding_str
padding_arg.i = self.padding_mode[tf_op.get_attr(tf_padding_str)].value
strides_arg = op.arg.add()
strides_arg.name = MaceKeyword.mace_strides_str
strides_arg.ints.extend(tf_op.get_attr(tf_strides_str)[1:3])
if op.type != MaceOp.Deconv2D.name:
dilation_arg = op.arg.add()
dilation_arg.name = MaceKeyword.mace_dilations_str
dilation_arg.ints.extend(tf_op.get_attr(tf_dilations_str)[1:3])
def convert_elementwise(self, tf_op):
op = self.convert_general_op(tf_op)
op.type = MaceOp.Eltwise.name
type_arg = op.arg.add()
type_arg.name = MaceKeyword.mace_element_type_str
type_arg.i = self.eltwise_type[tf_op.type].value
def convert_biasadd(self, tf_op):
op = self.convert_general_op(tf_op)
op.type = MaceOp.BiasAdd.name
def convert_add(self, tf_op):
if len(tf_op.inputs) == 2:
self.convert_elementwise(tf_op)
else:
op = self.convert_general_op(tf_op)
op.type = MaceOp.AddN.name
def convert_activation(self, tf_op):
op = self.convert_general_op(tf_op)
op.type = MaceOp.Activation.name
type_arg = op.arg.add()
type_arg.name = MaceKeyword.mace_activation_type_str
type_arg.s = self.activation_type[tf_op.type].name
if tf_op.type == 'Relu6':
limit_arg = op.arg.add()
limit_arg.name = MaceKeyword.mace_activation_max_limit_str
limit_arg.f = 6.0
def convert_fused_batchnorm(self, tf_op):
op = self.convert_general_op(tf_op)
op.type = MaceOp.FoldedBatchNorm.name
gamma_value = tf_op.inputs[1].eval().astype(np.float32)
beta_value = tf_op.inputs[2].eval().astype(np.float32)
mean_value = tf_op.inputs[3].eval().astype(np.float32)
var_value = tf_op.inputs[4].eval().astype(np.float32)
epsilon_value = tf_op.get_attr(tf_epsilon_str)
scale_name = self.get_scope(tf_op.name) + '/scale:0'
offset_name = self.get_scope(tf_op.name) + '/offset:0'
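# fold the four FusedBatchNorm inputs into two tensors:
#   scale = gamma / sqrt(var + eps), offset = beta - mean * scale
# so the op only needs (input, scale, offset) at runtime.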
scale_value = (
(1.0 / np.vectorize(math.sqrt)(
var_value + epsilon_value)) * gamma_value)
offset_value = (-mean_value * scale_value) + beta_value
self.add_tensor(scale_name, scale_value.shape, mace_pb2.DT_FLOAT,
scale_value)
self.add_tensor(offset_name, offset_value.shape, mace_pb2.DT_FLOAT,
offset_value)
self._skip_tensor.update([inp.name for inp in tf_op.inputs][1:])
del op.input[1:]
op.input.extend([scale_name, offset_name])
del op.output[1:]
del op.output_shape[1:]
del op.output_type[1:]
def convert_pooling(self, tf_op):
op = self.convert_general_op(tf_op)
op.type = MaceOp.Pooling.name
pooling_type_arg = op.arg.add()
pooling_type_arg.name = MaceKeyword.mace_pooling_type_str
pooling_type_arg.i = self.pooling_type_mode[tf_op.type].value
padding_arg = op.arg.add()
padding_arg.name = MaceKeyword.mace_padding_str
padding_arg.i = self.padding_mode[tf_op.get_attr(tf_padding_str)].value
strides_arg = op.arg.add()
strides_arg.name = MaceKeyword.mace_strides_str
strides_arg.ints.extend(tf_op.get_attr(tf_strides_str)[1:3])
kernels_arg = op.arg.add()
kernels_arg.name = MaceKeyword.mace_kernel_str
kernels_arg.ints.extend(tf_op.get_attr(tf_kernel_str)[1:3])
def convert_softmax(self, tf_op):
op = self.convert_general_op(tf_op)
op.type = MaceOp.Softmax.name
def convert_resize_bilinear(self, tf_op):
op = self.convert_general_op(tf_op)
op.type = MaceOp.ResizeBilinear.name
del op.input[1:]
size_arg = op.arg.add()
size_arg.name = MaceKeyword.mace_resize_size_str
size_value = tf_op.inputs[1].eval().astype(np.int32)
size_arg.ints.extend(size_value)
self._skip_tensor.add(tf_op.inputs[1].name)
align_corners_arg = op.arg.add()
align_corners_arg.name = MaceKeyword.mace_align_corners_str
align_corners_arg.i = tf_op.get_attr(tf_align_corners)
def convert_space_batch(self, tf_op):
print """You might want to try 'flatten_atrous_conv' in
transform graph to turn atrous conv2d into regular conv2d.
This may give you performance benefit on GPU.
(see https://github.com/tensorflow/tensorflow/blob/master/
tensorflow/tools/graph_transforms/README.md#flatten_atrous_conv)
"""
op = self.convert_general_op(tf_op)
del op.input[1:]
size_arg = op.arg.add()
size_arg.name = MaceKeyword.mace_space_batch_block_shape_str
size_value = tf_op.inputs[1].eval().astype(np.int32)
size_arg.ints.extend(size_value)
crops_or_paddings_arg = op.arg.add()
if op.type == 'BatchToSpaceND':
op.type = MaceOp.BatchToSpaceND.name
crops_or_paddings_arg.name = \
MaceKeyword.mace_batch_to_space_crops_str
else:
op.type = MaceOp.SpaceToBatchND.name
crops_or_paddings_arg.name = MaceKeyword.mace_paddings_str
crops_or_paddings_value = tf_op.inputs[2].eval().astype(np.int32).flat
crops_or_paddings_arg.ints.extend(crops_or_paddings_value)
self._skip_tensor.add(tf_op.inputs[1].name)
self._skip_tensor.add(tf_op.inputs[2].name)
def convert_space_depth(self, tf_op):
op = self.convert_general_op(tf_op)
if op.type == 'SpaceToDepth':
op.type = MaceOp.SpaceToDepth.name
else:
op.type = MaceOp.DepthToSpace.name
size_arg = op.arg.add()
size_arg.name = MaceKeyword.mace_space_depth_block_size_str
size_arg.i = tf_op.get_attr(tf_block_size)
def convert_pad(self, tf_op):
op = self.convert_general_op(tf_op)
op.type = MaceOp.Pad.name
del op.input[1:]
paddings_arg = op.arg.add()
paddings_arg.name = MaceKeyword.mace_paddings_str
paddings_value = tf_op.inputs[1].eval().astype(np.int32).flat
paddings_arg.ints.extend(paddings_value)
self._skip_tensor.add(tf_op.inputs[1].name)
if len(tf_op.inputs) == 3:
constant_value_arg = op.arg.add()
constant_value_arg.name = MaceKeyword.mace_constant_value_str
constant_value = tf_op.inputs[2].eval().astype(np.int32).flat[0]
constant_value_arg.i = constant_value
self._skip_tensor.add(tf_op.inputs[2].name)
def convert_concat(self, tf_op):
op = self.convert_general_op(tf_op)
op.type = MaceOp.Concat.name
del op.input[-1]
axis_arg = op.arg.add()
axis_arg.name = MaceKeyword.mace_axis_str
axis = tf_op.inputs[-1].eval().astype(np.int32)
axis_arg.i = axis
mace_check(axis == 3, "only support concat at channel dimension")
self._skip_tensor.add(tf_op.inputs[-1].name)
def convert_reshape(self, tf_op):
op = self.convert_general_op(tf_op)
op.type = MaceOp.Reshape.name
del op.input[1:]
shape_arg = op.arg.add()
shape_arg.name = MaceKeyword.mace_shape_str
shape_value = []
if tf_op.inputs[1].op.type == 'Const':
shape_value = list(tf_op.inputs[1].eval().astype(np.int32))
for i in xrange(len(shape_value)):
if shape_value[i] == -1:
shape_value[i] = 1
self._skip_tensor.add(tf_op.inputs[-1].name)
elif tf_op.inputs[1].op.type == 'Shape':
shape_value = list(tf_op.inputs[1].op.inputs[0].shape.as_list())
shape_arg.ints.extend(shape_value)
def convert_mean(self, tf_op):
op = self.convert_general_op(tf_op)
del op.input[1:]
reduce_dims = tf_op.inputs[1].eval()
mace_check(reduce_dims[0] == 1 and reduce_dims[1] == 2,
"Mean only support reduce dim 1, 2")
op.type = MaceOp.Pooling.name
pooling_type_arg = op.arg.add()
pooling_type_arg.name = MaceKeyword.mace_pooling_type_str
pooling_type_arg.i = PoolingType.AVG.value
padding_arg = op.arg.add()
padding_arg.name = MaceKeyword.mace_padding_str
padding_arg.i = PaddingMode.VALID.value
strides_arg = op.arg.add()
strides_arg.name = MaceKeyword.mace_strides_str
strides_arg.ints.extend([1, 1])
kernels_arg = op.arg.add()
kernels_arg.name = MaceKeyword.mace_kernel_str
kernels_arg.ints.extend(tf_op.inputs[0].shape.as_list()[1:3])
self._skip_tensor.add(tf_op.inputs[1].name)
import enum
import numpy as np
from mace.proto import mace_pb2
from mace.python.tools.converter_tool import base_converter
from mace.python.tools.converter_tool.base_converter import EltwiseType
from mace.python.tools.converter_tool.base_converter import ActivationType
from mace.python.tools.converter_tool.base_converter import PaddingMode
from mace.python.tools.converter_tool.base_converter import DataFormat
from mace.python.tools.converter_tool.base_converter import FilterFormat
from mace.python.tools.converter_tool.base_converter import MaceOp
from mace.python.tools.converter_tool.base_converter import MaceKeyword
from mace.python.tools.converter_tool.base_converter import ConverterUtil
from mace.python.tools.convert_util import mace_check
OPENCL_IMAGE_MAX_SIZE = 16384
class OpenCLBufferType(enum.Enum):
CONV2D_FILTER = 0
IN_OUT_CHANNEL = 1
ARGUMENT = 2
IN_OUT_HEIGHT = 3
IN_OUT_WIDTH = 4
WINOGRAD_FILTER = 5
DW_CONV2D_FILTER = 6
WEIGHT_HEIGHT = 7
WEIGHT_WIDTH = 8
class Transformer(base_converter.ConverterInterface):
"""A class for transform naive mace model to optimized model.
This Transformer should be platform irrelevant. So, do not assume
tensor name has suffix like ':0".
"""
def __init__(self, option, model):
# DO NOT reorder the following transformers
self._registered_transformers = [
self.remove_identity_op,
self.transform_global_pooling,
self.fold_softmax,
self.fold_batchnorm,
self.fold_conv_and_bn, # data_format related
self.fold_depthwise_conv_and_bn, # data_format related
self.transform_gpu_winograd, # data_format related
self.transform_add_to_biasadd,
self.fold_biasadd,
self.fold_activation,
self.transpose_filters,
self.transpose_data_format,
self.transform_global_conv_to_fc,
self.transform_buffer_image,
self.sort_by_execution,
]
self._option = option
self._model = model
self._ops = {}
self._consts = {}
self._consumers = {}
self._producer = {}
self._target_data_format = DataFormat.NHWC
if self._option.device == mace_pb2.CPU:
self._target_data_format = DataFormat.NCHW
def run(self):
for transformer in self._registered_transformers:
while True:
self.construct_ops_and_consumers()
changed = transformer()
if not changed:
break
return self._model
def filter_format(self):
filter_format_value = ConverterUtil.get_arg(self._model,
MaceKeyword.mace_filter_format_str).i # noqa
filter_format = None
if filter_format_value == FilterFormat.HWIO.value:
filter_format = FilterFormat.HWIO
elif filter_format_value == FilterFormat.OIHW.value:
filter_format = FilterFormat.OIHW
elif filter_format_value == FilterFormat.HWOI.value:
filter_format = FilterFormat.HWOI
else:
mace_check(False, "filter format %d not supported" %
filter_format_value)
return filter_format
def set_filter_format(self, filter_format):
arg = ConverterUtil.get_arg(self._model,
MaceKeyword.mace_filter_format_str)
arg.i = filter_format.value
def construct_ops_and_consumers(self):
self._ops.clear()
self._consumers.clear()
self._producer.clear()
for op in self._model.op:
self._ops[op.name] = op
for tensor in self._model.tensors:
self._consts[tensor.name] = tensor
for op in self._ops.values():
for input_tensor in op.input:
if input_tensor not in self._consumers:
self._consumers[input_tensor] = []
self._consumers[input_tensor].append(op)
for output_tensor in op.output:
self._producer[output_tensor] = op
for input_node in self._option.input_nodes.values():
op = mace_pb2.OperatorDef()
op.name = self.normalize_op_name(input_node.name)
op.type = 'Input'
op.output.extend([input_node.name])
output_shape = op.output_shape.add()
output_shape.dims.extend(input_node.shape)
if self._option.device == mace_pb2.CPU:
self.transpose_shape(output_shape.dims, [0, 3, 1, 2])
ConverterUtil.add_data_format_arg(op, DataFormat.NCHW)
else:
ConverterUtil.add_data_format_arg(op, DataFormat.NHWC)
self._producer[op.output[0]] = op
@staticmethod
def replace(obj_list, source, target):
for i in xrange(len(obj_list)):
if obj_list[i] == source:
obj_list[i] = target
@staticmethod
def transpose_shape(shape, order):
transposed_shape = []
for i in xrange(len(order)):
transposed_shape.append(shape[order[i]])
shape[:] = transposed_shape[:]
@staticmethod
def normalize_op_name(name):
return name.replace(':', '_')
def consumer_count(self, tensor_name):
return len(self._consumers.get(tensor_name, []))
def is_op_output_node(self, op):
output_node_tensor_names = [out for out in
self._option.output_nodes]
for output in op.output:
if output in output_node_tensor_names:
return True
return False
def replace_output_node(self, op):
"""if it is an output node, change output node to the op before it"""
if self.is_op_output_node(op):
real_output_node = self._producer[op.input[0]]
self.replace(real_output_node.output, op.input[0], op.output[0])
print("change %s to %s" % (real_output_node.name, op.name))
def remove_identity_op(self):
net = self._model
for op in net.op:
if op.type == 'Identity':
print("Remove identity: %s(%s)" % (op.name, op.type))
for consumer_op in self._consumers.get(op.output[0], []):
Transformer.replace(consumer_op.input, op.output[0],
op.input[0])
self.replace_output_node(op)
net.op.remove(op)
return True
return False
def transform_global_pooling(self):
net = self._model
for op in net.op:
if op.type == MaceOp.Pooling.name and \
ConverterUtil.get_arg(op,
MaceKeyword.mace_global_pooling_str) is not None: # noqa
print("Transform global pooling: %s(%s)" % (op.name, op.type))
input_shape = self._producer[op.input[0]].output_shape[0].dims
if ConverterUtil.data_format(op) == DataFormat.NHWC:
kernel_shape = input_shape[1:3]
else:
kernel_shape = input_shape[2:4]
ConverterUtil.get_arg(op,
MaceKeyword.mace_kernel_str).ints[:] \
= kernel_shape[:]
return False
def fold_batchnorm(self):
net = self._model
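# pattern being matched: x * scale_const followed by (+ offset_const or
# BiasAdd), which is typically what remains of a batch norm in a frozen
# graph; the pair is rewritten into a single FoldedBatchNorm op.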
for op in net.op:
if (op.type == MaceOp.Eltwise.name
and ConverterUtil.get_arg(
op, MaceKeyword.mace_element_type_str).i
== EltwiseType.PROD.value) \
and len(op.input) == 2 \
and op.input[1] in self._consts \
and self.consumer_count(op.output[0]) == 1 \
and not self.is_op_output_node(op):
consumer_op = self._consumers[op.output[0]][0]
if (consumer_op.type == MaceOp.Eltwise.name
and ConverterUtil.get_arg(
consumer_op, MaceKeyword.mace_element_type_str).i
== EltwiseType.SUM.value
or consumer_op.type == MaceOp.BiasAdd.name) \
and len(consumer_op.input) == 2 \
and consumer_op.input[1] in self._consts \
and len(self._consts[consumer_op.input[1]].dims) == 1:
print("Fold batchnorm: %s(%s)" % (op.name, op.type))
consumer_op.type = MaceOp.FoldedBatchNorm.name
inputs = [op.input[0], op.input[1], consumer_op.input[1]]
consumer_op.input[:] = inputs[:]
net.op.remove(op)
return True
return False
def fold_conv_and_bn(self):
net = self._model
for op in net.op:
if (op.type == MaceOp.Conv2D.name
or op.type == MaceOp.Deconv2D.name) \
and self.consumer_count(op.output[0]) == 1:
consumer_op = self._consumers[op.output[0]][0]
if consumer_op.type == MaceOp.FoldedBatchNorm.name:
print("Fold conv and bn: %s(%s)" % (op.name, op.type))
filter = self._consts[op.input[1]]
scale = self._consts[consumer_op.input[1]]
idx = 0
filter_format = self.filter_format()
if filter_format == FilterFormat.HWIO:
for hwi in xrange(filter.dims[0] * filter.dims[1]
* filter.dims[2]):
for o in xrange(filter.dims[3]):
filter.float_data[idx] *= scale.float_data[o]
idx += 1
elif filter_format == FilterFormat.OIHW:
for o in xrange(filter.dims[0]):
for hwi in xrange(filter.dims[1] * filter.dims[2]
* filter.dims[3]):
filter.float_data[idx] *= scale.float_data[o]
idx += 1
else:
mace_check(False, "filter format %s not supported" %
filter_format)
# change BN to BiasAdd
consumer_op.type = MaceOp.BiasAdd.name
del consumer_op.input[1]
# remove scale tensor
net.tensors.remove(scale)
return True
return False
def fold_depthwise_conv_and_bn(self):
net = self._model
for op in net.op:
if op.type == MaceOp.DepthwiseConv2d.name \
and self.consumer_count(op.output[0]) == 1:
consumer_op = self._consumers[op.output[0]][0]
if consumer_op.type == MaceOp.FoldedBatchNorm.name:
print("Fold depthwise conv and bn: %s(%s)"
% (op.name, op.type))
filter = self._consts[op.input[1]]
scale = self._consts[consumer_op.input[1]]
idx = 0
filter_format = self.filter_format()
if filter_format == FilterFormat.HWIO:
for hw in xrange(filter.dims[0] * filter.dims[1]):
for i in xrange(filter.dims[2]):
for o in xrange(filter.dims[3]):
filter.float_data[idx] *= scale.float_data[
i * filter.dims[3] + o]
idx += 1
elif filter_format == FilterFormat.OIHW:
for o in xrange(filter.dims[0]):
for i in xrange(filter.dims[1]):
for hw in xrange(filter.dims[2]
* filter.dims[3]):
filter.float_data[idx] *= scale.float_data[
i * filter.dims[0] + o]
idx += 1
else:
mace_check(False, "filter format %s not supported" %
filter_format)
# change BN to BiasAdd
consumer_op.type = MaceOp.BiasAdd.name
del consumer_op.input[1]
# remove scale tensor
net.tensors.remove(scale)
return True
return False
@staticmethod
def sort_feature_map_shape(shape, data_format):
"""Return shape in NHWC order"""
batch = shape[0]
if data_format == DataFormat.NHWC:
height = shape[1]
width = shape[2]
channels = shape[3]
else:
height = shape[2]
width = shape[3]
channels = shape[1]
return batch, height, width, channels
@staticmethod
def sort_filter_shape(filter_shape, filter_format):
"""Return filter shape in HWIO order"""
if filter_format == FilterFormat.HWIO:
filter_height = filter_shape[0]
filter_width = filter_shape[1]
in_channels = filter_shape[2]
out_channels = filter_shape[3]
elif filter_format == FilterFormat.OIHW:
filter_height = filter_shape[2]
filter_width = filter_shape[3]
in_channels = filter_shape[1]
out_channels = filter_shape[0]
elif filter_format == FilterFormat.HWOI:
filter_height = filter_shape[0]
filter_width = filter_shape[1]
in_channels = filter_shape[3]
out_channels = filter_shape[2]
else:
mace_check(False, "filter format %s not supported" % filter_format)
return filter_height, filter_width, in_channels, out_channels
def check_if_gpu_use_winograd_conv(self, op):
if not self._option.winograd_enabled:
return False
if op.type != MaceOp.Conv2D.name:
return False
filter_shape = self._consts[op.input[1]].dims
output_shape = op.output_shape[0].dims
strides = ConverterUtil.get_arg(op, MaceKeyword.mace_strides_str).ints
dilations_arg = ConverterUtil.get_arg(op,
MaceKeyword.mace_dilations_str)
if dilations_arg is None:
dilations = [1, 1]
else:
dilations = dilations_arg.ints
filter_height, filter_width, in_channels, out_channels = \
Transformer.sort_filter_shape(filter_shape, self.filter_format())
batch, out_height, out_width, _ = Transformer.sort_feature_map_shape(
output_shape, ConverterUtil.data_format(op))
if filter_height != 3 or filter_width != 3 or strides[0] > 1 \
or strides[1] > 1 or dilations[0] > 1 or dilations[1] > 1:
return False
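# width below appears to be the number of 2x2 output tiles per batch
# (Winograd F(2x2, 3x3)); the factor 16 is presumably the 4x4 transformed
# tile size. All three dimensions must fit within the OpenCL image limit.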
width = batch * ((out_height + 1) / 2) * ((out_width + 1) / 2)
return (16 * in_channels < OPENCL_IMAGE_MAX_SIZE) and \
(16 * out_channels < OPENCL_IMAGE_MAX_SIZE) and \
(width < OPENCL_IMAGE_MAX_SIZE)
def transform_gpu_winograd(self):
"""Only gpu needs winograd transform."""
net = self._model
filter_format = self.filter_format()
if self._option.device == mace_pb2.GPU:
for op in net.op:
if op.type == MaceOp.Conv2D.name \
and self.check_if_gpu_use_winograd_conv(op):
print("Transform gpu winograd %s(%s)" % (op.name, op.type))
output_shape = op.output_shape[0].dims
filter = self._consts[op.input[1]]
filter_shape = filter.dims
data_format = ConverterUtil.data_format(op)
filter_height, filter_width, in_channels, out_channels = \
Transformer.sort_filter_shape(filter_shape,
filter_format)
batch, out_height, out_width, _ = \
Transformer.sort_feature_map_shape(output_shape,
data_format)
# Input transform
wt_op = net.op.add()
wt_op.name = op.name + '_input_transform'
wt_op.type = MaceOp.WinogradTransform.name
wt_op.input.extend([op.input[0]])
wt_op.output.extend([wt_op.name])
wt_output_shape = wt_op.output_shape.add()
wt_output_width = batch * (
(out_height + 1) / 2) * ((out_width + 1) / 2)
wt_output_shape.dims.extend(
[16, in_channels, wt_output_width, 1])
arg = wt_op.arg.add()
arg.name = 'T'
arg.i = self._option.data_type
if ConverterUtil.get_arg(op,
MaceKeyword.mace_padding_str) \
is not None:
padding_arg = wt_op.arg.add()
padding_arg.name = MaceKeyword.mace_padding_str
padding_arg.i = ConverterUtil.get_arg(op,
MaceKeyword.mace_padding_str).i # noqa
elif ConverterUtil.get_arg(op,
MaceKeyword.mace_padding_values_str) is not None: # noqa
padding_arg = wt_op.arg.add()
padding_arg.name = MaceKeyword.mace_padding_values_str
padding_arg.ints.extend(ConverterUtil.get_arg(
op, MaceKeyword.mace_padding_values_str).ints)
# MatMul
matmul_op = net.op.add()
matmul_op.name = op.name + '_matmul'
matmul_op.type = MaceOp.MatMul.name
matmul_op.input.extend([op.input[1], wt_op.output[0]])
matmul_op.output.extend([matmul_op.name])
matmul_output_shape = matmul_op.output_shape.add()
matmul_output_shape.dims.extend(
[16, out_channels, wt_output_width, 1])
arg = matmul_op.arg.add()
arg.name = 'T'
arg.i = self._option.data_type
arg = matmul_op.arg.add()
arg.name = MaceKeyword.mace_winograd_filter_transformed
arg.i = 1
# Inverse transform
iwt_op = net.op.add()
iwt_op.name = op.name + '_inverse_transform'
iwt_op.type = MaceOp.WinogradInverseTransform.name
iwt_op.input.extend([matmul_op.output[0]])
# biasadd
if len(op.input) >= 3:
iwt_op.input.extend([op.input[2]])
iwt_op.output.extend(op.output)
iwt_output_shape = iwt_op.output_shape.add()
iwt_output_shape.dims.extend(op.output_shape[0].dims)
arg = iwt_op.arg.add()
arg.name = 'T'
arg.i = self._option.data_type
batch_arg = iwt_op.arg.add()
batch_arg.name = 'batch'
batch_arg.i = batch
height_arg = iwt_op.arg.add()
height_arg.name = 'height'
height_arg.i = out_height
width_arg = iwt_op.arg.add()
width_arg.name = 'width'
width_arg.i = out_width
ConverterUtil.add_data_format_arg(iwt_op, data_format)
filter_data = np.array(filter.float_data).reshape(
filter.dims)
weight_tensor_value = filter_data
if filter_format == FilterFormat.HWIO:
weight_tensor_value = filter_data.transpose(3, 2, 0, 1)
elif filter_format == FilterFormat.HWOI:
weight_tensor_value = filter_data.transpose(2, 3, 0, 1)
filter.float_data[:] = weight_tensor_value.flat[:]
filter.dims[:] = weight_tensor_value.shape[:]
net.op.remove(op)
return False
def transform_add_to_biasadd(self):
net = self._model
for op in net.op:
if op.type == 'Add' \
and len(op.input) == 2 \
and op.input[1] in self._consts \
and len(self._consts[op.input[1]].dims) == 1:
print("Transform add to biasadd: %s(%s)" % (op.name, op.type))
op.type = MaceOp.BiasAdd.name
return True
return False
def fold_biasadd(self):
net = self._model
for op in net.op:
if ((op.type == MaceOp.Conv2D.name
or op.type == MaceOp.Deconv2D.name
or op.type == MaceOp.DepthwiseConv2d.name
or op.type == MaceOp.FullyConnected.name
or op.type == MaceOp.WinogradInverseTransform.name)
and len(op.input) == 2) \
and len(self._consumers.get(op.output[0], [])) == 1:
consumer_op = self._consumers[op.output[0]][0]
if consumer_op.type == MaceOp.BiasAdd.name:
print("Fold biasadd: %s(%s)" % (op.name, op.type))
op.name = consumer_op.name
op.input.append(consumer_op.input[1])
op.output[0] = consumer_op.output[0]
net.op.remove(consumer_op)
return True
return False
def fold_activation(self):
net = self._model
for op in net.op:
if (op.type == MaceOp.Conv2D.name
or op.type == MaceOp.Deconv2D.name
or op.type == MaceOp.DepthwiseConv2d.name
or op.type == MaceOp.FullyConnected.name
or op.type == MaceOp.FoldedBatchNorm.name
or op.type == MaceOp.WinogradInverseTransform.name) \
and len(self._consumers.get(op.output[0], [])) == 1:
consumer_op = self._consumers[op.output[0]][0]
if consumer_op.type == MaceOp.Activation.name \
and ConverterUtil.get_arg(
consumer_op,
MaceKeyword.mace_activation_type_str).s != 'PRELU':
print("Fold activation: %s(%s)" % (op.name, op.type))
op.name = consumer_op.name
op.output[0] = consumer_op.output[0]
for arg in consumer_op.arg:
if arg.name == MaceKeyword.mace_activation_type_str \
or arg.name == MaceKeyword.mace_activation_max_limit_str: # noqa
op.arg.extend([arg])
net.op.remove(consumer_op)
return True
return False
def transpose_data_format(self):
net = self._model
for op in net.op:
# transpose args
if op.type == MaceOp.Pad.name:
for arg in op.arg:
if arg.name == MaceKeyword.mace_paddings_str and len(
arg.ints) == 4:
if ConverterUtil.data_format(op) == DataFormat.NHWC \
and self._target_data_format == DataFormat.NCHW: # noqa
print("Transpose pad args: %s(%s)"
% (op.name, op.type))
self.transpose_shape(arg.ints, [0, 3, 1, 2])
elif ConverterUtil.data_format(op) == DataFormat.NCHW \
and self._target_data_format == DataFormat.NHWC: # noqa
print("Transpose pad args: %s(%s)"
% (op.name, op.type))
self.transpose_shape(arg.ints, [0, 2, 3, 1])
elif op.type == MaceOp.Concat.name or op.type == MaceOp.Slice.name:
for arg in op.arg:
if arg.name == MaceKeyword.mace_axis_str:
if ConverterUtil.data_format(op) == DataFormat.NHWC \
and self._target_data_format == DataFormat.NCHW: # noqa
print("Transpose slice args: %s(%s)"
% (op.name, op.type))
mace_check(arg.i == 3,
'only support concat at '
'channel dimension')
arg.i = 1
elif ConverterUtil.data_format(op) == DataFormat.NCHW \
and self._target_data_format == DataFormat.NHWC: # noqa
print("Transpose slice args: %s(%s)"
% (op.name, op.type))
mace_check(arg.i == 1,
"only support concat at "
"channel dimension")
arg.i = 3
# transpose op output shape
data_format = ConverterUtil.data_format(op)
if data_format is not None \
and data_format != self._target_data_format:
print("Transpose output shapes: %s(%s)" % (op.name, op.type))
if self._target_data_format == DataFormat.NHWC: # NCHW -> NHWC
for output_shape in op.output_shape:
if len(output_shape.dims) == 4:
self.transpose_shape(output_shape.dims,
[0, 2, 3, 1])
else: # NHWC -> NCHW
for output_shape in op.output_shape:
if len(output_shape.dims) == 4:
self.transpose_shape(output_shape.dims,
[0, 3, 1, 2])
ConverterUtil.get_arg(op,
MaceKeyword.mace_data_format_str).i = \
self._target_data_format.value
# transpose input/output
if self._target_data_format == DataFormat.NCHW:
print("Transpose input/output to NCHW")
for input_node in self._option.input_nodes.values():
new_input_name = MaceKeyword.mace_input_node_name \
+ '_' + input_node.name
op = net.op.add()
op.name = self.normalize_op_name(input_node.name)
op.type = MaceOp.Transpose.name
op.input.extend([new_input_name])
op.output.extend([input_node.name])
output_shape = op.output_shape.add()
output_shape.dims.extend(input_node.shape)
dims_arg = op.arg.add()
dims_arg.name = MaceKeyword.mace_dims_str
dims_arg.ints.extend([0, 3, 1, 2])
arg = op.arg.add()
arg.name = 'T'
arg.i = self._option.data_type
for output_node in self._option.output_nodes.values():
output_name = MaceKeyword.mace_output_node_name \
+ '_' + output_node.name
op = self._model.op.add()
op.name = self.normalize_op_name(output_name)
op.type = MaceOp.Transpose.name
op.input.extend([output_node.name])
op.output.extend([output_name])
output_shape = op.output_shape.add()
output_shape.dims.extend(
self._producer[output_node.name].output_shape[0].dims)
self.transpose_shape(output_shape.dims, [0, 2, 3, 1])
dims_arg = op.arg.add()
dims_arg.name = MaceKeyword.mace_dims_str
dims_arg.ints.extend([0, 2, 3, 1])
arg = op.arg.add()
arg.name = 'T'
arg.i = self._option.data_type
return False
def transpose_filters(self):
net = self._model
filter_format = self.filter_format()
# TODO(liyin/liuqi): remove this if-condition after combine cpu/gpu
if self._option.device == mace_pb2.CPU:
print("Transpose filters to OIHW")
# transpose filter to OIHW/MIHW for tensorflow (HWIO/HWIM)
if filter_format == FilterFormat.HWIO:
for op in net.op:
if op.type == MaceOp.Conv2D.name \
or op.type == MaceOp.Deconv2D.name \
or op.type == MaceOp.DepthwiseConv2d.name:
if ConverterUtil.get_arg(op,
MaceKeyword.mace_winograd_filter_transformed) is None: # noqa
filter = self._consts[op.input[1]]
filter_data = np.array(filter.float_data).reshape(
filter.dims)
filter_data = filter_data.transpose(3, 2, 0, 1)
filter.float_data[:] = filter_data.flat
filter.dims[:] = filter_data.shape
self.set_filter_format(FilterFormat.OIHW)
elif self._option.device == mace_pb2.GPU:
# TODO(liyin/liuqi): remove this whole logic after combine cpu/gpu
print("Transpose filters to HWOI/HWIM")
for op in net.op:
if op.type == MaceOp.Conv2D.name \
or op.type == MaceOp.Deconv2D.name \
or op.type == MaceOp.DepthwiseConv2d.name:
filter = self._consts[op.input[1]]
filter_data = np.array(filter.float_data).reshape(
filter.dims)
# transpose filter to HWOI/HWIM for
# tensorflow and caffe (OIHW/MIHW)
if filter_format == FilterFormat.HWIO \
and (op.type == MaceOp.Conv2D.name
or op.type == MaceOp.Deconv2D.name):
filter_data = filter_data.transpose(0, 1, 3, 2)
filter.float_data[:] = filter_data.flat
filter.dims[:] = filter_data.shape
elif filter_format == FilterFormat.OIHW:
if op.type == MaceOp.Conv2D.name \
or op.type == MaceOp.Deconv2D.name:
filter_data = filter_data.transpose(2, 3, 0, 1)
filter.float_data[:] = filter_data.flat
filter.dims[:] = filter_data.shape
elif op.type == MaceOp.DepthwiseConv2d.name:
filter_data = filter_data.transpose(2, 3, 1, 0)
filter.float_data[:] = filter_data.flat
filter.dims[:] = filter_data.shape
if op.type == MaceOp.FullyConnected.name:
weight = self._consts[op.input[1]]
input_shape = list(self._producer[op.input[0]]
.output_shape[0].dims)
weight_shape = [weight.dims[0]] + input_shape[1:]
# OCHW -> OHWC
weight_data = np.array(weight.float_data).reshape(
weight_shape)
weight_data = weight_data.transpose(0, 2, 3, 1)
weight.float_data[:] = weight_data.flat
self.set_filter_format(FilterFormat.HWOI)
return False
def buffer_to_image(self, op, input_idx, input_type):
net = self._model
input_name = op.input[input_idx]
op_def = net.op.add()
op_def.name = input_name.replace(':', '_') + "_b2i"
output_name = op_def.name
op_def.type = MaceKeyword.mace_buffer_to_image
op_def.input.extend([input_name])
op_def.output.extend([output_name])
arg = op_def.arg.add()
arg.name = MaceKeyword.mace_buffer_type
arg.i = input_type.value
arg = op_def.arg.add()
arg.name = MaceKeyword.mace_mode
arg.i = 0
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self._option.data_type
op.input[input_idx] = output_name
def transform_buffer_image(self):
if self._option.device != mace_pb2.GPU:
return False
print("Transform buffer to image")
net = self._model
for op in net.op:
if op.type == MaceOp.Conv2D.name \
or op.type == MaceOp.Deconv2D.name:
self.buffer_to_image(op, 1, OpenCLBufferType.CONV2D_FILTER)
if len(op.input) >= 3:
self.buffer_to_image(op, 2, OpenCLBufferType.ARGUMENT)
elif op.type == MaceOp.DepthwiseConv2d.name:
self.buffer_to_image(op, 1, OpenCLBufferType.DW_CONV2D_FILTER)
if len(op.input) >= 3:
self.buffer_to_image(op, 2, OpenCLBufferType.ARGUMENT)
elif op.type == MaceOp.BiasAdd.name:
self.buffer_to_image(op, 1, OpenCLBufferType.ARGUMENT)
elif op.type == MaceOp.FoldedBatchNorm.name:
self.buffer_to_image(op, 1, OpenCLBufferType.ARGUMENT)
self.buffer_to_image(op, 2, OpenCLBufferType.ARGUMENT)
if len(op.input) >= 4:
self.buffer_to_image(op, 3, OpenCLBufferType.ARGUMENT)
elif op.type == MaceOp.MatMul.name and \
ConverterUtil.get_arg(op,
MaceKeyword.mace_winograd_filter_transformed) is not None: # noqa
self.buffer_to_image(op, 0, OpenCLBufferType.WINOGRAD_FILTER)
elif op.type == MaceOp.WinogradInverseTransform.name \
and len(op.input) >= 2:
self.buffer_to_image(op, 1, OpenCLBufferType.ARGUMENT)
elif op.type == MaceOp.FullyConnected.name:
self.buffer_to_image(op, 1, OpenCLBufferType.WEIGHT_WIDTH)
if len(op.input) >= 3:
self.buffer_to_image(op, 2, OpenCLBufferType.ARGUMENT)
elif op.type == MaceOp.Activation.name:
if ConverterUtil.get_arg(op,
MaceKeyword.mace_activation_type_str).s == ActivationType.PRELU.name: # noqa
self.buffer_to_image(op, 1, OpenCLBufferType.ARGUMENT)
for input_node in self._option.input_nodes.values():
new_input_name = MaceKeyword.mace_input_node_name \
+ '_' + input_node.name
op_def = self._model.op.add()
op_def.name = self.normalize_op_name(input_node.name)
op_def.type = MaceKeyword.mace_buffer_to_image
op_def.input.extend([new_input_name])
op_def.output.extend([input_node.name])
output_shape = op_def.output_shape.add()
output_shape.dims.extend(input_node.shape)
arg = op_def.arg.add()
arg.name = MaceKeyword.mace_buffer_type
arg.i = OpenCLBufferType.IN_OUT_CHANNEL.value
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self._option.data_type
for output_node in self._option.output_nodes.values():
output_name = MaceKeyword.mace_output_node_name \
+ '_' + output_node.name
op_def = self._model.op.add()
op_def.name = self.normalize_op_name(output_name)
op_def.type = MaceKeyword.mace_image_to_buffer
op_def.input.extend([output_node.name])
op_def.output.extend([output_name])
output_shape = op_def.output_shape.add()
output_shape.dims.extend(output_node.shape)
arg = op_def.arg.add()
arg.name = MaceKeyword.mace_buffer_type
arg.i = OpenCLBufferType.IN_OUT_CHANNEL.value
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self._option.data_type
return False
def fold_softmax(self):
changed = False
net = self._model
for op in net.op:
if op.type == MaceOp.Softmax.name:
print("Fold softmax: %s(%s)" % (op.name, op.type))
if self.consumer_count(op.output[0]) == 1:
consumer = self._consumers[op.output[0]][0]
if consumer.type == MaceOp.Reshape.name:
shape = ConverterUtil.get_arg(consumer,
MaceKeyword.mace_shape_str).ints # noqa
del op.output_shape[0].dims[:]
op.output_shape[0].dims.extend(shape)
self.replace_output_node(consumer)
net.op.remove(consumer)
changed = True
producer = self._producer[op.input[0]]
if producer.type == MaceOp.Reshape.name:
op.input[0] = producer.input[0]
self.replace_output_node(producer)
net.op.remove(producer)
changed = True
if len(op.output_shape[0].dims) < 4:
shape = ([1, 1, 1, 1] + list(op.output_shape[0].dims))[-4:]
op.output_shape[0].dims[:] = shape[:]
changed = True
if changed:
return True
return False
def transform_global_conv_to_fc(self):
"""Transform global conv to fc should be placed after transposing
input/output and filter"""
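# A convolution whose kernel covers the whole input feature map with no
# padding produces a 1x1 spatial output, which is equivalent to a
# fully-connected layer over the flattened input; the filter dims are
# reshaped to [out_channels, in_channels * kh * kw] below.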
if self._option.device == mace_pb2.GPU:
return False
net = self._model
for op in net.op:
if op.type == MaceOp.Conv2D.name:
producer = self._producer[op.input[0]]
input_shape = producer.output_shape[0].dims
batch, height, width, channels = self.sort_feature_map_shape(
input_shape, ConverterUtil.data_format(producer))
filter = self._consts[op.input[1]]
filter_shape = filter.dims
filter_height, filter_width, in_channels, out_channels = \
self.sort_filter_shape(filter_shape, self.filter_format())
zero_padding = True
padding_arg = ConverterUtil.get_arg(op,
MaceKeyword.mace_padding_str) # noqa
if padding_arg is not None:
if padding_arg.i != PaddingMode.VALID.value:
zero_padding = False
else:
padding_value_arg = ConverterUtil.get_arg(op,
MaceKeyword.mace_padding_values_str) # noqa
if padding_value_arg is not None:
if not all(v == 0 for v in padding_value_arg.ints):
zero_padding = False
if height == filter_height and width == filter_width \
and zero_padding:
print("transform global conv to fc %s(%s)"
% (op.name, op.type))
op.type = MaceOp.FullyConnected.name
filter.dims[:] = [out_channels,
in_channels * filter_width
* filter_height][:]
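# Illustrative example (hypothetical shapes, not from the original source):
# on the CPU path, a 7x7 VALID conv applied to a 1x7x7x2048 feature map
# covers the whole spatial extent, so it is equivalent to a fully-connected
# layer. With out_channels = 1000 the filter dims above are rewritten to
# [1000, 2048 * 7 * 7] = [1000, 100352].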
def sort_dfs(self, op, visited, sorted_nodes):
visited.update([op.name])
if len(op.input) > 0:
for input_tensor in op.input:
producer_op = self._producer.get(input_tensor, None)
if producer_op is None:
pass
elif producer_op.name not in visited:
self.sort_dfs(producer_op, visited, sorted_nodes)
sorted_nodes.append(op)
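# Illustrative note (not from the original source): sort_dfs appends an op
# only after all of its producers have been visited (post-order DFS), so for
# a chain conv -> bn -> softmax reached from the output node, sorted_nodes
# ends up as [conv, bn, softmax], i.e. a valid execution order.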
def sort_by_execution(self):
print("Sort by execution")
net = self._model
visited = set()
sorted_nodes = []
for output_node in self._option.output_nodes:
output_tensor = MaceKeyword.mace_output_node_name \
+ '_' + output_node
mace_check(output_tensor in self._producer,
"output_tensor %s not existed in model" % output_tensor)
self.sort_dfs(self._producer[output_tensor], visited, sorted_nodes)
del net.op[:]
net.op.extend(sorted_nodes)
return False
......@@ -129,7 +129,7 @@ class MemoryOptimizer(object):
self.idle_mem.remove(mem_id)
if mem_id == -1:
mem_id = self.total_mem_count
mem_id = self.mem_id_base() + self.total_mem_count
self.total_mem_count += 1
self.mem_block[mem_id] = op_mem_block
......@@ -147,10 +147,13 @@ class MemoryOptimizer(object):
self.add_net_mem_blocks()
print('total op: %d', len(self.net_def.op))
print('origin mem: %d, optimized mem: %d',
print("total op: %d" % len(self.net_def.op))
print("origin mem: %d, optimized mem: %d" % (
self.get_total_origin_mem_size(),
self.get_total_optimized_mem_size())
self.get_total_optimized_mem_size()))
def mem_id_base(self):
return 0
class GPUMemoryOptimizer(MemoryOptimizer):
......@@ -189,6 +192,9 @@ class GPUMemoryOptimizer(MemoryOptimizer):
block.x = self.mem_block[mem][0]
block.y = self.mem_block[mem][1]
def mem_id_base(self):
return 20000
def optimize_gpu_memory(net_def):
mem_optimizer = GPUMemoryOptimizer(net_def)
......
......@@ -84,11 +84,20 @@ def obfuscate_name(net_def):
op.output[i] = in_out_map[op.output[i]]
def normalize_op_name(op_name):
idx = op_name.rfind(':')
if idx == -1:
return op_name
else:
return op_name[:idx]
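# Illustrative example (not from the original source):
#   normalize_op_name('conv1/BiasAdd:0') -> 'conv1/BiasAdd'
#   normalize_op_name('weights')         -> 'weights'  (no ':' present)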
def rename_tensor(net_def):
tensor_map = {}
for t in net_def.tensors:
if t.name not in tensor_map:
tensor_map[t.name] = "_" + t.name[:-2].replace("/", "_")
tensor_map[t.name] = "_" + normalize_op_name(t.name).replace("/",
"_")
t.name = tensor_map[t.name]
for op in net_def.op:
for i in range(len(op.input)):
......@@ -118,6 +127,8 @@ class TensorInfo:
elif t.data_type == mace_pb2.DT_UINT8:
self.data = bytearray(
np.array(t.int32_data).astype(np.uint8).tolist())
else:
raise Exception('Tensor data type %s not supported' % t.data_type)
def stringfy(value):
......
# Copyright 2018 Xiaomi, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from mace.proto import mace_pb2
import tensorflow as tf
import numpy as np
import math
import copy
from tensorflow import gfile
from mace.python.tools import memory_optimizer
from tensorflow.core.framework import graph_pb2
from tensorflow.core.framework import tensor_shape_pb2
padding_mode = {'VALID': 0, 'SAME': 1, 'FULL': 2}
pooling_type_mode = {'AvgPool': 1, 'MaxPool': 2}
# The order should be the same as the eltwise types in
# mace/kernels/eltwise.h and the cwise types in mace/kernels/cwise.h,
# because these math ops must stay compatible with "EltWise" and "CWise".
math_type_mode = {
'ADD': 0,
'SUB': 1,
'MUL': 2,
'DIV': 3,
'MIN': 4,
'MAX': 5,
'NEG': 6,
'ABS': 7,
'SQR_DIFF': 8,
'POW': 9,
}
buffer_type_map = {
'CONV2D_FILTER': 0,
'IN_OUT_CHANNEL': 1,
'ARGUMENT': 2,
'IN_OUT_HEIGHT': 3,
'IN_OUT_WIDTH': 4,
'WINOGRAD_FILTER': 5,
'DW_CONV2D_FILTER': 6,
}
data_type_map = {'DT_HALF': mace_pb2.DT_HALF, 'DT_FLOAT': mace_pb2.DT_FLOAT}
activation_name_map = {
'Relu': 'RELU',
'Sigmoid': 'SIGMOID',
'Tanh': 'TANH',
'Relu6': 'RELUX'
}
BATCH_NORM_ORDER = ["Add", "Rsqrt", "Mul", "Mul", "Mul", "Sub", "Add"]
MACE_INPUT_NODE_NAME = "mace_input_node"
MACE_OUTPUT_NODE_NAME = "mace_output_node"
OPENCL_IMAGE_MAX_SIZE = 16384
def get_input_tensor(op, index):
input_tensor = op.inputs[index]
if input_tensor.op.type == 'Reshape':
input_tensor = get_input_tensor(input_tensor.op, 0)
return input_tensor
class TFConverter(object):
def __init__(self, graph, tf_ops, net_def, dt, device, winograd):
self.graph = graph
self.net_def = net_def
self.tf_ops = tf_ops
self.dt = dt
self.device = device
self.winograd = winograd
self.tf_graph = {}
self.tf_parents = {}
self.resolved_ops = {}
self.unused_tensor = set()
self.transpose_filter_tensor = {}
self.reshape_tensor = {}
self.ops = {}
for op in tf_ops:
self.ops[op.name] = op
for op in tf_ops:
self.resolved_ops[op.name] = 0
for input in op.inputs:
input_name = input.name[:-2]
if input_name not in self.tf_graph:
self.tf_graph[input_name] = []
self.tf_graph[input_name].append(op)
if op.name not in self.tf_parents:
self.tf_parents[op.name] = []
self.tf_parents[op.name].append(self.ops[input_name])
def add_buffer_to_image(self, input_name, input_type):
output_name = input_name[:-2] + "_b2i" + input_name[-2:]
op_def = self.net_def.op.add()
op_def.name = output_name[:-2]
op_def.type = 'BufferToImage'
op_def.input.extend([input_name])
op_def.output.extend([output_name])
arg = op_def.arg.add()
arg.name = 'buffer_type'
arg.i = buffer_type_map[input_type]
arg = op_def.arg.add()
arg.name = 'mode'
arg.i = 0
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
return output_name
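# Illustrative example (hypothetical tensor name, not from the original
# source): add_buffer_to_image('conv1/weights:0', 'CONV2D_FILTER') appends a
# BufferToImage op named 'conv1/weights_b2i' whose input is 'conv1/weights:0'
# and whose output (the returned name) is 'conv1/weights_b2i:0'.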
def add_image_to_buffer(self, input_name, input_type):
output_name = input_name[:-2] + "_i2b" + input_name[-2:]
op_def = self.net_def.op.add()
op_def.name = output_name[:-2]
op_def.type = 'ImageToBuffer'
op_def.input.extend([input_name])
op_def.output.extend([output_name])
arg = op_def.arg.add()
arg.name = 'buffer_type'
arg.i = buffer_type_map[input_type]
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
return output_name
def add_gpu_input_transform(self, names):
for name in names:
new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
op_def.name = name
op_def.type = 'BufferToImage'
op_def.input.extend([new_input_name])
op_def.output.extend([name + ':0'])
epsilon_arg = op_def.arg.add()
epsilon_arg.name = 'buffer_type'
epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL']
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
self.add_output_shape(self.ops[name].outputs, op_def)
def add_cpu_input_transform(self, names):
for name in names:
new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
op_def.name = name
op_def.type = 'Transpose'
op_def.input.extend([new_input_name])
op_def.output.extend([name + ':0'])
dims_arg = op_def.arg.add()
dims_arg.name = 'dims'
dims_arg.ints.extend([0, 3, 1, 2])
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
self.add_output_shape(self.ops[name].outputs, op_def)
def add_gpu_output_transform(self, names):
for name in names:
output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
op_def.name = output_name[:-2]
op_def.type = 'ImageToBuffer'
op_def.input.extend([name + ':0'])
op_def.output.extend([output_name])
epsilon_arg = op_def.arg.add()
epsilon_arg.name = 'buffer_type'
epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL']
def add_cpu_output_transform(self, names):
for name in names:
output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
op_def.name = output_name[:-2]
op_def.type = 'Transpose'
op_def.input.extend([name + ':0'])
op_def.output.extend([output_name])
dims_arg = op_def.arg.add()
dims_arg.name = 'dims'
dims_arg.ints.extend([0, 2, 3, 1])
output_shapes = []
for output in self.ops[name].outputs:
old_shape = output.shape.as_list()
# NCHW -> NHWC
if len(old_shape) == 2:
new_shape = [old_shape[0], 1, 1, old_shape[1]]
else:
new_shape = [old_shape[0], old_shape[2],
old_shape[3], old_shape[1]]
output_shape = mace_pb2.OutputShape()
output_shape.dims.extend(new_shape)
output_shapes.append(output_shape)
op_def.output_shape.extend(output_shapes)
def add_output_shape(self, outputs, op):
output_shapes = []
for output in outputs:
old_shape = []
if isinstance(output, list):
old_shape = output
elif isinstance(output, tf.Tensor):
if output.shape.num_elements() is not None:
old_shape = output.shape.as_list()
else:
raise ValueError('output type not supported: ', type(output))
if len(old_shape) == 2:
old_shape = [old_shape[0], old_shape[1], 1, 1]
if self.device == 'cpu': # NHWC -> NCHW
old_shape = [old_shape[0], old_shape[3],
old_shape[1], old_shape[2]]
output_shape = mace_pb2.OutputShape()
output_shape.dims.extend(old_shape)
output_shapes.append(output_shape)
op.output_shape.extend(output_shapes)
def add_tensor(self, name, shape, tf_dt, value):
tensor = self.net_def.tensors.add()
tensor.name = name
shape = list(shape)
tensor.dims.extend(shape)
if tf_dt == tf.float32:
tensor.data_type = mace_pb2.DT_FLOAT
tensor.float_data.extend(value.flat)
elif tf_dt == tf.int32:
tensor.data_type = mace_pb2.DT_INT32
tensor.int32_data.extend(value.flat)
else:
raise Exception("Not supported tensor type: " + tf_dt.name)
def convert_reshape(self, op):
input_tensor = get_input_tensor(op, 0)
shape_tensor = get_input_tensor(op, 1)
shape_value = shape_tensor.eval().astype(np.int32)
self.unused_tensor.add(shape_tensor.name)
self.reshape_tensor[input_tensor.name] = shape_value
self.resolved_ops[op.name] = 1
def convert_tensor(self, op):
output_name = op.outputs[0].name
if output_name not in self.unused_tensor:
tensor = self.net_def.tensors.add()
tf_tensor = op.outputs[0].eval()
if output_name in self.transpose_filter_tensor:
tf_tensor = tf_tensor.transpose(
self.transpose_filter_tensor[output_name])
if output_name in self.reshape_tensor:
tf_tensor = tf_tensor.reshape(self.reshape_tensor[output_name])
tensor.name = op.outputs[0].name
shape = list(tf_tensor.shape)
tensor.dims.extend(shape)
tf_dt = op.get_attr('dtype')
if tf_dt == tf.float32:
tensor.data_type = mace_pb2.DT_FLOAT
tensor.float_data.extend(tf_tensor.astype(np.float32).flat)
elif tf_dt == tf.int32:
tensor.data_type = mace_pb2.DT_INT32
tensor.int32_data.extend(tf_tensor.astype(np.int32).flat)
else:
raise Exception("Not supported tensor type: " + tf_dt.name)
self.resolved_ops[op.name] = 1
def check_winograd_conv(self, op):
filter_shape = get_input_tensor(op, 1).shape.as_list()
strides = op.get_attr('strides')[1:3]
output_shape = op.outputs[0].shape.as_list()
if len(output_shape) == 0 or output_shape[0] is None:
return False
width = output_shape[0] * ((output_shape[1] + 1) / 2) * ((
output_shape[2] + 1) / 2)
if self.winograd and op.type != 'DepthwiseConv2dNative' and \
filter_shape[0] == 3 and \
(filter_shape[0] == filter_shape[1]) and \
(strides[0] == 1) and (strides[0] == strides[1]):
if self.device == 'gpu':
return (16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \
(16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \
(width < OPENCL_IMAGE_MAX_SIZE)
elif self.device == 'cpu':
return filter_shape[2] >= 8 and filter_shape[3] >= 8
return False
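# Illustrative example (hypothetical shapes, not from the original source),
# assuming the winograd flag is enabled: a Conv2D with a 3x3 stride-1 filter
# of shape [3, 3, 64, 128] and output shape [1, 56, 56, 128] gives
#   width = 1 * ((56 + 1) / 2) * ((56 + 1) / 2) = 28 * 28 = 784
# On GPU the check passes because 16 * 64, 16 * 128 and 784 are all below
# OPENCL_IMAGE_MAX_SIZE (16384); on CPU it passes because both channel
# counts are >= 8.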
def convert_winograd_conv_gpu(self, op):
filter_tensor = get_input_tensor(op, 1)
filter_shape = filter_tensor.shape.as_list()
output_shape = op.outputs[0].shape.as_list()
self.transpose_filter_tensor[filter_tensor.name] = (3, 2, 0, 1)
filter_name = self.add_buffer_to_image(op.inputs[1].name,
"WINOGRAD_FILTER")
# Input transform
wt_op = mace_pb2.OperatorDef()
arg = wt_op.arg.add()
arg.name = 'T'
arg.i = self.dt
padding_arg = wt_op.arg.add()
padding_arg.name = 'padding'
padding_arg.i = padding_mode[op.get_attr('padding')]
wt_op.name = op.name + '_input_transform'
wt_op.type = 'WinogradTransform'
wt_op.input.extend([op.inputs[0].name])
wt_output_name = wt_op.name + ":0"
wt_op.output.extend([wt_output_name])
wt_output_shape = mace_pb2.OutputShape()
wt_output_width = output_shape[0] * ((output_shape[1] + 1) / 2) * ((
output_shape[2] + 1) / 2)
wt_output_shape.dims.extend([16, filter_shape[2], wt_output_width, 1])
wt_op.output_shape.extend([wt_output_shape])
# MatMul
matmul_op = mace_pb2.OperatorDef()
arg = matmul_op.arg.add()
arg.name = 'T'
arg.i = self.dt
matmul_op.name = op.name + '_matmul'
matmul_op.type = 'MatMul'
matmul_op.input.extend([filter_name, wt_output_name])
matmul_output_name = matmul_op.name + ":0"
matmul_op.output.extend([matmul_output_name])
matmul_output_shape = mace_pb2.OutputShape()
matmul_output_shape.dims.extend(
[16, filter_shape[3], wt_output_width, 1])
matmul_op.output_shape.extend([matmul_output_shape])
# Inverse transform
iwt_op = mace_pb2.OperatorDef()
arg = iwt_op.arg.add()
arg.name = 'T'
arg.i = self.dt
batch_arg = iwt_op.arg.add()
batch_arg.name = 'batch'
batch_arg.i = output_shape[0]
height_arg = iwt_op.arg.add()
height_arg.name = 'height'
height_arg.i = output_shape[1]
width_arg = iwt_op.arg.add()
width_arg.name = 'width'
width_arg.i = output_shape[2]
iwt_op.name = op.name + '_inverse_transform'
iwt_op.type = 'WinogradInverseTransform'
iwt_op.input.extend([matmul_output_name])
final_op = op
self.resolved_ops[op.name] = 1
if len(self.tf_graph[op.name]
) == 1 and self.tf_graph[op.name][0].type == 'BiasAdd':
bias_add_op = self.tf_graph[op.name][0]
output_name = self.add_buffer_to_image(
get_input_tensor(bias_add_op, 1).name, "ARGUMENT")
iwt_op.input.extend([output_name])
final_op = bias_add_op
self.resolved_ops[bias_add_op.name] = 1
if len(self.tf_graph[final_op.name]) == 1 and \
self.tf_graph[final_op.name][0].type in activation_name_map:
activation_op = self.tf_graph[final_op.name][0]
fused_act_arg = iwt_op.arg.add()
fused_act_arg.name = 'activation'
fused_act_arg.s = activation_name_map[activation_op.type]
if activation_op.type == 'Relu6':
max_limit_arg = iwt_op.arg.add()
max_limit_arg.name = 'max_limit'
max_limit_arg.f = 6
final_op = activation_op
self.resolved_ops[activation_op.name] = 1
iwt_op.output.extend([output.name for output in final_op.outputs])
self.add_output_shape(final_op.outputs, iwt_op)
self.net_def.op.extend([wt_op, matmul_op, iwt_op])
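# Illustrative shape walkthrough (hypothetical sizes, not from the original
# source) for the three ops built above, assuming a [3, 3, 64, 128] filter
# and a [1, 56, 56, 128] conv output (so wt_output_width = 1 * 28 * 28 = 784):
#   WinogradTransform        -> [16,  64, 784, 1]
#   MatMul (with the filter) -> [16, 128, 784, 1]
#   WinogradInverseTransform -> [1, 56, 56, 128]  (via batch/height/width args)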
def convert_conv_winograd_filter_cpu(self, op, op_def):
weight_tensor = get_input_tensor(op, 1)
weight_tensor_value = weight_tensor.eval().astype(np.float32)
input_shape = get_input_tensor(op, 0).shape.as_list()
output_channels = weight_tensor_value.shape[3]
input_channels = weight_tensor_value.shape[2]
# HWIO -> OIHW
weight_tensor_value = weight_tensor_value.transpose(3, 2, 0, 1)
if input_shape[1] > 16 and input_shape[2] > 16:
G = np.array([
[1.0, 0.0, 0.0],
[-2.0 / 9, -2.0 / 9, -2.0 / 9],
[-2.0 / 9, 2.0 / 9, -2.0 / 9],
[1.0 / 90, 1.0 / 45, 2.0 / 45],
[1.0 / 90, -1.0 / 45, 2.0 / 45],
[1.0 / 45, 1.0 / 90, 1.0 / 180],
[1.0 / 45, -1.0 / 90, 1.0 / 180],
[0.0, 0.0, 1.0]
], dtype=np.float32)
new_shape = [64, output_channels, input_channels] # TOC
else:
G = np.array([
[1.0, 0.0, 0.0],
[0.5, 0.5, 0.5],
[0.5, -0.5, 0.5],
[0.0, 0.0, 1.0],
], dtype=np.float32)
new_shape = [16, output_channels, input_channels] # TOC
new_weight_value = G.dot(weight_tensor_value).dot(G.T) # [t, O, I, t]
new_weight_value = new_weight_value.transpose(0, 3, 1, 2)
new_weight_value = new_weight_value.reshape(new_shape)
new_tensor_name = weight_tensor.name[:-2] + '/winograd_transformed:0'
self.add_tensor(new_tensor_name, new_shape,
tf.float32, new_weight_value)
winograd_transformed_arg = op_def.arg.add()
winograd_transformed_arg.name = 'is_filter_transformed'
winograd_transformed_arg.i = 1
self.unused_tensor.add(weight_tensor.name)
op_def.input.extend([op.inputs[0].name])
op_def.input.extend([new_tensor_name])
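# Minimal self-contained numpy sketch (hypothetical channel counts, not part
# of the original source) of the filter-transform shape algebra used above
# for the small-input case: G is 4x3, the OIHW weight is (O, I, 3, 3), and
# G.dot(w).dot(G.T) yields a (4, O, I, 4) tile tensor that is transposed and
# flattened to (16, O, I).
import numpy as np  # noqa

_G = np.array([[1.0, 0.0, 0.0],
               [0.5, 0.5, 0.5],
               [0.5, -0.5, 0.5],
               [0.0, 0.0, 1.0]], dtype=np.float32)
_w = np.random.rand(32, 16, 3, 3).astype(np.float32)  # O=32, I=16 (made up)
_t = _G.dot(_w).dot(_G.T)                              # (4, 32, 16, 4)
_t = _t.transpose(0, 3, 1, 2).reshape(16, 32, 16)      # (16, O, I)
assert _t.shape == (16, 32, 16)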
def convert_conv2d(self, op):
use_winograd = False
if self.device == 'cpu':
use_winograd = self.check_winograd_conv(op)
op_def = mace_pb2.OperatorDef()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
if op.type == 'DepthwiseConv2dNative':
op_def.type = 'DepthwiseConv2d'
else:
op_def.type = op.type
if self.device == 'cpu' and not use_winograd:
self.transpose_filter_tensor[get_input_tensor(
op, 1).name] = (3, 2, 0, 1)
elif op.type == 'Conv2D':
self.transpose_filter_tensor[get_input_tensor(
op, 1).name] = (0, 1, 3, 2)
if self.device == 'gpu':
op_def.input.extend([op.inputs[0].name])
if op_def.type == 'DepthwiseConv2d':
buffer_type = "DW_CONV2D_FILTER"
else:
buffer_type = "CONV2D_FILTER"
output_name = self.add_buffer_to_image(
get_input_tensor(op, 1).name, buffer_type)
op_def.input.extend([output_name])
elif self.device == 'cpu' and use_winograd:
self.convert_conv_winograd_filter_cpu(op, op_def)
else:
op_def.input.extend(
[get_input_tensor(op, i).name for i in range(len(op.inputs))])
padding_arg = op_def.arg.add()
padding_arg.name = 'padding'
padding_arg.i = padding_mode[op.get_attr('padding')]
strides_arg = op_def.arg.add()
strides_arg.name = 'strides'
strides_arg.ints.extend(op.get_attr('strides')[1:3])
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
if self.device == 'cpu':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
final_op = op
self.resolved_ops[op.name] = 1
if len(self.tf_graph.get(op.name, [])) == 1 and \
self.tf_graph[op.name][0].type == 'BiasAdd' or \
(len(self.tf_graph[op.name]) == 1 and
self.tf_graph[op.name][0].type == 'Add' and
len(self.tf_graph[op.name][0].inputs) == 2 and
len(self.graph.get_tensor_by_name(
self.tf_graph[op.name][0].inputs[1].name).shape) == 1):
bias_add_op = self.tf_graph[op.name][0]
if self.device == 'gpu':
output_name = self.add_buffer_to_image(
get_input_tensor(bias_add_op, 1).name, "ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([get_input_tensor(bias_add_op, 1).name])
final_op = bias_add_op
self.resolved_ops[bias_add_op.name] = 1
if len(self.tf_graph.get(final_op.name, [])) == 1 and \
self.tf_graph[final_op.name][0].type in activation_name_map:
activation_op = self.tf_graph[final_op.name][0]
fused_act_arg = op_def.arg.add()
fused_act_arg.name = 'activation'
fused_act_arg.s = activation_name_map[activation_op.type]
if activation_op.type == 'Relu6':
max_limit_arg = op_def.arg.add()
max_limit_arg.name = 'max_limit'
max_limit_arg.f = 6
final_op = activation_op
self.resolved_ops[activation_op.name] = 1
op_def.output.extend([output.name for output in final_op.outputs])
self.add_output_shape(final_op.outputs, op_def)
self.net_def.op.extend([op_def])
def convert_deconv2d(self, op):
op_def = mace_pb2.OperatorDef()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = 'Deconv2D'
out_shape_value = None
if len(op.inputs) == 2:
out_shape_value = op.get_attr('output_shape')
if self.device == 'cpu':
self.transpose_filter_tensor[get_input_tensor(
op, 1).name] = (3, 2, 0, 1)
else:
self.transpose_filter_tensor[get_input_tensor(
op, 1).name] = (0, 1, 3, 2)
if self.device == 'gpu':
op_def.input.extend([op.inputs[0].name])
buffer_type = "CONV2D_FILTER"
output_name = self.add_buffer_to_image(
get_input_tensor(op, 1).name, buffer_type)
op_def.input.extend([output_name])
else:
op_def.input.extend(
[get_input_tensor(op, i).name
for i in range(len(op.inputs))])
elif len(op.inputs) == 3:
out_shape_value = \
get_input_tensor(op, 0).eval().astype(np.int32).flat
self.unused_tensor.add(op.inputs[0].name)
if self.device == 'cpu':
self.transpose_filter_tensor[get_input_tensor(
op, 1).name] = (2, 3, 0, 1)
else:
self.transpose_filter_tensor[get_input_tensor(
op, 1).name] = (0, 1, 2, 3)
if self.device == 'gpu':
op_def.input.extend([op.inputs[2].name])
buffer_type = "CONV2D_FILTER"
output_name = self.add_buffer_to_image(
get_input_tensor(op, 1).name, buffer_type)
op_def.input.extend([output_name])
else:
op_def.input.extend([op.inputs[2].name])
op_def.input.extend([op.inputs[1].name])
else:
raise Exception('Too many inputs. Op: %s, type: %s' % (op.name,
op.type))
if out_shape_value is not None:
out_shape_arg = op_def.arg.add()
out_shape_arg.name = 'output_shape'
out_shape_arg.ints.extend(out_shape_value)
padding_arg = op_def.arg.add()
padding_arg.name = 'padding'
padding_arg.i = padding_mode[op.get_attr('padding')]
strides_arg = op_def.arg.add()
strides_arg.name = 'strides'
strides_arg.ints.extend(op.get_attr('strides')[1:3])
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
if self.device == 'cpu':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
final_op = op
self.resolved_ops[op.name] = 1
if len(self.tf_graph.get(op.name, [])) == 1 and \
self.tf_graph[op.name][0].type == 'BiasAdd':
bias_add_op = self.tf_graph[op.name][0]
if self.device == 'gpu':
output_name = self.add_buffer_to_image(
get_input_tensor(bias_add_op, 1).name, "ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([get_input_tensor(bias_add_op, 1).name])
final_op = bias_add_op
self.resolved_ops[bias_add_op.name] = 1
if len(self.tf_graph.get(final_op.name, [])) == 1 and \
self.tf_graph[final_op.name][0].type in activation_name_map:
activation_op = self.tf_graph[final_op.name][0]
fused_act_arg = op_def.arg.add()
fused_act_arg.name = 'activation'
fused_act_arg.s = activation_name_map[activation_op.type]
if activation_op.type == 'Relu6':
max_limit_arg = op_def.arg.add()
max_limit_arg.name = 'max_limit'
max_limit_arg.f = 6
final_op = activation_op
self.resolved_ops[activation_op.name] = 1
op_def.output.extend([output.name for output in final_op.outputs])
self.add_output_shape(final_op.outputs, op_def)
self.net_def.op.extend([op_def])
def check_conv_to_fc(self, op):
if self.device != 'cpu' or op.type != "Conv2D":
return False
filter_shape = get_input_tensor(op, 1).shape.as_list()
input_shape = get_input_tensor(op, 0).shape.as_list()
return input_shape[1] == filter_shape[0] \
and input_shape[2] == filter_shape[1] \
and (op.get_attr('padding') == 'VALID' or filter_shape[0] == 1
and filter_shape[1] == 1)
def convert_global_conv_to_fc(self, op):
op_def = mace_pb2.OperatorDef()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = 'FC'
self.transpose_filter_tensor[get_input_tensor(op, 1).name] = \
(3, 2, 0, 1)
filter_shape = get_input_tensor(op, 1).shape.as_list()
self.reshape_tensor[get_input_tensor(op, 1).name] = \
[filter_shape[3],
filter_shape[2] * filter_shape[1] * filter_shape[0], 1, 1]
op_def.input.extend(
[get_input_tensor(op, i).name for i in range(len(op.inputs))])
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
data_format_arg.s = 'NCHW'
final_op = op
self.resolved_ops[op.name] = 1
if len(self.tf_graph.get(op.name, [])) == 1 and \
self.tf_graph[op.name][0].type == 'BiasAdd':
bias_add_op = self.tf_graph[op.name][0]
if self.device == 'gpu':
output_name = self.add_buffer_to_image(
get_input_tensor(bias_add_op, 1).name, "ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([get_input_tensor(bias_add_op, 1).name])
final_op = bias_add_op
self.resolved_ops[bias_add_op.name] = 1
if len(self.tf_graph.get(final_op.name, [])) == 1 and \
self.tf_graph[final_op.name][0].type in activation_name_map:
activation_op = self.tf_graph[final_op.name][0]
fused_act_arg = op_def.arg.add()
fused_act_arg.name = 'activation'
fused_act_arg.s = activation_name_map[activation_op.type]
if activation_op.type == 'Relu6':
max_limit_arg = op_def.arg.add()
max_limit_arg.name = 'max_limit'
max_limit_arg.f = 6
final_op = activation_op
self.resolved_ops[activation_op.name] = 1
op_def.output.extend([output.name for output in final_op.outputs])
self.add_output_shape(final_op.outputs, op_def)
self.net_def.op.extend([op_def])
def convert_fused_batchnorm(self, op):
op_def = mace_pb2.OperatorDef()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
if self.device == 'cpu':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
op_def.name = op.name
op_def.type = 'FoldedBatchNorm'
gamma_tensor = get_input_tensor(op, 1)
for i in range(1, 5):
input_tensor = get_input_tensor(op, i)
assert input_tensor.shape == gamma_tensor.shape
self.unused_tensor.add(input_tensor.name)
gamma_value = get_input_tensor(op, 1).eval().astype(np.float32)
beta_value = get_input_tensor(op, 2).eval().astype(np.float32)
mean_value = get_input_tensor(op, 3).eval().astype(np.float32)
var_value = get_input_tensor(op, 4).eval().astype(np.float32)
epsilon_value = op.get_attr('epsilon')
scale_value = ((1.0 / np.vectorize(math.sqrt)
(var_value + epsilon_value)) * gamma_value)
offset_value = (-mean_value * scale_value) + beta_value
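# Derivation note (added for clarity): the fused batch norm
#   y = gamma * (x - mean) / sqrt(var + epsilon) + beta
# is rewritten here as y = scale * x + offset with
#   scale  = gamma / sqrt(var + epsilon)
#   offset = beta - mean * scale
# which is exactly what scale_value and offset_value compute element-wise.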
idx = gamma_tensor.name.rfind('/')
name_prefix = gamma_tensor.name[:idx] + '/'
input_names = [name_prefix + 'scale:0', name_prefix + 'offset:0']
self.add_tensor(input_names[0], gamma_value.shape, gamma_tensor.dtype,
scale_value)
self.add_tensor(input_names[1], gamma_value.shape, gamma_tensor.dtype,
offset_value)
op_def.input.extend([op.inputs[0].name])
if self.device == 'gpu':
for name in input_names:
output_name = self.add_buffer_to_image(name, "ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([name for name in input_names])
self.resolved_ops[op.name] = 1
final_op = op
if len(self.tf_graph[op.name]) == 1 \
and self.tf_graph[op.name][0].type in activation_name_map:
activation_op = self.tf_graph[op.name][0]
fused_act_arg = op_def.arg.add()
fused_act_arg.name = 'activation'
fused_act_arg.s = activation_name_map[activation_op.type]
if activation_op.type == 'Relu6':
max_limit_arg = op_def.arg.add()
max_limit_arg.name = 'max_limit'
max_limit_arg.f = 6
final_op = activation_op
self.resolved_ops[activation_op.name] = 1
op_def.output.extend([final_op.outputs[0].name])
self.add_output_shape([final_op.outputs[0]], op_def)
self.net_def.op.extend([op_def])
def convert_batchnorm(self, op):
op_def = mace_pb2.OperatorDef()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
if self.device == 'cpu':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
op_def.name = op.name
op_def.type = 'FoldedBatchNorm'
add_op = self.tf_graph[op.name][0]
scale_tensor = get_input_tensor(op, 1)
offset_tensor = get_input_tensor(add_op, 1)
input_names = [scale_tensor.name, offset_tensor.name]
op_def.input.extend([op.inputs[0].name])
if self.device == 'gpu':
for name in input_names:
output_name = self.add_buffer_to_image(name, "ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([name for name in input_names])
self.resolved_ops[op.name] = 1
self.resolved_ops[add_op.name] = 1
final_op = add_op
if len(self.tf_graph[op.name]) == 1 \
and self.tf_graph[op.name][0].type in activation_name_map:
activation_op = self.tf_graph[op.name][0]
fused_act_arg = op_def.arg.add()
fused_act_arg.name = 'activation'
fused_act_arg.s = activation_name_map[activation_op.type]
if activation_op.type == 'Relu6':
max_limit_arg = op_def.arg.add()
max_limit_arg.name = 'max_limit'
max_limit_arg.f = 6
final_op = activation_op
self.resolved_ops[activation_op.name] = 1
op_def.output.extend([final_op.outputs[0].name])
self.add_output_shape([final_op.outputs[0]], op_def)
self.net_def.op.extend([op_def])
def convert_pooling(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = 'Pooling'
op_def.input.extend([input.name for input in op.inputs])
op_def.output.extend([output.name for output in op.outputs])
self.add_output_shape(op.outputs, op_def)
pooling_type_arg = op_def.arg.add()
pooling_type_arg.name = 'pooling_type'
pooling_type_arg.i = pooling_type_mode[op.type]
padding_arg = op_def.arg.add()
padding_arg.name = 'padding'
padding_arg.i = padding_mode[op.get_attr('padding')]
strides_arg = op_def.arg.add()
strides_arg.name = 'strides'
strides_arg.ints.extend(op.get_attr('strides')[1:3])
kernels_arg = op_def.arg.add()
kernels_arg.name = 'kernels'
kernels_arg.ints.extend(op.get_attr('ksize')[1:3])
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
if self.device == 'cpu':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
self.resolved_ops[op.name] = 1
def convert_global_avg_pooling(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = 'Pooling'
op_def.input.extend([op.inputs[0].name])
op_def.output.extend([output.name for output in op.outputs])
self.add_output_shape(op.outputs, op_def)
pooling_type_arg = op_def.arg.add()
pooling_type_arg.name = 'pooling_type'
pooling_type_arg.i = pooling_type_mode['AvgPool']
padding_arg = op_def.arg.add()
padding_arg.name = 'padding'
padding_arg.i = padding_mode['VALID']
strides_arg = op_def.arg.add()
strides_arg.name = 'strides'
strides_arg.ints.extend([1, 1])
kernels_arg = op_def.arg.add()
kernels_arg.name = 'kernels'
kernels_arg.ints.extend(op.inputs[0].shape.as_list()[1:3])
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
if self.device == 'cpu':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
self.resolved_ops[op.name] = 1
def convert_activation(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = 'Activation'
activation_arg = op_def.arg.add()
activation_arg.name = 'activation'
activation_arg.s = activation_name_map[op.type]
op_def.input.extend([input.name for input in op.inputs])
op_def.output.extend([output.name for output in op.outputs])
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
def convert_relu6(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = 'Activation'
op_def.input.extend([input.name for input in op.inputs])
op_def.output.extend([output.name for output in op.outputs])
self.add_output_shape(op.outputs, op_def)
activation_arg = op_def.arg.add()
activation_arg.name = 'activation'
activation_arg.s = "RELUX"
max_limit_arg = op_def.arg.add()
max_limit_arg.name = 'max_limit'
max_limit_arg.f = 6
self.resolved_ops[op.name] = 1
def convert_add(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = "AddN"
op_def.input.extend([input.name for input in op.inputs])
op_def.output.extend([output.name for output in op.outputs])
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
def convert_concat(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = "Concat"
op_def.input.extend([input.name for input in op.inputs[:-1]])
op_def.output.extend([output.name for output in op.outputs])
axis_arg = op_def.arg.add()
axis_arg.name = 'axis'
axis = get_input_tensor(op, len(op.inputs) - 1).eval().astype(np.int32)
if self.device == 'cpu' and axis == 3:
axis = 1
axis_arg.i = axis
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
self.unused_tensor.add(get_input_tensor(op, len(op.inputs) - 1).name)
def convert_resize_bilinear(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = "ResizeBilinear"
op_def.input.extend([op.inputs[0].name])
op_def.output.extend([output.name for output in op.outputs])
size_arg = op_def.arg.add()
size_arg.name = 'size'
size_arg.ints.extend(
get_input_tensor(op, 1).eval().astype(np.int32).flat)
size_arg = op_def.arg.add()
size_arg.name = 'align_corners'
size_arg.i = op.get_attr('align_corners')
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
self.unused_tensor.add(get_input_tensor(op, 1).name)
def convert_eltwise(self, op, math_type):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = "Eltwise"
if len(op.inputs) == 2:
input_tensor0 = get_input_tensor(op, 0)
input_tensor1 = get_input_tensor(op, 1)
x_value = None
if np.asarray(input_tensor1.shape).size == 0:
x_value = input_tensor1.eval()
op_def.input.extend([op.inputs[0].name])
self.unused_tensor.add(input_tensor1.name)
elif np.asarray(input_tensor0.shape).size == 0:
x_value = input_tensor0.eval()
op_def.input.extend([op.inputs[1].name])
self.unused_tensor.add(input_tensor0.name)
else:
if np.asarray(input_tensor0.shape).size == 1 \
and input_tensor0.op.type == 'Const':
if self.device == 'gpu':
output_name = self.add_buffer_to_image(
input_tensor0.name, "ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([input_tensor0.name])
if np.asarray(input_tensor1.shape).size == 1 \
and input_tensor1.op.type == 'Const':
if self.device == 'gpu':
output_name = self.add_buffer_to_image(
input_tensor1.name, "ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([input_tensor1.name])
if x_value is not None:
x_arg = op_def.arg.add()
x_arg.name = 'x'
x_arg.f = x_value
else:
op_def.input.extend([input.name for input in op.inputs])
type_arg = op_def.arg.add()
type_arg.name = 'type'
type_arg.i = math_type_mode[math_type]
op_def.output.extend([output.name for output in op.outputs])
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
def convert_depth_to_space(self, op, d2s):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = op.type
op_def.input.extend([op.inputs[0].name])
op_def.output.extend([output.name for output in op.outputs])
size_arg = op_def.arg.add()
size_arg.name = 'block_size'
size_arg.i = op.get_attr('block_size')
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
def convert_bias_add(self, op):
op_def = mace_pb2.OperatorDef()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = "BiasAdd"
op_def.input.extend([op.inputs[0].name])
if self.device == 'gpu':
output_name = self.add_buffer_to_image(
get_input_tensor(op, 1).name, "ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([get_input_tensor(op, 1).name])
op_def.output.extend([output.name for output in op.outputs])
self.add_output_shape(op.outputs, op_def)
self.net_def.op.extend([op_def])
self.resolved_ops[op.name] = 1
def convert_space_to_batch(self, op, b2s):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = op.type
op_def.input.extend([op.inputs[0].name])
op_def.output.extend([output.name for output in op.outputs])
size_arg = op_def.arg.add()
size_arg.name = 'block_shape'
size_arg.ints.extend(
get_input_tensor(op, 1).eval().astype(np.int32).flat)
size_arg = op_def.arg.add()
if b2s:
size_arg.name = 'crops'
else:
size_arg.name = 'paddings'
size_arg.ints.extend(
get_input_tensor(op, 2).eval().astype(np.int32).flat)
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
self.unused_tensor.add(get_input_tensor(op, 1).name)
self.unused_tensor.add(get_input_tensor(op, 2).name)
def is_atrous_conv2d(self, op):
return op.type == 'SpaceToBatchND' and \
len(self.tf_graph[op.name]) == 1 and \
(self.tf_graph[op.name][0].type == 'Conv2D'
or self.tf_graph[op.name][0].type == 'DepthwiseConv2dNative')
def convert_atrous_conv2d(self, op):
op_def = mace_pb2.OperatorDef()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
conv_op = self.tf_graph[op.name][0]
op_def.name = conv_op.name
if conv_op.type == 'DepthwiseConv2dNative':
op_def.type = 'DepthwiseConv2d'
else:
op_def.type = conv_op.type
if self.device == 'gpu':
op_def.input.extend([op.inputs[0].name])
if op_def.type == 'DepthwiseConv2d':
buffer_type = "DW_CONV2D_FILTER"
else:
self.transpose_filter_tensor[get_input_tensor(
conv_op, 1).name] = (0, 1, 3, 2)
buffer_type = "CONV2D_FILTER"
output_name = self.add_buffer_to_image(
get_input_tensor(conv_op, 1).name, buffer_type)
op_def.input.extend([output_name])
else:
self.transpose_filter_tensor[get_input_tensor(
conv_op, 1).name] = (3, 2, 0, 1)
op_def.input.extend([get_input_tensor(op, 0).name])
op_def.input.extend([get_input_tensor(conv_op, 1).name])
dilation_arg = op_def.arg.add()
dilation_arg.name = 'dilations'
dilation_arg.ints.extend(
get_input_tensor(op, 1).eval().astype(np.int32).flat)
padding_arg = op_def.arg.add()
padding_arg.name = 'padding'
padding_values = get_input_tensor(op, 2).eval().astype(np.int32).flat
if len(padding_values) > 0 and padding_values[0] > 0:
padding_arg.i = padding_mode['SAME']
else:
padding_arg.i = padding_mode['VALID']
self.unused_tensor.add(get_input_tensor(op, 1).name)
self.unused_tensor.add(get_input_tensor(op, 2).name)
strides_arg = op_def.arg.add()
strides_arg.name = 'strides'
strides_arg.ints.extend([1, 1])
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
if self.device == 'cpu':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
final_op = conv_op
self.resolved_ops[op.name] = 1
self.resolved_ops[conv_op.name] = 1
if len(self.tf_graph[final_op.name]
) == 1 and self.tf_graph[final_op.name][0].type == 'BiasAdd':
bias_add_op = self.tf_graph[final_op.name][0]
if self.device == 'gpu':
output_name = self.add_buffer_to_image(
get_input_tensor(bias_add_op, 1).name, "ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([get_input_tensor(bias_add_op, 1).name])
final_op = bias_add_op
self.resolved_ops[bias_add_op.name] = 1
if len(self.tf_graph[final_op.name]) == 1 and \
self.tf_graph[final_op.name][0].type == 'BatchToSpaceND':
final_op = self.tf_graph[final_op.name][0]
self.resolved_ops[final_op.name] = 1
self.unused_tensor.add(get_input_tensor(final_op, 1).name)
self.unused_tensor.add(get_input_tensor(final_op, 2).name)
else:
raise Exception('Convert atrous conv error: no BatchToSpaceND op')
if len(self.tf_graph[final_op.name]) == 1 and \
self.tf_graph[final_op.name][0].type == 'Relu':
relu_op = self.tf_graph[final_op.name][0]
fused_relu_arg = op_def.arg.add()
fused_relu_arg.name = 'activation'
fused_relu_arg.s = "RELU"
final_op = relu_op
self.resolved_ops[relu_op.name] = 1
op_def.output.extend([output.name for output in final_op.outputs])
self.add_output_shape(final_op.outputs, op_def)
self.net_def.op.extend([op_def])
def is_softmax(self, op):
return op.type == 'Softmax' and \
len(self.tf_parents[op.name]) == 1 and \
self.tf_parents[op.name][0].type == 'Reshape' and \
len(self.tf_graph[op.name]) == 1 and \
self.tf_graph[op.name][0].type == 'Reshape'
def convert_softmax(self, softmax_op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
# deal with first Reshape op
parent_reshape_op = self.tf_parents[softmax_op.name][0]
self.unused_tensor.add(get_input_tensor(parent_reshape_op, 1).name)
self.resolved_ops[parent_reshape_op.name] = 1
# FIXME: hardcode for inception_v3
# remove squeeze if exist
squeeze_op = self.tf_parents[parent_reshape_op.name][0]
if squeeze_op.type == 'Squeeze':
op_def.input.extend([squeeze_op.inputs[0].name])
self.resolved_ops[squeeze_op.name] = 1
# remove shape if exist
children_ops = self.tf_graph[squeeze_op.name]
print children_ops
if len(children_ops) > 1 and children_ops[0].type == 'Shape':
self.unused_tensor.add(
get_input_tensor(children_ops[1], 0).name)
self.resolved_ops[children_ops[1].name] = 1
else:
op_def.input.extend([parent_reshape_op.inputs[0].name])
# deal with Softmax op
op_def.name = softmax_op.name
op_def.type = softmax_op.type
self.resolved_ops[softmax_op.name] = 1
# deal with last Reshape op
reshape_op = self.tf_graph[softmax_op.name][0]
self.unused_tensor.add(get_input_tensor(reshape_op, 1).name)
shape = [dim.value for dim in reshape_op.outputs[0].shape]
if len(shape) == 2:
shape = [1, 1, shape[0], shape[1]]
op_def.output.extend([output.name for output in reshape_op.outputs])
self.add_output_shape([shape], op_def)
self.resolved_ops[reshape_op.name] = 1
def convert_pad(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = "Pad"
op_def.input.extend([op.inputs[0].name])
op_def.output.extend([output.name for output in op.outputs])
paddings_arg = op_def.arg.add()
paddings_arg.name = 'paddings'
if self.device == 'gpu':
paddings_value = get_input_tensor(op, 1).eval().astype(np.int32)
else:
paddings_value = get_input_tensor(op, 1).eval().astype(np.int32)
paddings_value = paddings_value[[0, 3, 1, 2]]
paddings_arg.ints.extend(paddings_value.flat)
self.unused_tensor.add(get_input_tensor(op, 1).name)
if len(op.inputs) == 3:
constant_value_arg = op_def.arg.add()
constant_value_arg.name = 'constant_value'
constant_value_arg.i = \
get_input_tensor(op, 2).eval().astype(np.int32).flat[0]
self.unused_tensor.add(get_input_tensor(op, 2).name)
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
def convert_normal_op(self, op):
op_def = self.net_def.op.add()
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = op.type
op_def.input.extend([input.name for input in op.inputs])
op_def.output.extend([output.name for output in op.outputs])
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
def convert(self, input_nodes, output_nodes):
if self.device == 'gpu':
self.add_gpu_input_transform(input_nodes)
if self.device == 'cpu':
self.add_cpu_input_transform(input_nodes)
for op in self.tf_ops:
if self.resolved_ops[op.name] == 1:
continue
if op.type in ['Placeholder', 'Identity']:
self.resolved_ops[op.name] = 1
pass
elif op.type == 'Const':
pass
elif op.type == 'Reshape':
self.convert_reshape(op)
elif self.is_atrous_conv2d(op):
self.convert_atrous_conv2d(op)
elif self.check_conv_to_fc(op):
self.convert_global_conv_to_fc(op)
elif op.type == 'Conv2D' or op.type == 'DepthwiseConv2dNative':
if self.device == 'gpu' and self.check_winograd_conv(op):
self.convert_winograd_conv_gpu(op)
else:
self.convert_conv2d(op)
elif op.type == 'Conv2DBackpropInput':
self.convert_deconv2d(op)
elif op.type == 'FusedBatchNorm':
self.convert_fused_batchnorm(op)
elif op.type == 'Mul' and op.name.find('batchnorm/mul') != -1:
self.convert_batchnorm(op)
elif op.type == 'AvgPool' or op.type == 'MaxPool':
self.convert_pooling(op)
elif op.type == 'Relu6':
self.convert_relu6(op)
elif op.type == 'Add':
if len(op.inputs) > 2:
self.convert_add(op)
else:
self.convert_eltwise(op, 'ADD')
elif op.type == 'ConcatV2':
self.convert_concat(op)
elif op.type == 'ResizeBilinear':
self.convert_resize_bilinear(op)
elif op.type == 'BiasAdd':
self.convert_bias_add(op)
elif op.type == 'SpaceToBatchND':
self.convert_space_to_batch(op, False)
elif op.type == 'BatchToSpaceND':
self.convert_space_to_batch(op, True)
elif op.type == 'DepthToSpace':
self.convert_depth_to_space(op, True)
elif op.type == 'SpaceToDepth':
self.convert_depth_to_space(op, False)
elif op.type in ['Neg', 'neg', 'Negative', 'negative']:
self.convert_eltwise(op, 'NEG')
elif op.type in ['RealDiv', 'Div']:
self.convert_eltwise(op, 'DIV')
elif op.type in ['SquaredDifference']:
self.convert_eltwise(op, 'SQR_DIFF')
elif op.type in ['Pow']:
self.convert_eltwise(op, 'POW')
elif op.type == 'Mul':
self.convert_eltwise(op, 'MUL')
elif op.type == 'Sub':
self.convert_eltwise(op, 'SUB')
elif self.is_softmax(op):
self.convert_softmax(op)
elif op.type in ['Relu', 'Sigmoid', 'Tanh']:
self.convert_activation(op)
# FIXME: hardcode for inception_v3
elif op.type in ['Squeeze', 'Shape']:
self.resolved_ops[op.name] = 1
elif op.type == 'Mean':
# Global avg pooling
reduce_dims = op.inputs[1].eval()
if reduce_dims[0] == 1 and reduce_dims[1] == 2:
self.convert_global_avg_pooling(op)
self.unused_tensor.add(op.inputs[1].name)
else:
raise Exception('Unknown Op: %s, type: %s' % (op.name,
op.type))
elif op.type == 'Pad':
self.convert_pad(op)
# elif op.type in ['']:
# self.convert_normal_op(op)
else:
raise Exception('Unknown Op: %s, type: %s' % (op.name,
op.type))
for op in self.tf_ops:
if self.resolved_ops[op.name] == 1:
continue
elif op.type == 'Const':
self.convert_tensor(op)
else:
raise Exception('Unknown Op: %s, type: %s' % (op.name,
op.type))
if self.device == 'gpu':
self.add_gpu_output_transform(output_nodes)
if self.device == 'cpu':
self.add_cpu_output_transform(output_nodes)
for key in self.resolved_ops:
if self.resolved_ops[key] != 1:
print 'Unresolved Op: %s' % key
class Optimizer:
def __init__(self, net_def, device):
self.net_def = net_def
self.device = device
self.mace_graph = {}
self.tensor_map = {}
for op in net_def.op:
for input_name in op.input:
if input_name not in self.mace_graph:
self.mace_graph[input_name] = []
self.mace_graph[input_name].append(op)
for tensor in net_def.tensors:
self.tensor_map[tensor.name] = tensor
def get_buffer_tensor_name(self, name):
if self.device == 'gpu':
return name[:-6] + name[-2:]
else:
return name
def fold_batch_norm(self):
unused_tensors = set()
new_tensors = []
new_net = mace_pb2.NetDef()
resolved_ops = set()
for op in self.net_def.op:
if op.name in resolved_ops:
pass
elif op.type == 'DepthwiseConv2d' and len(op.output) == 1 and \
self.mace_graph[op.output[0]][0].type == 'FoldedBatchNorm':
depthwise_conv2d_op = op
folded_bn_op = self.mace_graph[op.output[0]][0]
weight_buffer_name = self.get_buffer_tensor_name(
depthwise_conv2d_op.input[1])
weight_tensor = self.tensor_map[weight_buffer_name]
scale_buffer_name = self.get_buffer_tensor_name(
folded_bn_op.input[1])
offset_buffer_name = self.get_buffer_tensor_name(
folded_bn_op.input[2])
scale_tensor = self.tensor_map[scale_buffer_name]
weight_shape = weight_tensor.dims
idx = 0
if self.device == 'cpu': # OIHW
for oc in range(weight_shape[0]):
for ic in range(weight_shape[1]):
for i in range(weight_shape[2]):
for j in range(weight_shape[3]):
weight_tensor.float_data[
idx] *= scale_tensor.float_data[
ic * weight_shape[0] + oc]
idx += 1
else: # HWIO
for i in range(weight_shape[0]):
for j in range(weight_shape[1]):
for ic in range(weight_shape[2]):
for oc in range(weight_shape[3]):
weight_tensor.float_data[
idx] *= scale_tensor.float_data[
ic * weight_shape[3] + oc]
idx += 1
new_tensors.append(weight_tensor)
unused_tensors.add(weight_tensor.name)
unused_tensors.add(scale_tensor.name)
if self.device == 'gpu':
scale_b2i_op = self.mace_graph[scale_buffer_name][0]
offset_b2i_op = self.mace_graph[offset_buffer_name][0]
resolved_ops.add(scale_b2i_op.name)
resolved_ops.add(offset_b2i_op.name)
new_net.op.extend([offset_b2i_op])
resolved_ops.add(depthwise_conv2d_op.name)
resolved_ops.add(folded_bn_op.name)
offset_tensor_name = folded_bn_op.input[2]
depthwise_conv2d_op.input.extend([offset_tensor_name])
for arg in folded_bn_op.arg:
if arg.name == 'activation':
act_arg = depthwise_conv2d_op.arg.add()
act_arg.name = arg.name
act_arg.s = arg.s
elif arg.name == 'max_limit':
act_arg = depthwise_conv2d_op.arg.add()
act_arg.name = arg.name
act_arg.f = arg.f
depthwise_conv2d_op.output[0] = folded_bn_op.output[0]
new_net.op.extend([depthwise_conv2d_op])
else:
new_net.op.extend([op])
for tensor in self.net_def.tensors:
if tensor.name in unused_tensors:
pass
else:
new_net.tensors.extend([tensor])
for tensor in new_tensors:
new_net.tensors.extend([tensor])
return new_net
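# Hedged illustration (not from the original source): the per-element HWIO
# loop above is equivalent to this vectorized numpy sketch, assuming the
# depthwise weight is laid out as (H, W, I, O) and scale has length I * O
# ordered as ic * O + oc:
#
# import numpy as np
# w = np.asarray(weight_tensor.float_data, dtype=np.float32)
# w = w.reshape(list(weight_shape))                # (H, W, I, O)
# s = np.asarray(scale_tensor.float_data, dtype=np.float32)
# s = s.reshape(weight_shape[2], weight_shape[3])  # (I, O)
# w *= s[np.newaxis, np.newaxis, :, :]             # broadcast over H and W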
def optimize(self):
new_net = self.fold_batch_norm()
return new_net
def add_shape_info(input_graph_def, input_nodes, input_shapes):
inputs_replaced_graph = graph_pb2.GraphDef()
for node in input_graph_def.node:
if node.name in input_nodes:
idx = input_nodes.index(node.name)
input_shape = input_shapes[idx]
placeholder_node = copy.deepcopy(node)
placeholder_node.attr.clear()
placeholder_node.attr['shape'].shape.dim.extend([
tensor_shape_pb2.TensorShapeProto.Dim(size=i)
for i in input_shape
])
placeholder_node.attr['dtype'].CopyFrom(node.attr['dtype'])
inputs_replaced_graph.node.extend([placeholder_node])
else:
inputs_replaced_graph.node.extend([copy.deepcopy(node)])
return inputs_replaced_graph
def convert_to_mace_pb(model_file, input_node, input_shape, output_node,
data_type, device, winograd):
net_def = mace_pb2.NetDef()
dt = data_type_map[data_type]
input_graph_def = tf.GraphDef()
with gfile.Open(model_file, "rb") as f:
data = f.read()
input_graph_def.ParseFromString(data)
input_nodes = [x for x in input_node.split(',')]
input_shapes = []
if input_shape != "":
input_shape_strs = [x for x in input_shape.split(':')]
for shape_str in input_shape_strs:
input_shapes.extend([[int(x) for x in shape_str.split(',')]])
output_nodes = [x for x in output_node.split(',')]
assert len(input_nodes) == len(input_shapes)
input_graph_def = add_shape_info(input_graph_def, input_nodes,
input_shapes)
with tf.Session() as session:
with session.graph.as_default() as graph:
tf.import_graph_def(input_graph_def, name="")
ops = graph.get_operations()
converter = TFConverter(graph, ops, net_def, dt, device, winograd)
converter.convert(input_nodes, output_nodes)
optimizer = Optimizer(net_def, device)
net_def = optimizer.optimize()
print "Model Converted."
if device == 'gpu':
print "start optimize memory."
memory_optimizer.optimize_gpu_memory(net_def)
print "Memory optimization done."
elif device == 'cpu':
print "start optimize memory."
memory_optimizer.optimize_cpu_memory(net_def)
print "Memory optimization done."
return net_def
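# Hedged usage sketch (hypothetical file name, node names and shape, not from
# the original source): converting a frozen TensorFlow graph for the GPU
# runtime might look like this.
#
# net_def = convert_to_mace_pb('mobilenet_v1_frozen.pb',
#                              'input', '1,224,224,3',
#                              'MobilenetV1/Predictions/Reshape_1',
#                              'DT_HALF', 'gpu', True)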
......@@ -152,7 +152,7 @@ void CheckOutputs(const NetDef &net_def,
memcpy(input_data.data(), input.second.data().get(),
data_size * sizeof(float));
std::string input_name = MakeString("mace_input_node_",
input.first, ":0");
input.first);
net.AddInputFromArray<D, float>(input_name, input.second.shape(),
input_data);
}
......@@ -181,7 +181,7 @@ void CheckOutputs(const NetDef &net_def,
float *data = tmp_tensor->mutable_data<float>();
memcpy(data, output.second.data().get(), data_size * sizeof(float));
std::string output_name = MakeString("mace_output_node_",
output.first, ":0");
output.first);
ops::test::ExpectTensorNear<float>(*tmp_tensor,
*net.GetOutput(output_name.data()),
1e-5);
......@@ -265,7 +265,7 @@ void MaceRunFunc(const int in_out_size) {
for (size_t i = 0; i < input_names.size(); ++i) {
std::string input_name = MakeString("mace_input_node_",
input_names[i], ":0");
input_names[i]);
BufferToImage<half>(input_name, input_names[i],
mace::kernels::IN_OUT_CHANNEL,
{mem_map[input_names[i]]},
......@@ -281,7 +281,7 @@ void MaceRunFunc(const int in_out_size) {
}
for (size_t i = 0; i < output_names.size(); ++i) {
std::string output_name = MakeString("mace_output_node_",
output_names[i], ":0");
output_names[i]);
ImageToBuffer<float>(output_names[i], output_name,
mace::kernels::IN_OUT_CHANNEL, &net_def);
}
......
......@@ -162,7 +162,7 @@ void CheckOutputs(const NetDef &net_def,
memcpy(input_data.data(), input.second.data().get(),
data_size * sizeof(float));
std::string input_name = MakeString("mace_input_node_",
input.first, ":0");
input.first);
net.AddInputFromArray<D, float>(input_name, input.second.shape(),
input_data);
}
......@@ -191,7 +191,7 @@ void CheckOutputs(const NetDef &net_def,
float *data = tmp_tensor->mutable_data<float>();
memcpy(data, output.second.data().get(), data_size * sizeof(float));
std::string output_name = MakeString("mace_output_node_",
output.first, ":0");
output.first);
ops::test::ExpectTensorNear<float>(*tmp_tensor,
*net.GetOutput(output_name.data()),
1e-5);
......@@ -275,7 +275,7 @@ void MaceRun(const int in_out_size,
for (size_t i = 0; i < input_names.size(); ++i) {
std::string input_name = MakeString("mace_input_node_",
input_names[i], ":0");
input_names[i]);
BufferToImage<half>(input_name, input_names[i],
mace::kernels::IN_OUT_CHANNEL,
{mem_map[input_names[i]]},
......@@ -291,7 +291,7 @@ void MaceRun(const int in_out_size,
}
for (size_t i = 0; i < output_names.size(); ++i) {
std::string output_name = MakeString("mace_output_node_",
output_names[i], ":0");
output_names[i]);
ImageToBuffer<float>(output_names[i], output_name,
mace::kernels::IN_OUT_CHANNEL, &net_def);
}
......