From 3e82ad67279c625194e46288fe7f3833ed818c53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E5=AF=85?= Date: Tue, 8 May 2018 15:53:10 +0800 Subject: [PATCH] Refactor model converter and transformer --- mace/core/mace.cc | 10 +- mace/ops/fully_connected.cc | 6 +- mace/ops/fully_connected_benchmark.cc | 4 +- mace/ops/fully_connected_test.cc | 12 +- mace/proto/mace.proto | 1 + mace/python/tools/BUILD | 36 +- mace/python/tools/caffe_converter_lib.py | 1213 ------------- mace/python/tools/convert_util.py | 6 + mace/python/tools/converter.py | 88 +- mace/python/tools/converter_tool/__init__.py | 0 .../tools/converter_tool/base_converter.py | 259 +++ .../tools/converter_tool/caffe_converter.py | 508 ++++++ .../tools/converter_tool/shape_inference.py | 149 ++ .../converter_tool/tensorflow_converter.py | 442 +++++ .../tools/converter_tool/transformer.py | 914 ++++++++++ mace/python/tools/memory_optimizer.py | 14 +- mace/python/tools/source_converter_lib.py | 13 +- mace/python/tools/tf_converter_lib.py | 1522 ----------------- mace/test/mace_api_mt_test.cc | 8 +- mace/test/mace_api_test.cc | 8 +- 20 files changed, 2411 insertions(+), 2802 deletions(-) delete mode 100644 mace/python/tools/caffe_converter_lib.py create mode 100644 mace/python/tools/converter_tool/__init__.py create mode 100644 mace/python/tools/converter_tool/base_converter.py create mode 100644 mace/python/tools/converter_tool/caffe_converter.py create mode 100644 mace/python/tools/converter_tool/shape_inference.py create mode 100644 mace/python/tools/converter_tool/tensorflow_converter.py create mode 100644 mace/python/tools/converter_tool/transformer.py delete mode 100644 mace/python/tools/tf_converter_lib.py diff --git a/mace/core/mace.cc b/mace/core/mace.cc index ae603107..dc9cbaa9 100644 --- a/mace/core/mace.cc +++ b/mace/core/mace.cc @@ -119,11 +119,11 @@ MaceEngine::Impl::Impl(const NetDef *net_def, LOG(INFO) << "MACE version: " << MaceVersion(); // Set storage path for internal usage for (auto 
input_name : input_nodes) { - ws_->CreateTensor(MakeString("mace_input_node_", input_name, ":0"), + ws_->CreateTensor(MakeString("mace_input_node_", input_name), GetDeviceAllocator(device_type_), DT_FLOAT); } for (auto output_name : output_nodes) { - ws_->CreateTensor(MakeString("mace_output_node_", output_name, ":0"), + ws_->CreateTensor(MakeString("mace_output_node_", output_name), GetDeviceAllocator(device_type_), DT_FLOAT); } #ifdef MACE_ENABLE_HEXAGON @@ -182,7 +182,7 @@ MaceStatus MaceEngine::Impl::Run( "The Inputs' shape must be 4-dimension with NHWC format," " please use 1 to fill missing dimensions"); Tensor *input_tensor = - ws_->GetTensor(MakeString("mace_input_node_", input.first, ":0")); + ws_->GetTensor(MakeString("mace_input_node_", input.first)); input_tensor->Resize(input.second.shape()); { Tensor::MappingGuard input_guard(input_tensor); @@ -199,7 +199,7 @@ MaceStatus MaceEngine::Impl::Run( " please use 1 to fill missing dimensions"); } Tensor *output_tensor = - ws_->GetTensor(MakeString("mace_output_node_", output.first + ":0")); + ws_->GetTensor(MakeString("mace_output_node_", output.first)); output_tensors.push_back(output_tensor); } #ifdef MACE_ENABLE_HEXAGON @@ -223,7 +223,7 @@ MaceStatus MaceEngine::Impl::Run( #endif for (auto &output : *outputs) { Tensor *output_tensor = - ws_->GetTensor(MakeString("mace_output_node_", output.first + ":0")); + ws_->GetTensor(MakeString("mace_output_node_", output.first)); // save output if (output_tensor != nullptr && output.second.data() != nullptr) { Tensor::MappingGuard output_guard(output_tensor); diff --git a/mace/ops/fully_connected.cc b/mace/ops/fully_connected.cc index acd2f6b9..d747916c 100644 --- a/mace/ops/fully_connected.cc +++ b/mace/ops/fully_connected.cc @@ -18,20 +18,20 @@ namespace mace { namespace ops { void Register_FullyConnected(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("FC") + REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected") 
.Device(DeviceType::CPU) .TypeConstraint("T") .Build(), FullyConnectedOp); #ifdef MACE_ENABLE_OPENCL - REGISTER_OPERATOR(op_registry, OpKeyBuilder("FC") + REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected") .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), FullyConnectedOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("FC") + REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected") .Device(DeviceType::GPU) .TypeConstraint("T") .Build(), diff --git a/mace/ops/fully_connected_benchmark.cc b/mace/ops/fully_connected_benchmark.cc index 06127cea..96d6c3b0 100644 --- a/mace/ops/fully_connected_benchmark.cc +++ b/mace/ops/fully_connected_benchmark.cc @@ -37,7 +37,7 @@ void FCBenchmark( net.AddRandomInput("Bias", {out_channel}); if (D == DeviceType::CPU) { - OpDefBuilder("FC", "FullyConnectedTest") + OpDefBuilder("FullyConnected", "FullyConnectedTest") .Input("Input") .Input("Weight") .Input("Bias") @@ -52,7 +52,7 @@ void FCBenchmark( BufferToImage(&net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); - OpDefBuilder("FC", "FullyConnectedTest") + OpDefBuilder("FullyConnected", "FullyConnectedTest") .Input("InputImage") .Input("WeightImage") .Input("BiasImage") diff --git a/mace/ops/fully_connected_test.cc b/mace/ops/fully_connected_test.cc index 97afa2d4..3f107bc7 100644 --- a/mace/ops/fully_connected_test.cc +++ b/mace/ops/fully_connected_test.cc @@ -42,7 +42,7 @@ void Simple(const std::vector &input_shape, if (D == DeviceType::CPU) { net.Transpose2D("Weight", "WeightTranspose"); - OpDefBuilder("FC", "FullyConnectedTest") + OpDefBuilder("FullyConnected", "FullyConnectedTest") .Input("Input") .Input("Weight") .Input("Bias") @@ -59,7 +59,7 @@ void Simple(const std::vector &input_shape, BufferToImage(&net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); - OpDefBuilder("FC", "FullyConnectedTest") + OpDefBuilder("FullyConnected", "FullyConnectedTest") .Input("InputImage") .Input("WeightImage") .Input("BiasImage") @@ -142,7 +142,7 @@ void 
Complex(const index_t batch, "Weight", {out_channel, height * width * channels}); net.AddRandomInput("Bias", {out_channel}); - OpDefBuilder("FC", "FullyConnectedTest") + OpDefBuilder("FullyConnected", "FullyConnectedTest") .Input("Input") .Input("Weight") .Input("Bias") @@ -166,7 +166,7 @@ void Complex(const index_t batch, BufferToImage(&net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); - OpDefBuilder("FC", "FullyConnectedTest") + OpDefBuilder("FullyConnected", "FullyConnectedTest") .Input("InputImage") .Input("WeightImage") .Input("BiasImage") @@ -231,7 +231,7 @@ void TestWXFormat(const index_t batch, "Weight", {out_channel, height * width * channels}); net.AddRandomInput("Bias", {out_channel}); - OpDefBuilder("FC", "FullyConnectedTest") + OpDefBuilder("FullyConnected", "FullyConnectedTest") .Input("Input") .Input("Weight") .Input("Bias") @@ -255,7 +255,7 @@ void TestWXFormat(const index_t batch, BufferToImage(&net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); - OpDefBuilder("FC", "FullyConnectedTest") + OpDefBuilder("FullyConnected", "FullyConnectedTest") .Input("InputImage") .Input("WeightImage") .Input("BiasImage") diff --git a/mace/proto/mace.proto b/mace/proto/mace.proto index f2d0d1e4..a54fa45c 100644 --- a/mace/proto/mace.proto +++ b/mace/proto/mace.proto @@ -10,6 +10,7 @@ enum NetMode { enum DeviceType { CPU = 0; // In default, we will use CPU. 
GPU = 2; + HEXAGON = 3; } enum DataType { diff --git a/mace/python/tools/BUILD b/mace/python/tools/BUILD index e39922e3..e903d817 100644 --- a/mace/python/tools/BUILD +++ b/mace/python/tools/BUILD @@ -1,26 +1,19 @@ py_library( - name = "tf_converter_lib", + name = "converter_lib", srcs = [ "convert_util.py", "graph_util.py", - "tf_converter_lib.py", "tf_dsp_converter_lib.py", + "converter_tool/base_converter.py", + "converter_tool/shape_inference.py", + "converter_tool/tensorflow_converter.py", + "converter_tool/caffe_converter.py", + "converter_tool/transformer.py", ], srcs_version = "PY2AND3", deps = [ ":memory_optimizer", "//mace/proto:mace_py", - ], -) - -py_library( - name = "caffe_converter_lib", - srcs = [ - "caffe_converter_lib.py", - ], - srcs_version = "PY2AND3", - deps = [ - ":memory_optimizer", "//mace/third_party/caffe:caffe_py", ], ) @@ -37,22 +30,21 @@ py_library( ) py_binary( - name = "converter", - srcs = ["converter.py"], + name = "memory_optimizer", + srcs = ["memory_optimizer.py"], srcs_version = "PY2AND3", deps = [ - ":caffe_converter_lib", - ":source_converter_lib", - ":tf_converter_lib", - "@six_archive//:six", + "//mace/proto:mace_py", ], ) py_binary( - name = "memory_optimizer", - srcs = ["memory_optimizer.py"], + name = "converter", + srcs = ["converter.py"], srcs_version = "PY2AND3", deps = [ - "//mace/proto:mace_py", + ":converter_lib", + ":source_converter_lib", + "@six_archive//:six", ], ) diff --git a/mace/python/tools/caffe_converter_lib.py b/mace/python/tools/caffe_converter_lib.py deleted file mode 100644 index c2bdcf30..00000000 --- a/mace/python/tools/caffe_converter_lib.py +++ /dev/null @@ -1,1213 +0,0 @@ -# Copyright 2018 Xiaomi, Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from mace.proto import mace_pb2 -from mace.third_party.caffe import caffe_pb2 -from mace.python.tools import memory_optimizer -import google.protobuf.text_format -import numpy as np -import math - -pooling_type_mode = {'AvgPool': 1, 'MaxPool': 2} - -buffer_type_map = { - 'CONV2D_FILTER': 0, - 'IN_OUT_CHANNEL': 1, - 'ARGUMENT': 2, - 'IN_OUT_HEIGHT': 3, - 'IN_OUT_WIDTH': 4, - 'WINOGRAD_FILTER': 5, - 'DW_CONV2D_FILTER': 6, - 'WEIGHT_HEIGHT': 7, - 'WEIGHT_WIDTH': 8, -} - -data_type_map = {'DT_HALF': mace_pb2.DT_HALF, 'DT_FLOAT': mace_pb2.DT_FLOAT} - -activation_name_map = { - 'ReLU': 'RELU', - 'Sigmoid': 'SIGMOID', - 'TanH': 'TANH', -} - -math_type_mode = { - 0: 2, # PROD - 1: 0, # SUM - 2: 5, # MAX -} - -MACE_INPUT_NODE_NAME = "mace_input_node" -MACE_OUTPUT_NODE_NAME = "mace_output_node" - -OPENCL_IMAGE_MAX_SIZE = 16384 - - -class Operator(object): - def __init__(self, name, type, layer): - self.name = name - self.type = type - self.layer = layer - self.parents = [] - self.children = [] - self.data = [] - self.output_shape_map = {} - - def add_parent(self, parent_op): - self.parents.append(parent_op) - parent_op.children.append(self) - - def get_single_parent(self): - if len(self.parents) != 1: - raise Exception('Operation %s expected single parent, but got %s' % - (self.name, len(self.parents))) - return self.parents[0] - - -def BlobToNPArray(blob): - if blob.num != 0: - return (np.asarray(blob.data, dtype=np.float32).reshape( - (blob.num, blob.channels, blob.height, blob.width))) - else: - return np.asarray(blob.data, 
dtype=np.float32).reshape(blob.shape.dim) - - -class Shapes(object): - @staticmethod - def conv_pool_shape(input_shape, - filter_shape, - paddings, - strides, - dilations, - round_func, - input_format='NHWC'): - output_shape = np.zeros_like(input_shape) - output_shape[0] = input_shape[0] - if input_format == 'NHWC': - # input format: NHWC, filter format: HWOI - output_shape[1] = int( - round_func((input_shape[1] + paddings[0] - filter_shape[0] - - (filter_shape[0] - 1) * - (dilations[0] - 1)) / float(strides[0]))) + 1 - output_shape[2] = int( - round_func((input_shape[2] + paddings[1] - filter_shape[1] - - (filter_shape[1] - 1) * - (dilations[1] - 1)) / float(strides[1]))) + 1 - output_shape[3] = filter_shape[2] - elif input_format == 'NCHW': - # input format: NCHW, filter format: OIHW - output_shape[1] = filter_shape[0] - output_shape[2] = int( - round_func((input_shape[2] + paddings[0] - filter_shape[2] - - (filter_shape[2] - 1) * - (dilations[0] - 1)) / float(strides[0]))) + 1 - output_shape[3] = int( - round_func((input_shape[3] + paddings[1] - filter_shape[3] - - (filter_shape[3] - 1) * - (dilations[1] - 1)) / float(strides[1]))) + 1 - else: - raise Exception("format %s is not supported" % input_format) - - return output_shape - - @staticmethod - def fully_connected_shape(input_shape, weight_shape, input_format='NHWC'): - if input_format == 'NHWC': - return [input_shape[0], 1, 1, weight_shape[0]] - elif input_format == 'NCHW': - return [input_shape[0], weight_shape[0], 1, 1] - else: - raise Exception("format %s is not supported" % input_format) - - @staticmethod - def concat_shape(input_shapes, axis): - output_shape = None - for input_shape in input_shapes: - if output_shape is None: - output_shape = list(input_shape) - else: - output_shape[axis] += input_shape[axis] - return output_shape - - @staticmethod - def slice_shape(input_shape, num_output, input_format='NHWC'): - if input_format == 'NHWC': - return [ - input_shape[0], input_shape[1], input_shape[2], - 
input_shape[3] / num_output - ] - elif input_format == 'NCHW': - return [ - input_shape[0], input_shape[1] / num_output, input_shape[2], - input_shape[3] - ] - else: - raise Exception("format %s is not supported" % input_format) - - -# outputs' name is [op.name + '_' + #] -class CaffeConverter(object): - def __init__(self, caffe_net, weights, net_def, dt, device, winograd): - self.net_def = net_def - self.caffe_net = caffe_net - self.weights = weights - self.dt = dt - self.device = device - self.winograd = winograd - self.resolved_ops = set() - self.ops = [] - self.inputs_map = {} # caffe op name -> mace inputs' name - - # Add Input operations - top_name_map = {} - inputs = caffe_net.input - for input in inputs: - self.ops.extend([Operator(input, 'Input', None)]) - top_name_map[input] = input - - layers = caffe_net.layer - # remove train layers and dropout - layers = self.remove_unused_layers(layers) - - # Construct graph - # Only support single-output layer - # layer with single output often use the same top name. 
- self.ops.extend( - [Operator(layer.name, layer.type, layer) for layer in layers]) - - self.ops_map = {op.name: op for op in self.ops} - output_op_map = {} - for layer in layers: - op = self.ops_map[layer.name] - for input_name in layer.bottom: - assert input_name != layer.name - parent_op = output_op_map.get(input_name) - if parent_op is None: - parent_op = self.ops_map[input_name] - op.add_parent(parent_op) - if op.name not in self.inputs_map: - self.inputs_map[op.name] = [] - self.inputs_map[op.name].extend([top_name_map[input_name]]) - for i in range(len(layer.top)): - output_name = layer.top[i] - if len(layer.top) == 1: - top_name_map[output_name] = op.name - else: - top_name_map[output_name] = op.name + '_' + str(i) - if output_name == layer.name: - continue - output_op_map[output_name] = op - - # Load weights - weights_layers = weights.layer - for layer in weights_layers: - if not layer.blobs: - continue - if layer.name in self.ops_map: - op = self.ops_map[layer.name] - op.data = [BlobToNPArray(blob) for blob in layer.blobs] - - # toposort ops - self.ops = self.toposort_ops() - - def CommonConvert(self, op, mace_type): - op_def = mace_pb2.OperatorDef() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - data_format_arg = op_def.arg.add() - data_format_arg.name = 'data_format' - if self.device == 'cpu': - data_format_arg.s = 'NCHW' - else: - data_format_arg.s = 'NHWC' - op_def.name = op.name - op_def.type = mace_type - op_def.input.extend([name + ':0' for name in self.inputs_map[op.name]]) - return op_def - - def remove_unused_layers(self, layers): - phase_map = {0: 'train', 1: 'test'} - test_layers_names = set() - test_layers = [] - for layer in layers: - phase = 'test' - if len(layer.include): - phase = phase_map[layer.include[0].phase] - if len(layer.exclude): - phase = phase_map[layer.exclude[0].phase] - if phase == 'test' and layer.type != 'Dropout': - test_layers.append(layer) - assert layer.name not in test_layers_names - 
test_layers_names.add(layer.name) - return test_layers - - def toposort_ops(self): - sorted_ops = [] - temp_visited = set() - visited = set() - - def search(op): - if op.name in temp_visited: - raise Exception("The model is not DAG") - if op.name in visited: - return - temp_visited.add(op.name) - for parent_op in op.parents: - search(parent_op) - temp_visited.remove(op.name) - sorted_ops.append(op) - visited.add(op.name) - - for op in self.ops: - search(op) - - return sorted_ops - - def add_buffer_to_image(self, input_name, input_type): - output_name = input_name[:-2] + "_b2i" + input_name[-2:] - op_def = self.net_def.op.add() - op_def.name = output_name[:-2] - op_def.type = 'BufferToImage' - op_def.input.extend([input_name]) - op_def.output.extend([output_name]) - - arg = op_def.arg.add() - arg.name = 'buffer_type' - arg.i = buffer_type_map[input_type] - arg = op_def.arg.add() - arg.name = 'mode' - arg.i = 0 - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - return output_name - - def add_image_to_buffer(self, input_name, input_type): - output_name = input_name[:-2] + "_i2b" + input_name[-2:] - op_def = self.net_def.op.add() - op_def.name = output_name[:-2] - op_def.type = 'ImageToBuffer' - op_def.input.extend([input_name]) - op_def.output.extend([output_name]) - - arg = op_def.arg.add() - arg.name = 'buffer_type' - arg.i = buffer_type_map[input_type] - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - return output_name - - def add_gpu_input_transform(self, names): - for name in names: - new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0" - op_def = self.net_def.op.add() - op_def.name = name - op_def.type = 'BufferToImage' - op_def.input.extend([new_input_name]) - op_def.output.extend([name + ':0']) - - epsilon_arg = op_def.arg.add() - epsilon_arg.name = 'buffer_type' - epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL'] - - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - - input_op = self.ops_map[name] - if input_op.layer is 
not None: - output_shape = input_op.output_shape_map[input_op.layer.top[0]] - else: - output_shape = input_op.output_shape_map[input_op.name] - self.add_output_shape(op_def, output_shape) - - def add_gpu_output_transform(self, names): - for name in names: - output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0" - op_def = self.net_def.op.add() - op_def.name = output_name[:-2] - op_def.type = 'ImageToBuffer' - op_def.input.extend([name + ':0']) - op_def.output.extend([output_name]) - - epsilon_arg = op_def.arg.add() - epsilon_arg.name = 'buffer_type' - epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL'] - - def add_tensor(self, name, value): - tensor = self.net_def.tensors.add() - tensor.name = name - - shape = list(value.shape) - tensor.dims.extend(shape) - - tensor.data_type = mace_pb2.DT_FLOAT - tensor.float_data.extend(value.flat) - - @staticmethod - def add_output_shape(op_def, output_shape): - mace_output_shape = mace_pb2.OutputShape() - mace_output_shape.dims.extend(output_shape) - op_def.output_shape.extend([mace_output_shape]) - - def add_stride_pad_kernel_arg(self, param, op_def): - try: - if len(param.stride) > 1 or len(param.kernel_size) > 1 or len( - param.pad) > 1: - raise Exception( - 'Mace does not support multiple stride/kernel_size/pad') - stride = [param.stride[0], - param.stride[0]] if len(param.stride) else [1, 1] - pad = [param.pad[0] * 2, - param.pad[0] * 2] if len(param.pad) else [0, 0] - kernel = [param.kernel_size[0], param.kernel_size[0]] if len( - param.kernel_size) else [0, 0] - except TypeError: - stride = [param.stride, param.stride] - pad = [param.pad * 2, param.pad * 2] - kernel = [param.kernel_size, param.kernel_size] - - if param.HasField("stride_h") or param.HasField("stride_w"): - stride = [param.stride_h, param.stride_w] - # Pad - if param.HasField("pad_h") or param.HasField("pad_w"): - pad = [param.pad_h * 2, param.pad_w * 2] - - if op_def is not None: - strides_arg = op_def.arg.add() - strides_arg.name = 'strides' - 
strides_arg.ints.extend(stride) - - padding_arg = op_def.arg.add() - padding_arg.name = 'padding_values' - padding_arg.ints.extend(pad) - - if op_def.type == 'Pooling': - if param.HasField("kernel_h") or param.HasField("kernel_w"): - kernel = [param.kernel_h, param.kernel_w] - - return pad, stride, kernel - - def convert_conv2d(self, op): - use_winograd = False - if self.device == 'cpu': - use_winograd = self.check_winograd_conv(op) - - param = op.layer.convolution_param - is_depthwise = False - if param.HasField('group'): - if param.group == op.data[0].shape[0] and op.data[0].shape[1] == 1: - is_depthwise = True - else: - raise Exception("Mace do not support group convolution yet") - - if is_depthwise: - op_def = self.CommonConvert(op, 'DepthwiseConv2d') - else: - op_def = self.CommonConvert(op, 'Conv2D') - - # Add filter - weight_tensor_name = op.name + '_weight:0' - if self.device == 'cpu': - weight_data = op.data[0] - else: - # OIHW -> HWOI - weight_data = op.data[0].transpose((2, 3, 0, 1)) - - if use_winograd: - self.convert_winograd_conv_filter_cpu(op, op_def) - elif self.device == 'gpu': - self.add_tensor(weight_tensor_name, weight_data) - buffer_type = "DW_CONV2D_FILTER" \ - if is_depthwise else "CONV2D_FILTER" - output_name = self.add_buffer_to_image(weight_tensor_name, - buffer_type) - op_def.input.extend([output_name]) - else: - self.add_tensor(weight_tensor_name, weight_data) - op_def.input.extend([weight_tensor_name]) - - # Add Bias - if len(op.data) == 2: - bias_tensor_name = op.name + '_bias:0' - bias_data = op.data[1].reshape(-1) - self.add_tensor(bias_tensor_name, bias_data) - if self.device == 'gpu': - output_name = self.add_buffer_to_image(bias_tensor_name, - "ARGUMENT") - op_def.input.extend([output_name]) - else: - op_def.input.extend([bias_tensor_name]) - - paddings, strides, _ = self.add_stride_pad_kernel_arg(param, op_def) - dilations = [1, 1] - if len(param.dilation) > 0: - dilation_arg = op_def.arg.add() - dilation_arg.name = 'dilations' - 
if len(param.dilation) == 1: - dilations = [param.dilation[0], param.dilation[0]] - elif len(param.dilation) == 2: - dilations = [param.dilation[0], param.dilation[1]] - dilation_arg.ints.extend(dilations) - final_op = op - self.resolved_ops.add(op.name) - - input_format = 'NCHW' if self.device == 'cpu' else 'NHWC' - output_shape = Shapes.conv_pool_shape( - op.get_single_parent().output_shape_map[op.layer.bottom[0]], - weight_data.shape, paddings, strides, dilations, math.floor, - input_format) - op.output_shape_map[op.layer.top[0]] = output_shape - - if len(self.ops_map[final_op.name].children) == 1 and \ - self.ops_map[final_op.name].children[0].type \ - in activation_name_map: - activation_op = self.ops_map[final_op.name].children[0] - fused_act_arg = op_def.arg.add() - fused_act_arg.name = 'activation' - fused_act_arg.s = activation_name_map[activation_op.type] - final_op = activation_op - final_op.output_shape_map[final_op.layer.top[0]] = output_shape - self.resolved_ops.add(activation_op.name) - - op_def.output.extend([final_op.name + ':0']) - self.add_output_shape(op_def, output_shape) - self.net_def.op.extend([op_def]) - - def check_winograd_conv(self, op): - param = op.layer.convolution_param - filter_shape = np.asarray(op.data[0].shape) - if self.device != 'cpu': - filter_shape = filter_shape[[2, 3, 0, 1]] # OIHW -> HWOI - paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None) - - if param.HasField('group'): - if param.group == op.data[0].shape[0] and op.data[0].shape[1] == 1: - return False # Depthwise conv not support winograd - else: - raise Exception("Mace do not support group convolution yet") - - dilations = [1, 1] - if len(param.dilation) > 0: - if len(param.dilation) == 1: - dilations = [param.dilation[0], param.dilation[0]] - elif len(param.dilation) == 2: - dilations = [param.dilation[0], param.dilation[1]] - - input_format = 'NCHW' if self.device == 'cpu' else 'NHWC' - output_shape = Shapes.conv_pool_shape( - 
op.get_single_parent().output_shape_map[op.layer.bottom[0]], - filter_shape, paddings, strides, dilations, math.floor, - input_format) - if self.winograd and dilations[0] == 1 and \ - (dilations[0] == dilations[1]) and \ - (strides[0] == 1) and (strides[0] == strides[1]): - if self.device == 'gpu': - width = output_shape[0] * ((output_shape[1] + 1) / 2) * \ - ((output_shape[2] + 1) / 2) - return filter_shape[0] == 3 and \ - filter_shape[0] == filter_shape[1] and \ - (16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \ - (16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \ - (width < OPENCL_IMAGE_MAX_SIZE) - elif self.device == 'cpu': - return filter_shape[2] == 3 and \ - filter_shape[2] == filter_shape[3] and \ - filter_shape[0] >= 8 and filter_shape[1] >= 8 - return False - - def convert_winograd_conv_filter_cpu(self, op, op_def): - # Add filter - weight_tensor_name = op.name + '_weight:0' - weight_data = op.data[0] # OIHW - input_shape = op.get_single_parent().output_shape_map[ - op.layer.bottom[0]] - if input_shape[2] > 16 and input_shape[3] > 16: - G = np.array([ - [1.0, 0.0, 0.0], - [-2.0 / 9, -2.0 / 9, -2.0 / 9], - [-2.0 / 9, 2.0 / 9, -2.0 / 9], - [1.0 / 90, 1.0 / 45, 2.0 / 45], - [1.0 / 90, -1.0 / 45, 2.0 / 45], - [1.0 / 45, 1.0 / 90, 1.0 / 180], - [1.0 / 45, -1.0 / 90, 1.0 / 180], - [0.0, 0.0, 1.0] - ], dtype=np.float32) - new_shape = [64, weight_data.shape[0], weight_data.shape[1]] # TOC - else: - G = np.array([ - [1.0, 0.0, 0.0], - [0.5, 0.5, 0.5], - [0.5, -0.5, 0.5], - [0.0, 0.0, 1.0], - ], dtype=np.float32) - new_shape = [16, weight_data.shape[0], weight_data.shape[1]] # TOC - new_weight_value = G.dot(weight_data).dot(G.T) # [8, O, I, 8] - new_weight_value = new_weight_value.transpose(0, 3, 1, 2) - new_weight_value = new_weight_value.reshape(new_shape) - - self.add_tensor(weight_tensor_name, new_weight_value) - - op_def.input.extend([weight_tensor_name]) - winograd_transformed_arg = op_def.arg.add() - winograd_transformed_arg.name = 
'is_filter_transformed' - winograd_transformed_arg.i = 1 - - def convert_winograd_conv_gpu(self, op): - # Add filter - weight_tensor_name = op.name + '_weight:0' - self.add_tensor(weight_tensor_name, op.data[0]) - - buffer_type = "WINOGRAD_FILTER" - filter_name = self.add_buffer_to_image(weight_tensor_name, buffer_type) - - param = op.layer.convolution_param - paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None) - - filter_shape = np.asarray(op.data[0].shape) - filter_shape = filter_shape[[2, 3, 0, 1]] # OIHW -> HWOI - - input_format = 'NHWC' - output_shape = Shapes.conv_pool_shape( - op.get_single_parent().output_shape_map[op.layer.bottom[0]], - filter_shape, paddings, strides, [1, 1], math.floor, input_format) - - # Input transform - wt_op = mace_pb2.OperatorDef() - arg = wt_op.arg.add() - arg.name = 'T' - arg.i = self.dt - padding_arg = wt_op.arg.add() - padding_arg.name = 'padding_values' - padding_arg.ints.extend(paddings) - wt_op.name = op.name + '_input_transform' - wt_op.type = 'WinogradTransform' - wt_op.input.extend([name + ':0' for name in self.inputs_map[op.name]]) - wt_output_name = wt_op.name + ":0" - wt_op.output.extend([wt_output_name]) - wt_output_shape = mace_pb2.OutputShape() - wt_output_width = output_shape[0] * (( - output_shape[1] + 1) / 2) * ((output_shape[2] + 1) / 2) - wt_output_shape.dims.extend( - [16, filter_shape[3], wt_output_width, 1]) - wt_op.output_shape.extend([wt_output_shape]) - - # MatMul - matmul_op = mace_pb2.OperatorDef() - arg = matmul_op.arg.add() - arg.name = 'T' - arg.i = self.dt - matmul_op.name = op.name + '_matmul' - matmul_op.type = 'MatMul' - matmul_op.input.extend([filter_name, wt_output_name]) - matmul_output_name = matmul_op.name + ":0" - matmul_op.output.extend([matmul_output_name]) - matmul_output_shape = mace_pb2.OutputShape() - matmul_output_shape.dims.extend( - [16, filter_shape[2], wt_output_width, 1]) - matmul_op.output_shape.extend([matmul_output_shape]) - - # Inverse transform - iwt_op = 
mace_pb2.OperatorDef() - arg = iwt_op.arg.add() - arg.name = 'T' - arg.i = self.dt - batch_arg = iwt_op.arg.add() - batch_arg.name = 'batch' - batch_arg.i = output_shape[0] - height_arg = iwt_op.arg.add() - height_arg.name = 'height' - height_arg.i = output_shape[1] - width_arg = iwt_op.arg.add() - width_arg.name = 'width' - width_arg.i = output_shape[2] - iwt_op.name = op.name + '_inverse_transform' - iwt_op.type = 'WinogradInverseTransform' - iwt_op.input.extend([matmul_output_name]) - - # Add Bias - if len(op.data) == 2: - bias_tensor_name = op.name + '_bias:0' - bias_data = op.data[1].reshape(-1) - self.add_tensor(bias_tensor_name, bias_data) - output_name = self.add_buffer_to_image(bias_tensor_name, - "ARGUMENT") - iwt_op.input.extend([output_name]) - - final_op = op - final_op.output_shape_map[final_op.layer.top[0]] = output_shape - self.resolved_ops.add(op.name) - - if len(self.ops_map[final_op.name].children) == 1 and \ - self.ops_map[final_op.name].children[0].type \ - in activation_name_map: - activation_op = self.ops_map[final_op.name].children[0] - fused_act_arg = iwt_op.arg.add() - fused_act_arg.name = 'activation' - fused_act_arg.s = activation_name_map[activation_op.type] - final_op = activation_op - final_op.output_shape_map[final_op.layer.top[0]] = output_shape - self.resolved_ops.add(activation_op.name) - - iwt_op.output.extend([final_op.name + ':0']) - self.add_output_shape(iwt_op, output_shape) - self.net_def.op.extend([wt_op, matmul_op, iwt_op]) - - def convert_batchnorm(self, op): - if len(op.children) != 1 or op.children[0].type != 'Scale': - raise Exception('Now only support BatchNorm+Scale') - op_def = self.CommonConvert(op, 'FoldedBatchNorm') - scale_op = op.children[0] - - epsilon_value = op.layer.batch_norm_param.eps - if op.data[2][0] != 0: - mean_value = (1. / op.data[2][0]) * op.data[0] - var_value = (1. 
/ op.data[2][0]) * op.data[1] - else: - raise RuntimeError('scalar is zero.') - - gamma_value = scale_op.data[0] - beta_value = np.zeros_like(mean_value) - if len(scale_op.data) == 2: - beta_value = scale_op.data[1] - - scale_value = (( - 1.0 / np.vectorize(math.sqrt)(var_value + epsilon_value)) * - gamma_value).reshape(-1) - offset_value = ((-mean_value * scale_value) + beta_value).reshape(-1) - input_names = [op.name + '_scale:0', op.name + '_offset:0'] - self.add_tensor(input_names[0], scale_value) - self.add_tensor(input_names[1], offset_value) - - if self.device == 'gpu': - for name in input_names: - output_name = self.add_buffer_to_image(name, "ARGUMENT") - op_def.input.extend([output_name]) - else: - op_def.input.extend([name for name in input_names]) - - self.resolved_ops.add(op.name) - self.resolved_ops.add(scale_op.name) - final_op = scale_op - - output_shape = op.get_single_parent().output_shape_map[op.layer.bottom[ - 0]] - - if len(self.ops_map[final_op.name].children) == 1 and \ - self.ops_map[final_op.name].children[0].type \ - in activation_name_map: - activation_op = self.ops_map[final_op.name].children[0] - fused_act_arg = op_def.arg.add() - fused_act_arg.name = 'activation' - fused_act_arg.s = activation_name_map[activation_op.type] - final_op = activation_op - final_op.output_shape_map[final_op.layer.top[0]] = output_shape - self.resolved_ops.add(activation_op.name) - - op_def.output.extend([final_op.name + ':0']) - self.add_output_shape(op_def, output_shape) - self.net_def.op.extend([op_def]) - - def convert_inner_product(self, op): - param = op.layer.inner_product_param - try: - if param.axis != 1 or param.transpose: - raise ValueError( - 'Do not support non-default axis and transpose ' - 'case for innner product') - except AttributeError: - pass - - op_def = self.CommonConvert(op, 'FC') - weight_tensor_name = op.name + '_weight:0' - if op.data[0].ndim not in [2, 4]: - raise ValueError('Unexpected weigth ndim.') - if op.data[0].ndim == 4 and 
list(op.data[0].shape[:2]) != [1, 1]: - raise ValueError( - 'Do not support 4D weight with shape [1, 1, *, *]') - input_shape = op.get_single_parent().output_shape_map[op.layer.bottom[ - 0]] - - weight_data = op.data[0].reshape(-1, op.data[0].shape[-1]) - assert weight_data.shape[1] == ( - input_shape[1] * input_shape[2] * input_shape[3]) - if self.device != 'cpu': - weight_data = weight_data.reshape(-1, input_shape[3], - input_shape[1], input_shape[2]) - weight_data = weight_data.transpose((0, 2, 3, 1)).reshape( - weight_data.shape[0], -1) - self.add_tensor(weight_tensor_name, weight_data) - if self.device == 'gpu': - if (weight_data.shape[0] + 3) / 4 > OPENCL_IMAGE_MAX_SIZE and \ - (weight_data.shape[1] + 3) / 4 > OPENCL_IMAGE_MAX_SIZE: - raise Exception( - 'Mace gpu do not support FC with weight shape: ' + - str(weight_data.shape)) - if input_shape[3] % 4 == 0: - buffer_type = "WEIGHT_WIDTH" - else: - buffer_type = "WEIGHT_HEIGHT" - weight_type_arg = op_def.arg.add() - weight_type_arg.name = 'weight_type' - weight_type_arg.i = buffer_type_map['WEIGHT_HEIGHT'] - - if buffer_type == "WEIGHT_HEIGHT" and \ - (weight_data.shape[0] + 3) / 4 > OPENCL_IMAGE_MAX_SIZE: - raise Exception( - 'Mace gpu do not support FC with weight shape: ' + - str(weight_data.shape)) - output_name = self.add_buffer_to_image(weight_tensor_name, - buffer_type) - op_def.input.extend([output_name]) - else: - op_def.input.extend([weight_tensor_name]) - - # Add Bias - if len(op.data) == 2: - bias_tensor_name = op.name + '_bias:0' - bias_data = op.data[1].reshape(-1) - self.add_tensor(bias_tensor_name, bias_data) - if self.device == 'gpu': - output_name = self.add_buffer_to_image(bias_tensor_name, - "ARGUMENT") - op_def.input.extend([output_name]) - else: - op_def.input.extend([bias_tensor_name]) - - self.resolved_ops.add(op.name) - input_format = 'NCHW' if self.device == 'cpu' else 'NHWC' - output_shape = Shapes.fully_connected_shape(input_shape, - weight_data.shape, - input_format) - 
op.output_shape_map[op.layer.top[0]] = output_shape - final_op = op - - if len(self.ops_map[final_op.name].children) == 1 \ - and self.ops_map[final_op.name].children[0].type \ - in activation_name_map: - activation_op = self.ops_map[final_op.name].children[0] - fused_act_arg = op_def.arg.add() - fused_act_arg.name = 'activation' - fused_act_arg.s = activation_name_map[activation_op.type] - final_op = activation_op - final_op.output_shape_map[final_op.layer.top[0]] = output_shape - self.resolved_ops.add(activation_op.name) - - op_def.output.extend([final_op.name + ':0']) - self.add_output_shape(op_def, output_shape) - self.net_def.op.extend([op_def]) - - def convert_pooling(self, op): - op_def = self.CommonConvert(op, 'Pooling') - - param = op.layer.pooling_param - paddings, strides, kernels = self.add_stride_pad_kernel_arg( - param, op_def) - if param.pool == caffe_pb2.PoolingParameter.MAX: - pooling_type = "MaxPool" - elif param.pool == caffe_pb2.PoolingParameter.AVE: - pooling_type = "AvgPool" - pooling_type_arg = op_def.arg.add() - pooling_type_arg.name = 'pooling_type' - pooling_type_arg.i = pooling_type_mode[pooling_type] - - input_shape = op.get_single_parent().output_shape_map[op.layer.bottom[ - 0]] - if param.HasField('global_pooling') and param.global_pooling: - kernels = [input_shape[2], input_shape[3]] \ - if self.device == 'cpu' else \ - [input_shape[1], input_shape[2]] - - kernel_arg = op_def.arg.add() - kernel_arg.name = 'kernels' - kernel_arg.ints.extend(kernels) - - if self.device != 'cpu': - filter_shape = [ - kernels[0], kernels[1], input_shape[3], input_shape[3] - ] - else: - filter_shape = [ - input_shape[1], input_shape[1], kernels[0], kernels[1] - ] - input_format = 'NCHW' if self.device == 'cpu' else 'NHWC' - output_shape = Shapes.conv_pool_shape(input_shape, filter_shape, - paddings, strides, [1, 1], - math.ceil, input_format) - op.output_shape_map[op.layer.top[0]] = output_shape - - op_def.output.extend([op.name + ':0']) - 
self.add_output_shape(op_def, output_shape) - self.net_def.op.extend([op_def]) - self.resolved_ops.add(op.name) - - def convert_activation(self, op): - op_def = self.CommonConvert(op, 'Activation') - activation_arg = op_def.arg.add() - activation_arg.name = 'activation' - activation_arg.s = activation_name_map[op.type] - op_def.output.extend([op.name + ':0']) - output_shape = op.get_single_parent().output_shape_map[op.layer.bottom[ - 0]] - op.output_shape_map[op.layer.top[0]] = output_shape - self.add_output_shape(op_def, output_shape) - self.net_def.op.extend([op_def]) - self.resolved_ops.add(op.name) - - def convert_prelu(self, op): - op_def = self.CommonConvert(op, 'Activation') - activation_arg = op_def.arg.add() - activation_arg.name = 'activation' - activation_arg.s = 'PRELU' - alpha_tensor_name = op.name + '_alpha:0' - alpha_data = op.data[0].reshape(-1) - self.add_tensor(alpha_tensor_name, alpha_data) - if self.device == 'gpu': - output_name = self.add_buffer_to_image(alpha_tensor_name, - "ARGUMENT") - op_def.input.extend([output_name]) - else: - op_def.input.extend([alpha_tensor_name]) - op_def.output.extend([op.name + ':0']) - output_shape = op.get_single_parent().output_shape_map[op.layer.bottom[ - 0]] - op.output_shape_map[op.layer.top[0]] = output_shape - self.add_output_shape(op_def, output_shape) - self.net_def.op.extend([op_def]) - self.resolved_ops.add(op.name) - - def convert_add(self, op): - op_def = self.CommonConvert(op, 'AddN') - op_def.output.extend([op.name + ':0']) - output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]] - op.output_shape_map[op.layer.top[0]] = output_shape - self.add_output_shape(op_def, output_shape) - self.net_def.op.extend([op_def]) - self.resolved_ops.add(op.name) - - def convert_concat(self, op): - op_def = self.CommonConvert(op, 'Concat') - axis_arg = op_def.arg.add() - axis_arg.name = 'axis' - axis_arg.i = 3 if self.device != 'cpu' else 1 - try: - if op.layer.concat_param.HasFeild('axis'): - axis_arg.i = 
op.concat_param.axis - elif op.layer.concat_param.HasFeild('concat_dim'): - axis_arg.i = op.concat_param.concat_dim - except AttributeError: - pass - - input_shapes = [] - for i in range(len(op.parents)): - input_shapes.append( - op.parents[i].output_shape_map[op.layer.bottom[i]]) - output_shape = Shapes.concat_shape(input_shapes, axis_arg.i) - op.output_shape_map[op.layer.top[0]] = output_shape - self.add_output_shape(op_def, output_shape) - op_def.output.extend([op.name + ':0']) - self.net_def.op.extend([op_def]) - self.resolved_ops.add(op.name) - - def convert_eltwise(self, op): - op_def = self.CommonConvert(op, 'Eltwise') - param = op.layer.eltwise_param - type_arg = op_def.arg.add() - type_arg.name = 'type' - type_arg.i = math_type_mode[param.operation] - if len(param.coeff) > 0: - coeff_arg = op_def.arg.add() - coeff_arg.name = 'coeff' - coeff_arg.floats.extend(list(param.coeff)) - - output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]] - op.output_shape_map[op.layer.top[0]] = output_shape - self.add_output_shape(op_def, output_shape) - op_def.output.extend([op.name + ':0']) - self.net_def.op.extend([op_def]) - self.resolved_ops.add(op.name) - - def convert_slice(self, op): - op_def = self.CommonConvert(op, 'Slice') - if op.layer.HasField('slice_param'): - param = op.layer.slice_param - if param.HasField('axis') and param.axis != 1: - raise Exception( - 'Mace do not support slice with axis ' + str(param.axis)) - if len(param.slice_point) > 0: - raise Exception('Mace do not support slice with slice_point') - - axis_arg = op_def.arg.add() - axis_arg.name = 'axis' - axis_arg.i = 3 if self.device != 'cpu' else 1 - - input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]] - num_outputs = len(op.layer.top) - input_channels = input_shape[axis_arg.i] - if (input_channels % num_outputs) != 0 or \ - (self.device == 'gpu' and - ((input_channels / num_outputs) % 4 != 0)): - raise Exception( - 'Mace do not support slice with input shape ' + - 
str(input_shape) + ' and number of output ' + str(num_outputs)) - input_format = 'NCHW' if self.device == 'cpu' else 'NHWC' - output_shape = Shapes.slice_shape(input_shape, num_outputs, - input_format) - for i in range(len(op.layer.top)): - op.output_shape_map[op.layer.top[i]] = output_shape - self.add_output_shape(op_def, output_shape) - op_def.output.extend([op.name + '_' + str(i) + ':0']) - self.net_def.op.extend([op_def]) - self.resolved_ops.add(op.name) - - def convert_normal_op(self, op): - op_def = self.CommonConvert(op, op.type) - output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]] - op.output_shape_map[op.layer.top[0]] = output_shape - self.add_output_shape(op_def, output_shape) - op_def.output.extend([op.name + ':0']) - self.net_def.op.extend([op_def]) - self.resolved_ops.add(op.name) - - def convert_reshape(self, op): - op_def = self.CommonConvert(op, 'Reshape') - input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]] - output_shape = input_shape - shape_param = np.asarray(op.layer.reshape_param.shape.dim) - for i in range(len(shape_param)): - if shape_param[i] != 0: - output_shape[i] = shape_param[i] - shape_arg = op_def.arg.add() - shape_arg.name = 'shape' - shape_arg.ints.extend(output_shape) - op.output_shape_map[op.layer.top[0]] = output_shape - self.add_output_shape(op_def, output_shape) - op_def.output.extend([op.name + ':0']) - self.net_def.op.extend([op_def]) - self.resolved_ops.add(op.name) - - def convert_proposal_op(self, op): - assert self.device == 'cpu' - op_def = self.CommonConvert(op, op.type) - if op.layer.HasField('proposal_param'): - proposal_param = op.layer.proposal_param - feat_stride_arg = op_def.arg.add() - feat_stride_arg.name = 'feat_stride' - feat_stride_arg.i = proposal_param.feat_stride - scales_arg = op_def.arg.add() - scales_arg.name = 'scales' - scales_arg.ints.extend(list(proposal_param.scales)) - ratios_arg = op_def.arg.add() - ratios_arg.name = 'ratios' - 
ratios_arg.floats.extend(list(proposal_param.ratios)) - output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]] - op.output_shape_map[op.layer.top[0]] = output_shape - self.add_output_shape(op_def, output_shape) - op_def.output.extend([op.name + ':0']) - self.net_def.op.extend([op_def]) - self.resolved_ops.add(op.name) - - def convert_psroi_align(self, op): - assert self.device == 'cpu' - op_def = self.CommonConvert(op, op.type) - if op.layer.HasField('psroi_align_param'): - psroi_align_param = op.layer.psroi_align_param - spatial_scale_arg = op_def.arg.add() - spatial_scale_arg.name = 'spatial_scale' - spatial_scale_arg.f = psroi_align_param.spatial_scale - output_dim_arg = op_def.arg.add() - output_dim_arg.name = 'output_dim' - output_dim_arg.i = psroi_align_param.output_dim - group_size_arg = op_def.arg.add() - group_size_arg.name = 'group_size' - group_size_arg.i = psroi_align_param.group_size - output_shape = op.parents[0].output_shape_map[op.layer.bottom[0]] - op.output_shape_map[op.layer.top[0]] = output_shape - self.add_output_shape(op_def, output_shape) - op_def.output.extend([op.name + ':0']) - self.net_def.op.extend([op_def]) - self.resolved_ops.add(op.name) - - def replace_in_out_name(self, input_names, output_names): - in_names = set([input_name + ":0" for input_name in input_names]) - out_names = set([output_name + ":0" for output_name in output_names]) - for op in self.net_def.op: - for i in range(len(op.input)): - if op.input[i] in in_names: - op.input[i] = MACE_INPUT_NODE_NAME + '_' + op.input[i] - if op.input[i] in out_names: - op.input[i] = MACE_OUTPUT_NODE_NAME + '_' + op.input[i] - for i in range(len(op.output)): - if op.output[i] in in_names: - op.output[i] = MACE_INPUT_NODE_NAME + '_' + op.output[i] - if op.output[i] in out_names: - op.output[i] = MACE_OUTPUT_NODE_NAME + '_' + op.output[i] - - def add_input_op_shape(self, input_nodes, input_shapes): - assert len(input_nodes) == len(input_shapes) - for i in range(len(input_nodes)): - 
input_op = self.ops_map[input_nodes[i]] - input_shape = input_shapes[i] if self.device != 'cpu' else \ - [input_shapes[i][0], input_shapes[i][3], - input_shapes[i][1], input_shapes[i][2]] - if input_op.layer is not None: - input_op.output_shape_map[input_op.layer.top[0]] = input_shape - else: - input_op.output_shape_map[input_op.name] = input_shape - - def add_cpu_input_transform(self, names): - for name in names: - new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0" - op_def = self.net_def.op.add() - op_def.name = name - op_def.type = 'Transpose' - op_def.input.extend([new_input_name]) - op_def.output.extend([name + ':0']) - - dims_arg = op_def.arg.add() - dims_arg.name = 'dims' - dims_arg.ints.extend([0, 3, 1, 2]) # NHWC -> NCHW - - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - - input_op = self.ops_map[name] - if input_op.layer is not None: - output_shape = input_op.output_shape_map[input_op.layer.top[0]] - else: - output_shape = input_op.output_shape_map[input_op.name] - self.add_output_shape(op_def, output_shape) - - def add_cpu_output_transform(self, names): - for name in names: - output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0" - op_def = self.net_def.op.add() - op_def.name = output_name[:-2] - op_def.type = 'Transpose' - op_def.input.extend([name + ':0']) - op_def.output.extend([output_name]) - - dims_arg = op_def.arg.add() - dims_arg.name = 'dims' - dims_arg.ints.extend([0, 2, 3, 1]) # NCHW -> NHWC - - input_op = self.ops_map[name] - if input_op.layer is not None: - output_shape = input_op.output_shape_map[input_op.layer.top[0]] - else: - output_shape = input_op.output_shape_map[input_op.name] - self.add_output_shape(op_def, - [output_shape[0], output_shape[2], - output_shape[3], output_shape[1]]) - - def convert(self, input_nodes, input_shapes, output_nodes): - assert self.ops[0].type == 'Input' - self.add_input_op_shape(input_nodes, input_shapes) - - if self.device == 'gpu': - self.add_gpu_input_transform(input_nodes) - - if 
self.device == 'cpu': - self.add_cpu_input_transform(input_nodes) - - for op in self.ops: - if op.name in self.resolved_ops: - continue - if op.type == 'Input': - self.resolved_ops.add(op.name) - elif op.type == 'Convolution': - if self.device == 'gpu' and self.check_winograd_conv(op): - self.convert_winograd_conv_gpu(op) - else: - self.convert_conv2d(op) - elif op.type == 'BatchNorm': - self.convert_batchnorm(op) - elif op.type == 'InnerProduct': - self.convert_inner_product(op) - elif op.type == 'Pooling': - self.convert_pooling(op) - elif op.type == 'PReLU': - self.convert_prelu(op) - elif op.type in ['ReLU', 'Sigmoid', 'TanH']: - self.convert_activation(op) - elif op.type == 'Add': - self.convert_add(op) - elif op.type == 'Concat': - self.convert_concat(op) - elif op.type == 'Eltwise': - self.convert_eltwise(op) - elif op.type == 'Slice': - self.convert_slice(op) - elif op.type == 'Reshape': - self.convert_reshape(op) - elif op.type == 'Proposal': - self.convert_proposal_op(op) - elif op.type == 'PSROIAlign': - self.convert_psroi_align(op) - elif op.type in ['Softmax']: - self.convert_normal_op(op) - else: - raise Exception('Unknown Op: %s, type: %s' % (op.name, - op.type)) - - if self.device == 'gpu': - self.add_gpu_output_transform(output_nodes) - - if self.device == 'cpu': - self.add_cpu_output_transform(output_nodes) - - for op in self.ops: - if op.name not in self.resolved_ops: - print 'Unresolve Op: %s with type %s' % (op.name, op.type) - - -def convert_to_mace_pb(model_file, weight_file, input_node_str, - input_shape_str, output_node_str, data_type, device, - winograd): - net_def = mace_pb2.NetDef() - dt = data_type_map[data_type] - - caffe_net = caffe_pb2.NetParameter() - with open(model_file, "r") as f: - google.protobuf.text_format.Merge(str(f.read()), caffe_net) - - weights = caffe_pb2.NetParameter() - with open(weight_file, "rb") as f: - weights.MergeFromString(f.read()) - - input_nodes = [x for x in input_node_str.split(',')] - input_shapes = [] - 
if input_shape_str != "": - input_shape_strs = [x for x in input_shape_str.split(':')] - for shape_str in input_shape_strs: - input_shapes.extend([[int(x) for x in shape_str.split(',')]]) - output_nodes = [x for x in output_node_str.split(',')] - assert len(input_nodes) == len(input_shapes) - - converter = CaffeConverter(caffe_net, weights, net_def, dt, device, - winograd) - converter.convert(input_nodes, input_shapes, output_nodes) - print "PB Converted." - if device == 'gpu': - print "start optimize memory." - memory_optimizer.optimize_gpu_memory(net_def) - print "Memory optimization done." - elif device == 'cpu': - print "start optimize memory." - memory_optimizer.optimize_cpu_memory(net_def) - print "Memory optimization done." - - return net_def diff --git a/mace/python/tools/convert_util.py b/mace/python/tools/convert_util.py index c4fcba43..4643931b 100644 --- a/mace/python/tools/convert_util.py +++ b/mace/python/tools/convert_util.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+ import tensorflow as tf from mace.proto import mace_pb2 @@ -40,3 +41,8 @@ def tf_dtype_2_mace_dtype(tf_dtype): if not mace_dtype: raise Exception("Not supported tensorflow dtype: " + tf_dtype) return mace_dtype + + +def mace_check(condition, msg): + if not condition: + raise Exception(msg) diff --git a/mace/python/tools/converter.py b/mace/python/tools/converter.py index fffa8d48..dda67432 100644 --- a/mace/python/tools/converter.py +++ b/mace/python/tools/converter.py @@ -16,7 +16,16 @@ import argparse import sys import hashlib import os.path + +from mace.proto import mace_pb2 +from mace.python.tools import tf_dsp_converter_lib +from mace.python.tools import memory_optimizer from mace.python.tools import source_converter_lib +from mace.python.tools.converter_tool import base_converter as cvt +from mace.python.tools.converter_tool import tensorflow_converter +from mace.python.tools.converter_tool import caffe_converter +from mace.python.tools.converter_tool import transformer + # ./bazel-bin/mace/python/tools/tf_converter --model_file quantized_test.pb \ # --output quantized_test_dsp.pb \ @@ -25,6 +34,12 @@ from mace.python.tools import source_converter_lib FLAGS = None +data_type_map = {'DT_HALF': mace_pb2.DT_HALF, + 'DT_FLOAT': mace_pb2.DT_FLOAT} +device_type_map = {'cpu': mace_pb2.CPU, + 'gpu': mace_pb2.GPU, + 'dsp': mace_pb2.HEXAGON} + def file_checksum(fname): hash_func = hashlib.sha256() @@ -34,6 +49,10 @@ def file_checksum(fname): return hash_func.hexdigest() +def parse_int_array_from_str(ints_str): + return [int(int_str) for int_str in ints_str.split(',')] + + def main(unused_args): if not os.path.isfile(FLAGS.model_file): print("Input graph file '" + FLAGS.model_file + "' does not exist!") @@ -59,27 +78,64 @@ def main(unused_args): (weight_checksum, FLAGS.weight_checksum)) sys.exit(-1) - if FLAGS.runtime == 'dsp': - print("DSP not support caffe model yet.") - sys.exit(-1) + if FLAGS.platform not in ['tensorflow', 'caffe']: + print ("platform %s is not 
supported." % FLAGS.platform) + sys.exit(-1) + if FLAGS.runtime not in ['cpu', 'gpu', 'dsp']: + print ("runtime %s is not supported." % FLAGS.runtime) + sys.exit(-1) - from mace.python.tools import caffe_converter_lib - output_graph_def = caffe_converter_lib.convert_to_mace_pb( - FLAGS.model_file, FLAGS.weight_file, FLAGS.input_node, - FLAGS.input_shape, FLAGS.output_node, FLAGS.data_type, - FLAGS.runtime, FLAGS.winograd) - elif FLAGS.platform == 'tensorflow': - if FLAGS.runtime == 'dsp': - from mace.python.tools import tf_dsp_converter_lib + if FLAGS.runtime == 'dsp': + if FLAGS.platform == 'tensorflow': output_graph_def = tf_dsp_converter_lib.convert_to_mace_pb( FLAGS.model_file, FLAGS.input_node, FLAGS.output_node, FLAGS.dsp_mode) else: - from mace.python.tools import tf_converter_lib - output_graph_def = tf_converter_lib.convert_to_mace_pb( - FLAGS.model_file, FLAGS.input_node, FLAGS.input_shape, - FLAGS.output_node, FLAGS.data_type, FLAGS.runtime, - FLAGS.winograd) + print("%s does not support dsp runtime yet." 
% FLAGS.platform) + sys.exit(-1) + else: + option = cvt.ConverterOption() + option.data_type = data_type_map[FLAGS.data_type] + option.device = device_type_map[FLAGS.runtime] + option.winograd_enabled = bool(FLAGS.winograd) + + input_node_names = FLAGS.input_node.split(',') + input_node_shapes = FLAGS.input_shape.split(':') + if len(input_node_names) != len(input_node_shapes): + raise Exception('input node count and shape count do not match.') + for i in xrange(len(input_node_names)): + input_node = cvt.NodeInfo() + input_node.name = input_node_names[i] + input_node.shape = parse_int_array_from_str(input_node_shapes[i]) + option.add_input_node(input_node) + + output_node_names = FLAGS.output_node.split(',') + for i in xrange(len(output_node_names)): + output_node = cvt.NodeInfo() + output_node.name = output_node_names[i] + option.add_output_node(output_node) + + print("Convert model to mace model.") + if FLAGS.platform == 'tensorflow': + converter = tensorflow_converter.TensorflowConverter(option, + FLAGS.model_file) # noqa + elif FLAGS.platform == 'caffe': + converter = caffe_converter.CaffeConverter(option, + FLAGS.model_file, + FLAGS.weight_file) + + output_graph_def = converter.run() + print("Transform model to one that can better run on device.") + # TODO(liuqi/liyin): transform gpu/cpu and merge their ops + mace_transformer = transformer.Transformer(option, output_graph_def) + output_graph_def = mace_transformer.run() + + print "start optimize memory." + if FLAGS.runtime == 'gpu': + memory_optimizer.optimize_gpu_memory(output_graph_def) + elif FLAGS.runtime == 'cpu': + memory_optimizer.optimize_cpu_memory(output_graph_def) + print "Memory optimization done." 
if FLAGS.output_type == 'source': source_converter_lib.convert_to_source( diff --git a/mace/python/tools/converter_tool/__init__.py b/mace/python/tools/converter_tool/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/mace/python/tools/converter_tool/base_converter.py b/mace/python/tools/converter_tool/base_converter.py new file mode 100644 index 00000000..7e1039b9 --- /dev/null +++ b/mace/python/tools/converter_tool/base_converter.py @@ -0,0 +1,259 @@ +from enum import Enum + +from mace.proto import mace_pb2 + + +class DataFormat(Enum): + NHWC = 0 + NCHW = 1 + + +class FilterFormat(Enum): + HWIO = 0 + OIHW = 1 + HWOI = 2 + + +class PaddingMode(Enum): + VALID = 0 + SAME = 1 + FULL = 2 + + +class PoolingType(Enum): + AVG = 1 + MAX = 2 + + +class ActivationType(Enum): + NOOP = 0 + RELU = 1 + RELUX = 2 + PRELU = 3 + TANH = 4 + SIGMOID = 5 + + +class EltwiseType(Enum): + SUM = 0 + SUB = 1 + PROD = 2 + DIV = 3 + MIN = 4 + MAX = 5 + NEG = 6 + ABS = 7 + SQR_DIFF = 8 + POW = 9 + + +MaceSupportedOps = [ + 'Activation', + 'AddN', + 'BatchNorm', + 'BatchToSpaceND', + 'BiasAdd', + 'ChannelShuffle', + 'Concat', + 'Conv2D', + 'Deconv2D', + 'DepthToSpace', + 'DepthwiseConv2d', + 'Dequantize', + 'Eltwise', + 'FoldedBatchNorm', + 'FullyConnected', + 'LocalResponseNorm', + 'MatMul', + 'Pad', + 'Pooling', + 'Proposal', + 'PSROIAlign', + 'Quantize', + 'Requantize', + 'Reshape', + 'ResizeBilinear', + 'Slice', + 'Softmax', + 'SpaceToBatchND', + 'SpaceToDepth', + 'Transpose', + 'WinogradInverseTransform', + 'WinogradTransform', +] + +MaceOp = Enum('MaceOp', [(op, op) for op in MaceSupportedOps], type=str) + + +class MaceKeyword(object): + # node related str + mace_input_node_name = 'mace_input_node' + mace_output_node_name = 'mace_output_node' + mace_buffer_type = 'buffer_type' + mace_mode = 'mode' + mace_buffer_to_image = 'BufferToImage' + mace_image_to_buffer = 'ImageToBuffer' + # arg related str + mace_padding_str = 'padding' + mace_padding_values_str = 
'padding_values' + mace_strides_str = 'strides' + mace_dilations_str = 'dilations' + mace_pooling_type_str = 'pooling_type' + mace_global_pooling_str = 'global_pooling' + mace_kernel_str = 'kernels' + mace_data_format_str = 'data_format' + mace_filter_format_str = 'filter_format' + mace_element_type_str = 'type' + mace_activation_type_str = 'activation' + mace_activation_max_limit_str = 'max_limit' + mace_resize_size_str = 'size' + mace_batch_to_space_crops_str = 'crops' + mace_paddings_str = 'paddings' + mace_align_corners_str = 'align_corners' + mace_space_batch_block_shape_str = 'block_shape' + mace_space_depth_block_size_str = 'block_size' + mace_constant_value_str = 'constant_value' + mace_dims_str = 'dims' + mace_axis_str = 'axis' + mace_shape_str = 'shape' + mace_winograd_filter_transformed = 'is_filter_transformed' + + +class ConverterInterface(object): + """Base class for converting external models to mace models.""" + + def run(self): + raise NotImplementedError('run') + + +class NodeInfo(object): + """A class for describing node information""" + + def __init__(self): + self._name = None + self._shape = [] + + @property + def name(self): + return self._name + + @property + def shape(self): + return self._shape + + @name.setter + def name(self, name): + self._name = name + + @shape.setter + def shape(self, shape): + self._shape = shape + + def __str__(self): + return '%s %s' % (self._name, str(self._shape)) + + +class ConverterOption(object): + """A class for specifying options passed to converter tool""" + + def __init__(self): + self._input_nodes = {} + self._output_nodes = {} + self._data_type = mace_pb2.DT_FLOAT + self._device = mace_pb2.CPU + self._winograd_enabled = False + + @property + def input_nodes(self): + return self._input_nodes + + @property + def output_nodes(self): + return self._output_nodes + + @property + def data_type(self): + return self._data_type + + @property + def device(self): + return self._device + + @property + def 
winograd_enabled(self): + return self._winograd_enabled + + @input_nodes.setter + def input_nodes(self, input_nodes): + for node in input_nodes: + self._input_nodes[node.name] = node + + def add_input_node(self, input_node): + self._input_nodes[input_node.name] = input_node + + @output_nodes.setter + def output_nodes(self, output_nodes): + for node in output_nodes: + self.output_nodes[node.name] = node + + def add_output_node(self, output_node): + self._output_nodes[output_node.name] = output_node + + @data_type.setter + def data_type(self, data_type): + self._data_type = data_type + + @device.setter + def device(self, device): + self._device = device + + @winograd_enabled.setter + def winograd_enabled(self, winograd_enabled): + self._winograd_enabled = winograd_enabled + + +class ConverterUtil(object): + @staticmethod + def get_arg(op, arg_name): + for arg in op.arg: + if arg.name == arg_name: + return arg + return None + + @staticmethod + def add_data_format_arg(op, data_format): + data_format_arg = op.arg.add() + data_format_arg.name = MaceKeyword.mace_data_format_str + data_format_arg.i = data_format.value + + @staticmethod + def data_format(op): + arg = ConverterUtil.get_arg(op, MaceKeyword.mace_data_format_str) + if arg is None: + return None + elif arg.i == DataFormat.NHWC.value: + return DataFormat.NHWC + elif arg.i == DataFormat.NCHW.value: + return DataFormat.NCHW + else: + return None + + @staticmethod + def set_filter_format(net, filter_format): + arg = net.arg.add() + arg.name = MaceKeyword.mace_filter_format_str + arg.i = filter_format.value + + @staticmethod + def filter_format(net): + arg = ConverterUtil.get_arg(net, MaceKeyword.mace_filter_format_str) + if arg is None: + return None + elif arg.i == FilterFormat.HWIO.value: + return FilterFormat.HWIO + elif arg.i == FilterFormat.HWOI.value: + return FilterFormat.HWOI + elif arg.i == FilterFormat.OIHW.value: + return FilterFormat.OIHW + else: + return None diff --git 
a/mace/python/tools/converter_tool/caffe_converter.py b/mace/python/tools/converter_tool/caffe_converter.py new file mode 100644 index 00000000..a0298bb1 --- /dev/null +++ b/mace/python/tools/converter_tool/caffe_converter.py @@ -0,0 +1,508 @@ +import math +import numpy as np +import google.protobuf.text_format + +from mace.proto import mace_pb2 +from mace.third_party.caffe import caffe_pb2 +from mace.python.tools.converter_tool import base_converter +from mace.python.tools.converter_tool import shape_inference +from mace.python.tools.converter_tool.base_converter import PoolingType +from mace.python.tools.converter_tool.base_converter import ActivationType +from mace.python.tools.converter_tool.base_converter import EltwiseType +from mace.python.tools.converter_tool.base_converter import DataFormat +from mace.python.tools.converter_tool.base_converter import FilterFormat +from mace.python.tools.converter_tool.base_converter import MaceOp +from mace.python.tools.converter_tool.base_converter import MaceKeyword +from mace.python.tools.converter_tool.base_converter import ConverterUtil +from mace.python.tools.convert_util import mace_check + +caffe_group_str = 'group' +caffe_kernel_h_str = 'kernel_h' +caffe_kernel_w_str = 'kernel_w' +caffe_stride_h_str = 'stride_h' +caffe_stride_w_str = 'stride_w' +caffe_pad_h_str = 'pad_h' +caffe_pad_w_str = 'pad_w' + + +class CaffeOperator(object): + """CaffeOperator merges and provides both layer and weights information. + Layer records caffe layer proto, while blobs records the weight data in + format of numpy ndarray. 
+ """ + def __init__(self): + self._layer = None + self._blobs = None + + @property + def name(self): + return self._layer.name + + @property + def type(self): + return self._layer.type + + @property + def layer(self): + return self._layer + + @property + def blobs(self): + return self._blobs + + @layer.setter + def layer(self, layer): + self._layer = layer + + @blobs.setter + def blobs(self, blobs): + self._blobs = [self.blob_to_nparray(blob) for blob in blobs] + + def get_blob(self, index): + mace_check(index < len(self._blobs), "blob out of index") + return self._blobs[index] + + @staticmethod + def blob_to_nparray(blob): + if blob.num != 0: + return (np.asarray(blob.data, dtype=np.float32).reshape( + (blob.num, blob.channels, blob.height, blob.width))) + else: + return np.asarray(blob.data, dtype=np.float32).reshape( + blob.shape.dim) + + +class CaffeNet(object): + """CaffeNet contains caffe operations. Output of each layer has unique + name as we replace duplicated output name with unique one, while keep + mace input/output name which user specifies unchanged.""" + def __init__(self): + self._ops = {} + self._consumers = {} + # for in-place op, its input name is the same with output name, + # so we change the output name to an alias + self._alias_op_output_name = {} + self._used_op_output_name = set() + + @property + def ops(self): + return self._ops.values() + + def get_op(self, op_name): + return self._ops.get(op_name, None) + + def get_consumers(self, tensor_name): + return self._consumers.get(tensor_name, []) + + def add_layer(self, layer): + op = CaffeOperator() + op.layer = layer + self._ops[layer.name] = op + + # change op output name if it is an in-place op + layer.bottom[:] = [self._alias_op_output_name.get(layer_input, + layer_input) for + layer_input in layer.bottom][:] + for i in xrange(len(layer.top)): + old_name = layer.top[i] + if layer.type == 'Input': + new_name = old_name + else: + idx = 0 + new_name = old_name + '#' + str(idx) + while 
new_name in self._used_op_output_name: + idx += 1 + new_name = old_name + '#' + str(idx) + layer.top[i] = new_name + self._alias_op_output_name[old_name] = new_name + self._used_op_output_name.update([new_name]) + + for input_tensor in layer.bottom: + if input_tensor not in self._consumers: + self._consumers[input_tensor] = [] + self._consumers[input_tensor].append(op) + + def add_blob(self, weight): + if weight.name in self._ops: + op = self._ops[weight.name] + op.blobs = list(weight.blobs) + + +class CaffeConverter(base_converter.ConverterInterface): + """A class for convert caffe model to mace model.""" + + pooling_type_mode = { + caffe_pb2.PoolingParameter.AVE: PoolingType.AVG, + caffe_pb2.PoolingParameter.MAX: PoolingType.MAX + } + eltwise_type = { + caffe_pb2.EltwiseParameter.PROD: EltwiseType.PROD, + caffe_pb2.EltwiseParameter.SUM: EltwiseType.SUM, + caffe_pb2.EltwiseParameter.MAX: EltwiseType.MAX, + } + activation_type = { + 'ReLU': ActivationType.RELU, + 'PReLU': ActivationType.PRELU, + 'TanH': ActivationType.TANH, + } + + def __init__(self, option, src_model_file, src_weight_file): + self._op_converters = { + 'Input': self.convert_nop, + 'Convolution': self.convert_conv2d, + 'Eltwise': self.convert_elementwise, + 'Add': self.convert_add, + 'ReLU': self.convert_activation, + 'TanH': self.convert_activation, + 'Sigmoid': self.convert_activation, + 'PReLU': self.convert_activation, + 'Pooling': self.convert_pooling, + 'Concat': self.convert_concat, + 'Slice': self.convert_slice, + 'Softmax': self.convert_softmax, + 'InnerProduct': self.convert_fully_connected, + 'BatchNorm': self.convert_folded_batchnorm, + } + self._option = option + self._mace_net_def = mace_pb2.NetDef() + ConverterUtil.set_filter_format(self._mace_net_def, FilterFormat.OIHW) + self._caffe_net = CaffeNet() + self._caffe_layers = caffe_pb2.NetParameter() + caffe_weights = caffe_pb2.NetParameter() + + # parse prototxt + with open(src_model_file, 'rb') as f: + 
google.protobuf.text_format.Merge( + str(f.read()), self._caffe_layers) + self.filter_test_layers(self._caffe_layers) + for layer in self._caffe_layers.layer: + self._caffe_net.add_layer(layer) + + # parse model weight + with open(src_weight_file, 'rb') as f: + caffe_weights.ParseFromString(f.read()) + self.filter_test_layers(caffe_weights) + for weight in caffe_weights.layer: + self._caffe_net.add_blob(weight) + + self._skip_ops = [] + + def run(self): + self.convert_ops() + shape_inferer = shape_inference.ShapeInference( + self._mace_net_def, + self._option.input_nodes.values()) + shape_inferer.run() + self.replace_output_tensor_name() + return self._mace_net_def + + @staticmethod + def replace_input_name(ops, src_name, dst_name): + for op in ops: + for i in xrange(len(op.input)): + if op.input[i] == src_name: + op.input[i] = dst_name + + def replace_output_tensor_name(self): + consumers = {} + for op in self._mace_net_def.op: + for input_name in op.input: + if input_name not in consumers: + consumers[input_name] = [] + consumers[input_name].append(op) + + # replace the last op with same prefix name with the original top name + ops = [op for op in self._mace_net_def.op] + ops.reverse() + visited = set() + for op in ops: + for i in xrange(len(op.output)): + original_output_name = op.output[i].split('#')[0] + if original_output_name not in visited: + self.replace_input_name( + consumers.get(op.output[i], []), + op.output[i], + original_output_name) + op.output[i] = original_output_name + visited.update([original_output_name]) + + # if user set op name as output node, replace it with op name + for op in self._mace_net_def.op: + if op.name in self._option.output_nodes: + if len(op.output) > 0: + self.replace_input_name( + consumers.get(op.output[0], []), + op.output, + op.name) + op.output[0] = op.name + + @staticmethod + def filter_test_layers(layers): + phase_map = {0: 'train', 1: 'test'} + while True: + changed = False + for layer in layers.layer: + phase = 'test' 
+ if len(layer.include): + phase = phase_map[layer.include[0].phase] + if len(layer.exclude): + phase = phase_map[layer.exclude[0].phase] + if phase != 'test' or layer.type == 'Dropout': + print ("Remove layer %s (%s)" % (layer.name, layer.type)) + layers.layer.remove(layer) + changed = True + break + if not changed: + break + + @staticmethod + def add_stride_pad_kernel_arg(param, op_def): + try: + if len(param.stride) > 1 or len(param.kernel_size) > 1 or len( + param.pad) > 1: + raise Exception( + 'Mace does not support multiple stride/kernel_size/pad') + stride = [param.stride[0], + param.stride[0]] if len(param.stride) else [1, 1] + pad = [param.pad[0] * 2, + param.pad[0] * 2] if len(param.pad) else [0, 0] + kernel = [param.kernel_size[0], param.kernel_size[0]] if len( + param.kernel_size) else [0, 0] + except TypeError: + stride = [param.stride, param.stride] + pad = [param.pad * 2, param.pad * 2] + kernel = [param.kernel_size, param.kernel_size] + + if param.HasField(caffe_stride_h_str) or param.HasField( + caffe_stride_w_str): + stride = [param.stride_h, param.stride_w] + if param.HasField(caffe_pad_h_str) or param.HasField(caffe_pad_w_str): + pad = [param.pad_h * 2, param.pad_w * 2] + + strides_arg = op_def.arg.add() + strides_arg.name = MaceKeyword.mace_strides_str + strides_arg.ints.extend(stride) + padding_arg = op_def.arg.add() + padding_arg.name = MaceKeyword.mace_padding_values_str + padding_arg.ints.extend(pad) + + if op_def.type == MaceOp.Pooling.name: + if param.HasField(caffe_kernel_h_str) or param.HasField( + caffe_kernel_w_str): + kernel = [param.kernel_h, param.kernel_w] + kernels_arg = op_def.arg.add() + kernels_arg.name = MaceKeyword.mace_kernel_str + kernels_arg.ints.extend(kernel) + if param.HasField('global_pooling'): + global_pooling_arg = op_def.arg.add() + global_pooling_arg.name = MaceKeyword.mace_global_pooling_str + global_pooling_arg.i = 1 + + def convert_ops(self): + for layer in self._caffe_layers.layer: + caffe_op = 
self._caffe_net.get_op(layer.name) + if caffe_op not in self._skip_ops: + mace_check(layer.type in self._op_converters, + "Mace does not support caffe op type %s yet" + % layer.type) + self._op_converters[layer.type](caffe_op) + + def add_tensor(self, name, shape, data_type, value): + tensor = self._mace_net_def.tensors.add() + tensor.name = name + tensor.dims.extend(list(shape)) + tensor.data_type = data_type + tensor.float_data.extend(value.flat) + + def convert_nop(self, layer): + pass + + def convert_general_op(self, caffe_op): + op = self._mace_net_def.op.add() + op.name = caffe_op.name + op.type = caffe_op.type + op.input.extend(caffe_op.layer.bottom) + op.output.extend(caffe_op.layer.top) + + data_type_arg = op.arg.add() + data_type_arg.name = 'T' + data_type_arg.i = self._option.data_type + + ConverterUtil.add_data_format_arg(op, DataFormat.NCHW) + + return op + + def convert_conv2d(self, caffe_op): + op = self.convert_general_op(caffe_op) + param = caffe_op.layer.convolution_param + is_depthwise = False + if param.HasField(caffe_group_str): + mace_check(param.group == caffe_op.blob[0].shape[1] and + caffe_op.blob[0].shape[0] == 1, + "Mace do not support group convolution yet") + is_depthwise = True + + if is_depthwise: + op.type = MaceOp.DepthwiseConv2d.name + else: + op.type = MaceOp.Conv2D.name + + self.add_stride_pad_kernel_arg(param, op) + # dilation is specific for convolution in caffe + dilations = [1, 1] + if len(param.dilation) > 0: + dilation_arg = op.arg.add() + dilation_arg.name = MaceKeyword.mace_dilations_str + if len(param.dilation) == 1: + dilations = [param.dilation[0], param.dilation[0]] + elif len(param.dilation) == 2: + dilations = [param.dilation[0], param.dilation[1]] + dilation_arg.ints.extend(dilations) + + filter_tensor_name = op.name + '_filter' + filter_data = caffe_op.blobs[0] + self.add_tensor(filter_tensor_name, filter_data.shape, + mace_pb2.DT_FLOAT, filter_data) + op.input.extend([filter_tensor_name]) + + if 
len(caffe_op.blobs) == 2: + bias_tensor_name = op.name + '_bias' + bias_data = caffe_op.blobs[1] + self.add_tensor(bias_tensor_name, bias_data.shape, + mace_pb2.DT_FLOAT, + bias_data) + op.input.extend([bias_tensor_name]) + + def convert_elementwise(self, caffe_op): + op = self.convert_general_op(caffe_op) + param = caffe_op.layer.eltwise_param + + op.type = MaceOp.Eltwise.name + type_arg = op.arg.add() + type_arg.name = MaceKeyword.mace_element_type_str + type_arg.i = self.eltwise_type[param.operation].value + if len(param.coeff) > 0: + coeff_arg = op.arg.add() + coeff_arg.name = 'coeff' + coeff_arg.floats.extend(list(param.coeff)) + + def convert_add(self, caffe_op): + op = self.convert_general_op(caffe_op) + op.type = MaceOp.AddN.name + + def convert_activation(self, caffe_op): + op = self.convert_general_op(caffe_op) + op.type = MaceOp.Activation.name + + type_arg = op.arg.add() + type_arg.name = MaceKeyword.mace_activation_type_str + type_arg.s = self.activation_type[caffe_op.type].name + + if caffe_op.type == 'PReLU': + alpha_tensor_name = caffe_op.name + '_alpha' + alpha_data = caffe_op.blobs[0] + self.add_tensor(alpha_tensor_name, alpha_data.shape, + mace_pb2.DT_FLOAT, alpha_data) + op.input.extend([alpha_tensor_name]) + + def convert_folded_batchnorm(self, caffe_op): + op = self.convert_general_op(caffe_op) + op.type = MaceOp.FoldedBatchNorm.name + + scale_op = None + for consumer in self._caffe_net.get_consumers(caffe_op.layer.top[0]): + if consumer.type == 'Scale': + scale_op = consumer + mace_check(scale_op is not None, "batchnorm is not followed by scale") + self._skip_ops.append(scale_op) + + epsilon_value = caffe_op.layer.batch_norm_param.eps + mace_check(caffe_op.blobs[2][0] != 0, "batchnorm scalar is zero") + mean_value = (1. / caffe_op.blobs[2][0]) * caffe_op.blobs[0] + var_value = (1. 
/ caffe_op.blobs[2][0]) * caffe_op.blobs[1] + gamma_value = scale_op.blobs[0] + beta_value = np.zeros_like(mean_value) + if len(scale_op.blobs) == 2: + beta_value = scale_op.blobs[1] + + scale_value = ( + (1.0 / np.vectorize(math.sqrt)(var_value + epsilon_value)) * + gamma_value).reshape(-1) + offset_value = ((-mean_value * scale_value) + beta_value).reshape(-1) + + input_names = [op.name + '_scale', op.name + '_offset'] + self.add_tensor(input_names[0], scale_value.shape, mace_pb2.DT_FLOAT, + scale_value) + self.add_tensor(input_names[1], offset_value.shape, mace_pb2.DT_FLOAT, + offset_value) + op.input.extend([name for name in input_names]) + op.output[:] = scale_op.layer.top[:] + + def convert_pooling(self, caffe_op): + op = self.convert_general_op(caffe_op) + param = caffe_op.layer.pooling_param + + op.type = MaceOp.Pooling.name + self.add_stride_pad_kernel_arg(param, op) + pooling_type_arg = op.arg.add() + pooling_type_arg.name = MaceKeyword.mace_pooling_type_str + pooling_type_arg.i = self.pooling_type_mode[param.pool].value + + def convert_softmax(self, caffe_op): + self.convert_general_op(caffe_op) + + def convert_concat(self, caffe_op): + op = self.convert_general_op(caffe_op) + param = caffe_op.layer.concat_param + op.type = MaceOp.Concat.name + + axis_arg = op.arg.add() + axis_arg.name = MaceKeyword.mace_axis_str + axis_arg.i = 1 + if param.HasField('axis'): + axis_arg.i = param.axis + elif param.HasField('concat_dim'): + axis_arg.i = param.concat_dim + mace_check(axis_arg.i == 1, "only support concat at channel dimension") + + def convert_slice(self, caffe_op): + op = self.convert_general_op(caffe_op) + op.type = MaceOp.Slice.name + + if caffe_op.layer.HasField('slice_param'): + param = caffe_op.layer.slice_param + mace_check(not param.HasField('axis') or param.axis == 1, + "Mace do not support slice with axis %d" % param.axis) + mace_check(len(param.slice_point) == 0, + "Mace do not support slice with slice_point") + axis_arg = op.arg.add() + 
axis_arg.name = MaceKeyword.mace_axis_str + axis_arg.i = 1 + + def convert_fully_connected(self, caffe_op): + op = self.convert_general_op(caffe_op) + param = caffe_op.layer.inner_product_param + op.type = MaceOp.FullyConnected.name + + mace_check(param.axis == 1 and not param.transpose, + "Do not support non-default axis and transpose") + mace_check(caffe_op.blobs[0].ndim in [2, 4], + "Unexpected fc weigth ndim.") + if caffe_op.blobs[0].ndim == 4: + mace_check(list(caffe_op.blobs[0].shape[:2]) == [1, 1], + "Do not support 4D weight with shape [1, 1, *, *]") + + weight_tensor_name = op.name + '_weight' + weight_data = caffe_op.blobs[0].reshape(param.num_output, -1) + self.add_tensor(weight_tensor_name, weight_data.shape, + mace_pb2.DT_FLOAT, + weight_data) + op.input.extend([weight_tensor_name]) + + if len(caffe_op.blobs) == 2: + bias_tensor_name = op.name + '_bias' + bias_data = caffe_op.blobs[1] + self.add_tensor(bias_tensor_name, bias_data.shape, + mace_pb2.DT_FLOAT, + bias_data) + op.input.extend([bias_tensor_name]) diff --git a/mace/python/tools/converter_tool/shape_inference.py b/mace/python/tools/converter_tool/shape_inference.py new file mode 100644 index 00000000..f6dfda11 --- /dev/null +++ b/mace/python/tools/converter_tool/shape_inference.py @@ -0,0 +1,149 @@ +import math +import numpy as np + +from mace.python.tools.converter_tool.transformer import Transformer +from mace.python.tools.converter_tool.base_converter import DataFormat +from mace.python.tools.converter_tool.base_converter import FilterFormat +from mace.python.tools.converter_tool.base_converter import MaceOp +from mace.python.tools.converter_tool.base_converter import MaceKeyword +from mace.python.tools.converter_tool.base_converter import ConverterUtil +from mace.python.tools.convert_util import mace_check + + +class ShapeInference(object): + """Currently we only use it to infer caffe shape, we use tensorflow engine + to infer tensorflow op shapes, since tensorflow has too many ops.""" + + 
def __init__(self, net, input_nodes): + self._op_shape_inference = { + MaceOp.Conv2D.name: self.infer_shape_conv_pool_shape, + MaceOp.Eltwise.name: self.infer_shape_general, + MaceOp.FoldedBatchNorm.name: self.infer_shape_general, + MaceOp.AddN.name: self.infer_shape_general, + MaceOp.Activation.name: self.infer_shape_general, + MaceOp.Pooling.name: self.infer_shape_conv_pool_shape, + MaceOp.Concat.name: self.infer_shape_concat, + MaceOp.Slice.name: self.infer_shape_slice, + MaceOp.Softmax.name: self.infer_shape_general, + MaceOp.FullyConnected.name: self.infer_shape_fully_connected, + } + + self._net = net + self._output_shape_cache = {} + for input_node in input_nodes: + input_shape = input_node.shape[:] + # transpose input from NCHW to NHWC + Transformer.transpose_shape(input_shape, [0, 3, 1, 2]) + self._output_shape_cache[input_node.name] = input_shape + for tensor in net.tensors: + self._output_shape_cache[tensor.name] = list(tensor.dims) + + def run(self): + for op in self._net.op: + mace_check(op.type in self._op_shape_inference, + "Mace does not support caffe op type %s yet" + % op.type) + self._op_shape_inference[op.type](op) + + def add_output_shape(self, op, shapes): + mace_check(len(op.output) == len(shapes), + "Op %s (%s) output count is different from " + "output shape count" % ( + op.name, op.type)) + for i in xrange(len(shapes)): + output_name = op.output[i] + output_shape = op.output_shape.add() + output_shape.dims.extend(shapes[i]) + self._output_shape_cache[output_name] = shapes[i] + + def infer_shape_general(self, op): + if len(op.input) > 0: + mace_check(op.input[0] in self._output_shape_cache, + "%s does not exist" % op.input[0]) + input_shape = self._output_shape_cache[op.input[0]] + self.add_output_shape(op, [input_shape]) + + def infer_shape_conv_pool_shape(self, op): + input_shape = self._output_shape_cache[op.input[0]] + output_shape = np.zeros_like(input_shape) + if op.type == MaceOp.Pooling: + filter_shape = list( + 
ConverterUtil.get_arg(op, MaceKeyword.mace_kernel_str).ints) + if ConverterUtil.data_format(op) == DataFormat.NCHW: + filter_shape = [input_shape[1], input_shape[1]] + filter_shape + if ConverterUtil.get_arg(op, + MaceKeyword.mace_global_pooling_str) \ + is not None: + filter_shape[2] = input_shape[2] + filter_shape[3] = input_shape[3] + else: # NHWC + filter_shape = filter_shape + [input_shape[1], input_shape[1]] + if ConverterUtil.get_arg(op, + MaceKeyword.mace_global_pooling_str) \ + is not None: + filter_shape[0] = input_shape[1] + filter_shape[1] = input_shape[2] + else: + filter_shape = self._output_shape_cache[op.input[1]] + + paddings = ConverterUtil.get_arg(op, + MaceKeyword.mace_padding_values_str).ints # noqa + strides = ConverterUtil.get_arg(op, MaceKeyword.mace_strides_str).ints + dilations_arg = ConverterUtil.get_arg(op, + MaceKeyword.mace_dilations_str) + if dilations_arg is not None: + dilations = dilations_arg.ints + else: + dilations = [1, 1] + if op.type == MaceOp.Pooling: + round_func = math.ceil + else: + round_func = math.floor + + output_shape[0] = input_shape[0] + if ConverterUtil.data_format(op) == DataFormat.NCHW \ + and ConverterUtil.filter_format(self._net) == FilterFormat.OIHW: # noqa + # filter format: OIHW + output_shape[1] = filter_shape[0] + output_shape[2] = int( + round_func((input_shape[2] + paddings[0] - filter_shape[2] - + (filter_shape[2] - 1) * + (dilations[0] - 1)) / float(strides[0]))) + 1 + output_shape[3] = int( + round_func((input_shape[3] + paddings[1] - filter_shape[3] - + (filter_shape[3] - 1) * + (dilations[1] - 1)) / float(strides[1]))) + 1 + else: + mace_check(False, + "Mace can only infer shape for" + " NCHW input and OIHW filter") + + self.add_output_shape(op, [output_shape]) + + def infer_shape_concat(self, op): + output_shape = self._output_shape_cache[op.input[0]] + axis = ConverterUtil.get_arg(op, MaceKeyword.mace_axis_str).i + for input_node in op.input: + input_shape = self._output_shape_cache[input_node] + 
output_shape[axis] += input_shape[axis] + + self.add_output_shape(op, [output_shape]) + + def infer_shape_slice(self, op): + output_shape = self._output_shape_cache[op.input[0]] + axis = ConverterUtil.get_arg(op, MaceKeyword.mace_axis_str).i + output_shape[axis] /= len(op.output) + output_shapes = [] + for _ in op.output: + output_shapes.append(output_shape) + self.add_output_shape(op, output_shapes) + + def infer_shape_fully_connected(self, op): + input_shape = self._output_shape_cache[op.input[0]] + weight_shape = self._output_shape_cache[op.input[1]] + if ConverterUtil.data_format(op) == DataFormat.NCHW: + output_shape = [input_shape[0], weight_shape[0], 1, 1] + else: + mace_check(False, "format %s is not supported" + % ConverterUtil.data_format(op)) + self.add_output_shape(op, [output_shape]) diff --git a/mace/python/tools/converter_tool/tensorflow_converter.py b/mace/python/tools/converter_tool/tensorflow_converter.py new file mode 100644 index 00000000..c2c5b3d0 --- /dev/null +++ b/mace/python/tools/converter_tool/tensorflow_converter.py @@ -0,0 +1,442 @@ +import math +import numpy as np +import tensorflow as tf + +from mace.proto import mace_pb2 +from mace.python.tools.converter_tool import base_converter +from mace.python.tools.converter_tool.base_converter import PoolingType +from mace.python.tools.converter_tool.base_converter import PaddingMode +from mace.python.tools.converter_tool.base_converter import ActivationType +from mace.python.tools.converter_tool.base_converter import EltwiseType +from mace.python.tools.converter_tool.base_converter import DataFormat +from mace.python.tools.converter_tool.base_converter import FilterFormat +from mace.python.tools.converter_tool.base_converter import MaceOp +from mace.python.tools.converter_tool.base_converter import MaceKeyword +from mace.python.tools.converter_tool.base_converter import ConverterUtil +from mace.python.tools.convert_util import mace_check + +from tensorflow.core.framework import 
tensor_shape_pb2 + +tf_padding_str = 'padding' +tf_strides_str = 'strides' +tf_dilations_str = 'dilations' +tf_data_format_str = 'data_format' +tf_kernel_str = 'ksize' +tf_epsilon_str = 'epsilon' +tf_align_corners = 'align_corners' +tf_block_size = 'block_size' + + +class TensorflowConverter(base_converter.ConverterInterface): + """A class for convert tensorflow frozen model to mace model. + We use tensorflow engine to infer op output shapes, since they are of + too many types.""" + + padding_mode = { + 'VALID': PaddingMode.VALID, + 'SAME': PaddingMode.SAME, + 'FULL': PaddingMode.FULL + } + pooling_type_mode = { + 'AvgPool': PoolingType.AVG, + 'MaxPool': PoolingType.MAX + } + eltwise_type = { + 'Add': EltwiseType.SUM, + 'Sub': EltwiseType.SUB, + 'Mul': EltwiseType.PROD, + 'Div': EltwiseType.DIV, + 'Min': EltwiseType.MIN, + 'Max': EltwiseType.MAX, + 'Neg': EltwiseType.NEG, + 'Abs': EltwiseType.ABS, + 'RealDiv': EltwiseType.DIV, + 'SquaredDifference': EltwiseType.SQR_DIFF, + 'Pow': EltwiseType.POW + } + activation_type = { + 'Relu': ActivationType.RELU, + 'Relu6': ActivationType.RELUX, + 'Tanh': ActivationType.TANH, + 'Sigmoid': ActivationType.SIGMOID + } + + def __init__(self, option, src_model_file): + self._op_converters = { + 'Conv2D': self.convert_conv2d, + 'DepthwiseConv2dNative': self.convert_conv2d, + 'Conv2DBackpropInput': self.convert_conv2d, + 'BiasAdd': self.convert_biasadd, + 'Add': self.convert_add, + 'Sub': self.convert_elementwise, + 'Mul': self.convert_elementwise, + 'Div': self.convert_elementwise, + 'Min': self.convert_elementwise, + 'Max': self.convert_elementwise, + 'Neg': self.convert_elementwise, + 'Abs': self.convert_elementwise, + 'RealDiv': self.convert_elementwise, + 'SquaredDifference': self.convert_elementwise, + 'Pow': self.convert_elementwise, + 'Relu': self.convert_activation, + 'Relu6': self.convert_activation, + 'Tanh': self.convert_activation, + 'Sigmoid': self.convert_activation, + 'FusedBatchNorm': self.convert_fused_batchnorm, + 
'AvgPool': self.convert_pooling, + 'MaxPool': self.convert_pooling, + 'Squeeze': self.convert_identity, + 'Reshape': self.convert_reshape, + 'Shape': self.convert_nop, + 'Softmax': self.convert_softmax, + 'ResizeBilinear': self.convert_resize_bilinear, + 'Placeholder': self.convert_nop, + 'SpaceToBatchND': self.convert_space_batch, + 'BatchToSpaceND': self.convert_space_batch, + 'DepthToSpace': self.convert_space_depth, + 'SpaceToDepth': self.convert_space_depth, + 'Pad': self.convert_pad, + 'ConcatV2': self.convert_concat, + 'Mean': self.convert_mean, + # Const converter_tool should be placed at the end + 'Const': self.convert_tensor, + } + self._option = option + self._mace_net_def = mace_pb2.NetDef() + ConverterUtil.set_filter_format(self._mace_net_def, FilterFormat.HWIO) + tf_graph_def = tf.GraphDef() + with tf.gfile.Open(src_model_file, 'rb') as f: + tf_graph_def.ParseFromString(f.read()) + self.add_shape_info(tf_graph_def) + + with tf.Session() as session: + with session.graph.as_default() as graph: + tf.import_graph_def(tf_graph_def, name='') + self._tf_graph = graph + + self._skip_tensor = set() + + def run(self): + with tf.Session() as session: + self.convert_ops() + + self.replace_input_output_tensor_name() + return self._mace_net_def + + def replace_input_output_tensor_name(self): + for op in self._mace_net_def.op: + for i in xrange(len(op.input)): + if op.input[i][-2:] == ':0': + op_name = op.input[i][:-2] + if op_name in self._option.input_nodes: + op.input[i] = op_name + for i in xrange(len(op.output)): + if op.output[i][-2:] == ':0': + op_name = op.output[i][:-2] + if op_name in self._option.output_nodes: + op.output[i] = op_name + + def add_shape_info(self, tf_graph_def): + for node in tf_graph_def.node: + if node.name in self._option.input_nodes: + del node.attr['shape'].shape.dim[:] + node.attr['shape'].shape.dim.extend([ + tensor_shape_pb2.TensorShapeProto.Dim(size=i) for i in + self._option.input_nodes[node.name].shape + ]) + + @staticmethod + 
def get_scope(tensor_name): + idx = tensor_name.rfind('/') + if idx == -1: + return tensor_name + else: + return tensor_name[:idx] + + def convert_ops(self): + for tf_op in self._tf_graph.get_operations(): + mace_check(tf_op.type in self._op_converters, + "Mace does not support tensorflow op type %s yet" + % tf_op.type) + self._op_converters[tf_op.type](tf_op) + + def convert_tensor(self, tf_op): + output_name = tf_op.outputs[0].name + if output_name not in self._skip_tensor: + tensor = self._mace_net_def.tensors.add() + tensor.name = tf_op.outputs[0].name + tf_tensor = tf_op.outputs[0].eval() + tensor.dims.extend(list(tf_tensor.shape)) + + tf_dt = tf_op.get_attr('dtype') + if tf_dt == tf.float32: + tensor.data_type = mace_pb2.DT_FLOAT + tensor.float_data.extend(tf_tensor.astype(np.float32).flat) + elif tf_dt == tf.int32: + tensor.data_type = mace_pb2.DT_INT32 + tensor.int32_data.extend(tf_tensor.astype(np.int32).flat) + else: + mace_check(False, "Not supported tensor type: %s" % tf_dt.name) + + def add_tensor(self, name, shape, data_type, value): + tensor = self._mace_net_def.tensors.add() + tensor.name = name + tensor.dims.extend(list(shape)) + tensor.data_type = data_type + tensor.float_data.extend(value.flat) + + def convert_nop(self, tf_op): + pass + + def convert_general_op(self, tf_op): + op = self._mace_net_def.op.add() + op.name = tf_op.name + op.type = tf_op.type + op.input.extend([tf_input.name for tf_input in tf_op.inputs]) + op.output.extend([tf_output.name for tf_output in tf_op.outputs]) + for tf_output in tf_op.outputs: + output_shape = op.output_shape.add() + output_shape.dims.extend(tf_output.shape.as_list()) + op.output_type.append(self._option.data_type) + + data_type_arg = op.arg.add() + data_type_arg.name = 'T' + data_type_arg.i = self._option.data_type + + ConverterUtil.add_data_format_arg(op, DataFormat.NHWC) + + return op + + def convert_identity(self, tf_op): + op = self.convert_general_op(tf_op) + op.type = 'Identity' + + def 
convert_conv2d(self, tf_op): + op = self.convert_general_op(tf_op) + if tf_op.type == 'DepthwiseConv2dNative': + op.type = MaceOp.DepthwiseConv2d.name + elif tf_op.type == 'Conv2DBackpropInput': + op.type = MaceOp.Deconv2D.name + else: + op.type = MaceOp.Conv2D.name + + padding_arg = op.arg.add() + padding_arg.name = MaceKeyword.mace_padding_str + padding_arg.i = self.padding_mode[tf_op.get_attr(tf_padding_str)].value + strides_arg = op.arg.add() + strides_arg.name = MaceKeyword.mace_strides_str + strides_arg.ints.extend(tf_op.get_attr(tf_strides_str)[1:3]) + if op.type != MaceOp.Deconv2D.name: + dilation_arg = op.arg.add() + dilation_arg.name = MaceKeyword.mace_dilations_str + dilation_arg.ints.extend(tf_op.get_attr(tf_dilations_str)[1:3]) + + def convert_elementwise(self, tf_op): + op = self.convert_general_op(tf_op) + op.type = MaceOp.Eltwise.name + + type_arg = op.arg.add() + type_arg.name = MaceKeyword.mace_element_type_str + type_arg.i = self.eltwise_type[tf_op.type].value + + def convert_biasadd(self, tf_op): + op = self.convert_general_op(tf_op) + op.type = MaceOp.BiasAdd.name + + def convert_add(self, tf_op): + if len(tf_op.inputs) == 2: + self.convert_elementwise(tf_op) + else: + op = self.convert_general_op(tf_op) + op.type = MaceOp.AddN.name + + def convert_activation(self, tf_op): + op = self.convert_general_op(tf_op) + op.type = MaceOp.Activation.name + + type_arg = op.arg.add() + type_arg.name = MaceKeyword.mace_activation_type_str + type_arg.s = self.activation_type[tf_op.type].name + + if tf_op.type == 'Relu6': + limit_arg = op.arg.add() + limit_arg.name = MaceKeyword.mace_activation_max_limit_str + limit_arg.f = 6.0 + + def convert_fused_batchnorm(self, tf_op): + op = self.convert_general_op(tf_op) + op.type = MaceOp.FoldedBatchNorm.name + + gamma_value = tf_op.inputs[1].eval().astype(np.float32) + beta_value = tf_op.inputs[2].eval().astype(np.float32) + mean_value = tf_op.inputs[3].eval().astype(np.float32) + var_value = 
tf_op.inputs[4].eval().astype(np.float32) + epsilon_value = tf_op.get_attr(tf_epsilon_str) + + scale_name = self.get_scope(tf_op.name) + '/scale:0' + offset_name = self.get_scope(tf_op.name) + '/offset:0' + scale_value = ( + (1.0 / np.vectorize(math.sqrt)( + var_value + epsilon_value)) * gamma_value) + offset_value = (-mean_value * scale_value) + beta_value + self.add_tensor(scale_name, scale_value.shape, mace_pb2.DT_FLOAT, + scale_value) + self.add_tensor(offset_name, offset_value.shape, mace_pb2.DT_FLOAT, + offset_value) + self._skip_tensor.update([inp.name for inp in tf_op.inputs][1:]) + + del op.input[1:] + op.input.extend([scale_name, offset_name]) + del op.output[1:] + del op.output_shape[1:] + del op.output_type[1:] + + def convert_pooling(self, tf_op): + op = self.convert_general_op(tf_op) + op.type = MaceOp.Pooling.name + pooling_type_arg = op.arg.add() + pooling_type_arg.name = MaceKeyword.mace_pooling_type_str + pooling_type_arg.i = self.pooling_type_mode[tf_op.type].value + padding_arg = op.arg.add() + padding_arg.name = MaceKeyword.mace_padding_str + padding_arg.i = self.padding_mode[tf_op.get_attr(tf_padding_str)].value + strides_arg = op.arg.add() + strides_arg.name = MaceKeyword.mace_strides_str + strides_arg.ints.extend(tf_op.get_attr(tf_strides_str)[1:3]) + kernels_arg = op.arg.add() + kernels_arg.name = MaceKeyword.mace_kernel_str + kernels_arg.ints.extend(tf_op.get_attr(tf_kernel_str)[1:3]) + + def convert_softmax(self, tf_op): + op = self.convert_general_op(tf_op) + op.type = MaceOp.Softmax.name + + def convert_resize_bilinear(self, tf_op): + op = self.convert_general_op(tf_op) + op.type = MaceOp.ResizeBilinear.name + del op.input[1:] + + size_arg = op.arg.add() + size_arg.name = MaceKeyword.mace_resize_size_str + size_value = tf_op.inputs[1].eval().astype(np.int32) + size_arg.ints.extend(size_value) + self._skip_tensor.update(tf_op.inputs[1].name) + align_corners_arg = op.arg.add() + align_corners_arg.name = MaceKeyword.mace_align_corners_str 
+ align_corners_arg.i = tf_op.get_attr(tf_align_corners) + + def convert_space_batch(self, tf_op): + print """You might want to try 'flatten_atrous_conv' in + transform graph to turn atrous conv2d into regular conv2d. + This may give you performance benefit on GPU. + (see https://github.com/tensorflow/tensorflow/blob/master/ + tensorflow/tools/graph_transforms/README.md#flatten_atrous_conv) + """ + + op = self.convert_general_op(tf_op) + del op.input[1:] + + size_arg = op.arg.add() + size_arg.name = MaceKeyword.mace_space_batch_block_shape_str + size_value = tf_op.inputs[1].eval().astype(np.int32) + size_arg.ints.extend(size_value) + + crops_or_paddings_arg = op.arg.add() + if op.type == 'BatchToSpaceND': + op.type = MaceOp.BatchToSpaceND.name + crops_or_paddings_arg.name = \ + MaceKeyword.mace_batch_to_space_crops_str + else: + op.type = MaceOp.SpaceToBatchND.name + crops_or_paddings_arg.name = MaceKeyword.mace_paddings_str + crops_or_paddings_value = tf_op.inputs[2].eval().astype(np.int32).flat + crops_or_paddings_arg.ints.extend(crops_or_paddings_value) + + self._skip_tensor.update(tf_op.inputs[1].name) + self._skip_tensor.update(tf_op.inputs[2].name) + + def convert_space_depth(self, tf_op): + op = self.convert_general_op(tf_op) + if op.type == 'SpaceToDepth': + op.type = MaceOp.SpaceToDepth.name + else: + op.type = MaceOp.DepthToSpace.name + + size_arg = op.arg.add() + size_arg.name = MaceKeyword.mace_space_depth_block_size_str + size_arg.i = tf_op.get_attr(tf_block_size) + + def convert_pad(self, tf_op): + op = self.convert_general_op(tf_op) + op.type = MaceOp.Pad.name + del op.input[1:] + + paddings_arg = op.arg.add() + paddings_arg.name = MaceKeyword.mace_paddings_str + paddings_value = tf_op.inputs[1].eval().astype(np.int32).flat + paddings_arg.ints.extend(paddings_value) + self._skip_tensor.update(tf_op.inputs[1].name) + + if len(tf_op.inputs) == 3: + constant_value_arg = op.arg.add() + constant_value_arg.name = MaceKeyword.mace_constant_value_str + 
constant_value = tf_op.inputs[2].eval().astype(np.int32).flat[0] + constant_value_arg.i = constant_value + self._skip_tensor.update(tf_op.inputs[2].name) + + def convert_concat(self, tf_op): + op = self.convert_general_op(tf_op) + op.type = MaceOp.Concat.name + del op.input[-1] + + axis_arg = op.arg.add() + axis_arg.name = MaceKeyword.mace_axis_str + axis = tf_op.inputs[-1].eval().astype(np.int32) + axis_arg.i = axis + + mace_check(axis == 3, "only support concat at channel dimension") + + self._skip_tensor.update(tf_op.inputs[-1].name) + + def convert_reshape(self, tf_op): + op = self.convert_general_op(tf_op) + op.type = MaceOp.Reshape.name + del op.input[1:] + + shape_arg = op.arg.add() + shape_arg.name = MaceKeyword.mace_shape_str + shape_value = [] + if tf_op.inputs[1].op.type == 'Const': + shape_value = list(tf_op.inputs[1].eval().astype(np.int32)) + for i in xrange(len(shape_value)): + if shape_value[i] == -1: + shape_value[i] = 1 + self._skip_tensor.update(tf_op.inputs[-1].name) + elif tf_op.inputs[1].op.type == 'Shape': + shape_value = list(tf_op.inputs[1].op.inputs[0].shape.as_list()) + + shape_arg.ints.extend(shape_value) + + def convert_mean(self, tf_op): + op = self.convert_general_op(tf_op) + del op.input[1:] + + reduce_dims = tf_op.inputs[1].eval() + mace_check(reduce_dims[0] == 1 and reduce_dims[1] == 2, + "Mean only support reduce dim 1, 2") + + op.type = MaceOp.Pooling.name + pooling_type_arg = op.arg.add() + pooling_type_arg.name = MaceKeyword.mace_pooling_type_str + pooling_type_arg.i = PoolingType.AVG.value + padding_arg = op.arg.add() + padding_arg.name = MaceKeyword.mace_padding_str + padding_arg.i = PaddingMode.VALID.value + strides_arg = op.arg.add() + strides_arg.name = MaceKeyword.mace_strides_str + strides_arg.ints.extend([1, 1]) + kernels_arg = op.arg.add() + kernels_arg.name = MaceKeyword.mace_kernel_str + kernels_arg.ints.extend(tf_op.inputs[0].shape.as_list()[1:3]) + + self._skip_tensor.add(tf_op.inputs[1].name) diff --git 
a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py new file mode 100644 index 00000000..adfe3e0c --- /dev/null +++ b/mace/python/tools/converter_tool/transformer.py @@ -0,0 +1,914 @@ +import enum +import numpy as np + +from mace.proto import mace_pb2 +from mace.python.tools.converter_tool import base_converter +from mace.python.tools.converter_tool.base_converter import EltwiseType +from mace.python.tools.converter_tool.base_converter import ActivationType +from mace.python.tools.converter_tool.base_converter import PaddingMode +from mace.python.tools.converter_tool.base_converter import DataFormat +from mace.python.tools.converter_tool.base_converter import FilterFormat +from mace.python.tools.converter_tool.base_converter import MaceOp +from mace.python.tools.converter_tool.base_converter import MaceKeyword +from mace.python.tools.converter_tool.base_converter import ConverterUtil +from mace.python.tools.convert_util import mace_check + +OPENCL_IMAGE_MAX_SIZE = 16384 + + +class OpenCLBufferType(enum.Enum): + CONV2D_FILTER = 0 + IN_OUT_CHANNEL = 1 + ARGUMENT = 2 + IN_OUT_HEIGHT = 3 + IN_OUT_WIDTH = 4 + WINOGRAD_FILTER = 5 + DW_CONV2D_FILTER = 6 + WEIGHT_HEIGHT = 7 + WEIGHT_WIDTH = 8 + + +class Transformer(base_converter.ConverterInterface): + """A class for transform naive mace model to optimized model. + This Transformer should be platform irrelevant. So, do not assume + tensor name has suffix like ':0". 
+ """ + + def __init__(self, option, model): + # DO NOT reorder the following transformers + self._registered_transformers = [ + self.remove_identity_op, + self.transform_global_pooling, + self.fold_softmax, + self.fold_batchnorm, + self.fold_conv_and_bn, # data_format related + self.fold_depthwise_conv_and_bn, # data_format related + self.transform_gpu_winograd, # data_format related + self.transform_add_to_biasadd, + self.fold_biasadd, + self.fold_activation, + self.transpose_filters, + self.transpose_data_format, + self.transform_global_conv_to_fc, + self.transform_buffer_image, + self.sort_by_execution, + ] + + self._option = option + self._model = model + + self._ops = {} + self._consts = {} + self._consumers = {} + self._producer = {} + self._target_data_format = DataFormat.NHWC + + if self._option.device == mace_pb2.CPU: + self._target_data_format = DataFormat.NCHW + + def run(self): + for transformer in self._registered_transformers: + while True: + self.construct_ops_and_consumers() + changed = transformer() + if not changed: + break + + return self._model + + def filter_format(self): + filter_format_value = ConverterUtil.get_arg(self._model, + MaceKeyword.mace_filter_format_str).i # noqa + filter_format = None + if filter_format_value == FilterFormat.HWIO.value: + filter_format = FilterFormat.HWIO + elif filter_format_value == FilterFormat.OIHW.value: + filter_format = FilterFormat.OIHW + elif filter_format_value == FilterFormat.HWOI.value: + filter_format = FilterFormat.HWOI + else: + mace_check(False, "filter format %d not supported" % + filter_format_value) + + return filter_format + + def set_filter_format(self, filter_format): + arg = ConverterUtil.get_arg(self._model, + MaceKeyword.mace_filter_format_str) + arg.i = filter_format.value + + def construct_ops_and_consumers(self): + self._ops.clear() + self._consumers.clear() + self._producer.clear() + for op in self._model.op: + self._ops[op.name] = op + for tensor in self._model.tensors: + 
self._consts[tensor.name] = tensor + for op in self._ops.values(): + for input_tensor in op.input: + if input_tensor not in self._consumers: + self._consumers[input_tensor] = [] + self._consumers[input_tensor].append(op) + + for output_tensor in op.output: + self._producer[output_tensor] = op + for input_node in self._option.input_nodes.values(): + op = mace_pb2.OperatorDef() + op.name = self.normalize_op_name(input_node.name) + op.type = 'Input' + op.output.extend([input_node.name]) + output_shape = op.output_shape.add() + output_shape.dims.extend(input_node.shape) + if self._option.device == mace_pb2.CPU: + self.transpose_shape(output_shape.dims, [0, 3, 1, 2]) + ConverterUtil.add_data_format_arg(op, DataFormat.NCHW) + else: + ConverterUtil.add_data_format_arg(op, DataFormat.NHWC) + self._producer[op.output[0]] = op + + @staticmethod + def replace(obj_list, source, target): + for i in xrange(len(obj_list)): + if obj_list[i] == source: + obj_list[i] = target + + @staticmethod + def transpose_shape(shape, order): + transposed_shape = [] + for i in xrange(len(order)): + transposed_shape.append(shape[order[i]]) + shape[:] = transposed_shape[:] + + @staticmethod + def normalize_op_name(name): + return name.replace(':', '_') + + def consumer_count(self, tensor_name): + return len(self._consumers.get(tensor_name, [])) + + def is_op_output_node(self, op): + output_node_tensor_names = [out for out in + self._option.output_nodes] + for output in op.output: + if output in output_node_tensor_names: + return True + + return False + + def replace_output_node(self, op): + """if it is an output node, change output node to the op before it""" + if self.is_op_output_node(op): + real_output_node = self._producer[op.input[0]] + self.replace(real_output_node.output, op.input[0], op.output[0]) + print("change %s to %s" % (real_output_node.name, op.name)) + + def remove_identity_op(self): + net = self._model + for op in net.op: + if op.type == 'Identity': + print("Remove identity: %s(%s)" 
% (op.name, op.type)) + for consumer_op in self._consumers.get(op.output[0], []): + Transformer.replace(consumer_op.input, op.output[0], + op.input[0]) + self.replace_output_node(op) + net.op.remove(op) + return True + + return False + + def transform_global_pooling(self): + net = self._model + for op in net.op: + if op.type == MaceOp.Pooling.name and \ + ConverterUtil.get_arg(op, + MaceKeyword.mace_global_pooling_str) is not None: # noqa + print("Transform global pooling: %s(%s)" % (op.name, op.type)) + input_shape = self._producer[op.input[0]].output_shape[0].dims + if ConverterUtil.data_format(op) == DataFormat.NHWC: + kernel_shape = input_shape[1:3] + else: + kernel_shape = input_shape[2:4] + ConverterUtil.get_arg(op, + MaceKeyword.mace_kernel_str).ints[:] \ + = kernel_shape[:] + + return False + + def fold_batchnorm(self): + net = self._model + for op in net.op: + if (op.type == MaceOp.Eltwise.name + and ConverterUtil.get_arg( + op, MaceKeyword.mace_element_type_str).i + == EltwiseType.PROD.value) \ + and len(op.input) == 2 \ + and op.input[1] in self._consts \ + and self.consumer_count(op.output[0]) == 1 \ + and not self.is_op_output_node(op): + consumer_op = self._consumers[op.output[0]][0] + if (consumer_op.type == MaceOp.Eltwise.name + and ConverterUtil.get_arg( + consumer_op, MaceKeyword.mace_element_type_str).i + == EltwiseType.SUM.value + or consumer_op.type == MaceOp.BiasAdd.name) \ + and len(consumer_op.input) == 2 \ + and consumer_op.input[1] in self._consts \ + and len(self._consts[consumer_op.input[1]].dims) == 1: + print("Fold batchnorm: %s(%s)" % (op.name, op.type)) + consumer_op.type = MaceOp.FoldedBatchNorm.name + inputs = [op.input[0], op.input[1], consumer_op.input[1]] + consumer_op.input[:] = inputs[:] + + net.op.remove(op) + return True + + return False + + def fold_conv_and_bn(self): + net = self._model + for op in net.op: + if (op.type == MaceOp.Conv2D.name + or op.type == MaceOp.Deconv2D.name) \ + and self.consumer_count(op.output[0]) == 1: + 
consumer_op = self._consumers[op.output[0]][0] + if consumer_op.type == MaceOp.FoldedBatchNorm.name: + print("Fold conv and bn: %s(%s)" % (op.name, op.type)) + filter = self._consts[op.input[1]] + scale = self._consts[consumer_op.input[1]] + idx = 0 + filter_format = self.filter_format() + if filter_format == FilterFormat.HWIO: + for hwi in xrange(filter.dims[0] * filter.dims[1] + * filter.dims[2]): + for o in xrange(filter.dims[3]): + filter.float_data[idx] *= scale.float_data[o] + idx += 1 + elif filter_format == FilterFormat.OIHW: + for o in xrange(filter.dims[0]): + for hwi in xrange(filter.dims[1] * filter.dims[2] + * filter.dims[3]): + filter.float_data[idx] *= scale.float_data[o] + idx += 1 + else: + mace_check(False, "filter format %s not supported" % + filter_format) + + # change BN to BiasAdd + consumer_op.type = MaceOp.BiasAdd.name + del consumer_op.input[1] + + # remove scale tensor + net.tensors.remove(scale) + return True + + return False + + def fold_depthwise_conv_and_bn(self): + net = self._model + for op in net.op: + if op.type == MaceOp.DepthwiseConv2d.name \ + and self.consumer_count(op.output[0]) == 1: + consumer_op = self._consumers[op.output[0]][0] + if consumer_op.type == MaceOp.FoldedBatchNorm.name: + print("Fold depthwise conv and bn: %s(%s)" + % (op.name, op.type)) + filter = self._consts[op.input[1]] + scale = self._consts[consumer_op.input[1]] + idx = 0 + + filter_format = self.filter_format() + if filter_format == FilterFormat.HWIO: + for hw in xrange(filter.dims[0] * filter.dims[1]): + for i in xrange(filter.dims[2]): + for o in xrange(filter.dims[3]): + filter.float_data[idx] *= scale.float_data[ + i * filter.dims[3] + o] + idx += 1 + elif filter_format == FilterFormat.OIHW: + for o in xrange(filter.dims[0]): + for i in xrange(filter.dims[1]): + for hw in xrange(filter.dims[2] + * filter.dims[3]): + filter.float_data[idx] *= scale.float_data[ + i * filter.dims[0] + o] + idx += 1 + else: + mace_check(False, "filter format %s not 
supported" % + filter_format) + + # change BN to BiasAdd + consumer_op.type = MaceOp.BiasAdd.name + del consumer_op.input[1] + + # remove scale tensor + net.tensors.remove(scale) + return True + + return False + + @staticmethod + def sort_feature_map_shape(shape, data_format): + """Return shape in NHWC order""" + batch = shape[0] + if data_format == DataFormat.NHWC: + height = shape[1] + width = shape[2] + channels = shape[3] + else: + height = shape[2] + width = shape[3] + channels = shape[1] + return batch, height, width, channels + + @staticmethod + def sort_filter_shape(filter_shape, filter_format): + """Return filter shape in HWIO order""" + if filter_format == FilterFormat.HWIO: + filter_height = filter_shape[0] + filter_width = filter_shape[1] + in_channels = filter_shape[2] + out_channels = filter_shape[3] + elif filter_format == FilterFormat.OIHW: + filter_height = filter_shape[2] + filter_width = filter_shape[3] + in_channels = filter_shape[1] + out_channels = filter_shape[0] + elif filter_format == FilterFormat.HWOI: + filter_height = filter_shape[0] + filter_width = filter_shape[1] + in_channels = filter_shape[3] + out_channels = filter_shape[2] + else: + mace_check(False, "filter format %s not supported" % filter_format) + return filter_height, filter_width, in_channels, out_channels + + def check_if_gpu_use_winograd_conv(self, op): + if not self._option.winograd_enabled: + return False + if op.type != MaceOp.Conv2D.name: + return False + + filter_shape = self._consts[op.input[1]].dims + output_shape = op.output_shape[0].dims + strides = ConverterUtil.get_arg(op, MaceKeyword.mace_strides_str).ints + dilations_arg = ConverterUtil.get_arg(op, + MaceKeyword.mace_dilations_str) + if dilations_arg is None: + dilations = [1, 1] + else: + dilations = dilations_arg.ints + filter_height, filter_width, in_channels, out_channels = \ + Transformer.sort_filter_shape(filter_shape, self.filter_format()) + batch, out_height, out_width, _ = 
Transformer.sort_feature_map_shape( + output_shape, ConverterUtil.data_format(op)) + + if filter_height != 3 or filter_width != 3 or strides[0] > 1 \ + or strides[1] > 1 or dilations[0] > 1 or dilations[1] > 1: + return False + width = batch * ((out_height + 1) / 2) * ((out_width + 1) / 2) + return (16 * in_channels < OPENCL_IMAGE_MAX_SIZE) and \ + (16 * out_channels < OPENCL_IMAGE_MAX_SIZE) and \ + (width < OPENCL_IMAGE_MAX_SIZE) + + def transform_gpu_winograd(self): + """Only gpu needs winograd transform.""" + net = self._model + filter_format = self.filter_format() + + if self._option.device == mace_pb2.GPU: + for op in net.op: + if op.type == MaceOp.Conv2D.name \ + and self.check_if_gpu_use_winograd_conv(op): + print("Transform gpu winograd %s(%s)" % (op.name, op.type)) + output_shape = op.output_shape[0].dims + filter = self._consts[op.input[1]] + filter_shape = filter.dims + data_format = ConverterUtil.data_format(op) + filter_height, filter_width, in_channels, out_channels = \ + Transformer.sort_filter_shape(filter_shape, + filter_format) + batch, out_height, out_width, _ = \ + Transformer.sort_feature_map_shape(output_shape, + data_format) + + # Input transform + wt_op = net.op.add() + wt_op.name = op.name + '_input_transform' + wt_op.type = MaceOp.WinogradTransform.name + wt_op.input.extend([op.input[0]]) + wt_op.output.extend([wt_op.name]) + wt_output_shape = wt_op.output_shape.add() + wt_output_width = batch * ( + (out_height + 1) / 2) * ((out_width + 1) / 2) + wt_output_shape.dims.extend( + [16, in_channels, wt_output_width, 1]) + + arg = wt_op.arg.add() + arg.name = 'T' + arg.i = self._option.data_type + + if ConverterUtil.get_arg(op, + MaceKeyword.mace_padding_str) \ + is not None: + padding_arg = wt_op.arg.add() + padding_arg.name = MaceKeyword.mace_padding_str + padding_arg.i = ConverterUtil.get_arg(op, + MaceKeyword.mace_padding_str).i # noqa + elif ConverterUtil.get_arg(op, + MaceKeyword.mace_padding_values_str) is not None: # noqa + padding_arg = 
wt_op.arg.add() + padding_arg.name = MaceKeyword.mace_padding_values_str + padding_arg.ints.extend(ConverterUtil.get_arg( + op, MaceKeyword.mace_padding_values_str).ints) + + # MatMul + matmul_op = net.op.add() + matmul_op.name = op.name + '_matmul' + matmul_op.type = MaceOp.MatMul.name + matmul_op.input.extend([op.input[1], wt_op.output[0]]) + matmul_op.output.extend([matmul_op.name]) + matmul_output_shape = matmul_op.output_shape.add() + matmul_output_shape.dims.extend( + [16, out_channels, wt_output_width, 1]) + + arg = matmul_op.arg.add() + arg.name = 'T' + arg.i = self._option.data_type + arg = matmul_op.arg.add() + arg.name = MaceKeyword.mace_winograd_filter_transformed + arg.i = 1 + + # Inverse transform + iwt_op = net.op.add() + iwt_op.name = op.name + '_inverse_transform' + iwt_op.type = MaceOp.WinogradInverseTransform.name + iwt_op.input.extend([matmul_op.output[0]]) + # biasadd + if len(op.input) >= 3: + iwt_op.input.extend([op.input[2]]) + iwt_op.output.extend(op.output) + iwt_output_shape = iwt_op.output_shape.add() + iwt_output_shape.dims.extend(op.output_shape[0].dims) + + arg = iwt_op.arg.add() + arg.name = 'T' + arg.i = self._option.data_type + batch_arg = iwt_op.arg.add() + batch_arg.name = 'batch' + batch_arg.i = batch + height_arg = iwt_op.arg.add() + height_arg.name = 'height' + height_arg.i = out_height + width_arg = iwt_op.arg.add() + width_arg.name = 'width' + width_arg.i = out_width + ConverterUtil.add_data_format_arg(iwt_op, data_format) + + filter_data = np.array(filter.float_data).reshape( + filter.dims) + + weight_tensor_value = filter_data + if filter_format == FilterFormat.HWIO: + weight_tensor_value = filter_data.transpose(3, 2, 0, 1) + elif filter_format == FilterFormat.HWOI: + weight_tensor_value = filter_data.transpose(2, 3, 0, 1) + filter.float_data[:] = weight_tensor_value.flat[:] + filter.dims[:] = weight_tensor_value.shape[:] + + net.op.remove(op) + + return False + + def transform_add_to_biasadd(self): + net = self._model + 
for op in net.op: + if op.type == 'Add' \ + and len(op.input) == 2 \ + and op.input[1] in self._consts \ + and len(self._consts[op.input[1]].dims) == 1: + print("Transform add to biasadd: %s(%s)" % (op.name, op.type)) + op.type = MaceOp.BiasAdd.name + return True + + return False + + def fold_biasadd(self): + net = self._model + for op in net.op: + if ((op.type == MaceOp.Conv2D.name + or op.type == MaceOp.Deconv2D.name + or op.type == MaceOp.DepthwiseConv2d.name + or op.type == MaceOp.FullyConnected.name + or op.type == MaceOp.WinogradInverseTransform.name) + and len(op.input) == 2) \ + and len(self._consumers.get(op.output[0], [])) == 1: + consumer_op = self._consumers[op.output[0]][0] + if consumer_op.type == MaceOp.BiasAdd.name: + print("Fold biasadd: %s(%s)" % (op.name, op.type)) + op.name = consumer_op.name + op.input.append(consumer_op.input[1]) + op.output[0] = consumer_op.output[0] + net.op.remove(consumer_op) + return True + + return False + + def fold_activation(self): + net = self._model + for op in net.op: + if (op.type == MaceOp.Conv2D.name + or op.type == MaceOp.Deconv2D.name + or op.type == MaceOp.DepthwiseConv2d.name + or op.type == MaceOp.FullyConnected.name + or op.type == MaceOp.FoldedBatchNorm.name + or op.type == MaceOp.WinogradInverseTransform.name) \ + and len(self._consumers.get(op.output[0], [])) == 1: + consumer_op = self._consumers[op.output[0]][0] + if consumer_op.type == MaceOp.Activation.name \ + and ConverterUtil.get_arg( + consumer_op, + MaceKeyword.mace_activation_type_str).s != 'PRELU': + print("Fold activation: %s(%s)" % (op.name, op.type)) + op.name = consumer_op.name + op.output[0] = consumer_op.output[0] + for arg in consumer_op.arg: + if arg.name == MaceKeyword.mace_activation_type_str \ + or arg.name == MaceKeyword.mace_activation_max_limit_str: # noqa + op.arg.extend([arg]) + + net.op.remove(consumer_op) + return True + + return False + + def transpose_data_format(self): + net = self._model + + for op in net.op: + # 
transpose args + if op.type == MaceOp.Pad.name: + for arg in op.arg: + if arg.name == MaceKeyword.mace_paddings_str and len( + arg.ints) == 4: + if ConverterUtil.data_format(op) == DataFormat.NHWC \ + and self._target_data_format == DataFormat.NCHW: # noqa + print("Transpose pad args: %s(%s)" + % (op.name, op.type)) + self.transpose_shape(arg.ints, [0, 3, 1, 2]) + elif ConverterUtil.data_format(op) == DataFormat.NCHW \ + and self._target_data_format == DataFormat.NHWC: # noqa + print("Transpose pad args: %s(%s)" + % (op.name, op.type)) + self.transpose_shape(arg.ints, [0, 2, 3, 1]) + elif op.type == MaceOp.Concat.name or op.type == MaceOp.Slice.name: + for arg in op.arg: + if arg.name == MaceKeyword.mace_axis_str: + if ConverterUtil.data_format(op) == DataFormat.NHWC \ + and self._target_data_format == DataFormat.NCHW: # noqa + print("Transpose slice args: %s(%s)" + % (op.name, op.type)) + mace_check(arg.i == 3, + 'only support concat at ' + 'channel dimension') + arg.i = 1 + elif ConverterUtil.data_format(op) == DataFormat.NCHW \ + and self._target_data_format == DataFormat.NHWC: # noqa + print("Transpose slice args: %s(%s)" + % (op.name, op.type)) + mace_check(arg.i == 1, + "only support concat at " + "channel dimension") + arg.i = 3 + + # transpose op output shape + data_format = ConverterUtil.data_format(op) + if data_format is not None \ + and data_format != self._target_data_format: + print("Transpose output shapes: %s(%s)" % (op.name, op.type)) + if self._target_data_format == DataFormat.NHWC: # NCHW -> NHWC + for output_shape in op.output_shape: + if len(output_shape.dims) == 4: + self.transpose_shape(output_shape.dims, + [0, 2, 3, 1]) + else: # NHWC -> NCHW + for output_shape in op.output_shape: + if len(output_shape.dims) == 4: + self.transpose_shape(output_shape.dims, + [0, 3, 1, 2]) + ConverterUtil.get_arg(op, + MaceKeyword.mace_data_format_str).i = \ + self._target_data_format.value + + # transpose input/output + if self._target_data_format == 
DataFormat.NCHW: + print("Transpose input/output to NCHW") + for input_node in self._option.input_nodes.values(): + new_input_name = MaceKeyword.mace_input_node_name \ + + '_' + input_node.name + op = net.op.add() + op.name = self.normalize_op_name(input_node.name) + op.type = MaceOp.Transpose.name + op.input.extend([new_input_name]) + op.output.extend([input_node.name]) + output_shape = op.output_shape.add() + output_shape.dims.extend(input_node.shape) + + dims_arg = op.arg.add() + dims_arg.name = MaceKeyword.mace_dims_str + dims_arg.ints.extend([0, 3, 1, 2]) + + arg = op.arg.add() + arg.name = 'T' + arg.i = self._option.data_type + + for output_node in self._option.output_nodes.values(): + output_name = MaceKeyword.mace_output_node_name \ + + '_' + output_node.name + op = self._model.op.add() + op.name = self.normalize_op_name(output_name) + op.type = MaceOp.Transpose.name + op.input.extend([output_node.name]) + op.output.extend([output_name]) + output_shape = op.output_shape.add() + output_shape.dims.extend( + self._producer[output_node.name].output_shape[0].dims) + self.transpose_shape(output_shape.dims, [0, 2, 3, 1]) + + dims_arg = op.arg.add() + dims_arg.name = MaceKeyword.mace_dims_str + dims_arg.ints.extend([0, 2, 3, 1]) + + arg = op.arg.add() + arg.name = 'T' + arg.i = self._option.data_type + + return False + + def transpose_filters(self): + net = self._model + filter_format = self.filter_format() + + # TODO(liyin/liuqi): remove this if-condition after combine cpu/gpu + if self._option.device == mace_pb2.CPU: + print("Transpose filters to OIHW") + # transpose filter to OIHW/MIHW for tensorflow (HWIO/HWIM) + if filter_format == FilterFormat.HWIO: + for op in net.op: + if op.type == MaceOp.Conv2D.name \ + or op.type == MaceOp.Deconv2D.name \ + or op.type == MaceOp.DepthwiseConv2d.name: + if ConverterUtil.get_arg(op, + MaceKeyword.mace_winograd_filter_transformed) is None: # noqa + filter = self._consts[op.input[1]] + filter_data = 
np.array(filter.float_data).reshape( + filter.dims) + filter_data = filter_data.transpose(3, 2, 0, 1) + filter.float_data[:] = filter_data.flat + filter.dims[:] = filter_data.shape + self.set_filter_format(FilterFormat.OIHW) + + elif self._option.device == mace_pb2.GPU: + # TODO(liyin/liuqi): remove this whole logic after combine cpu/gpu + print("Transpose filters to HWOI/HWIM") + for op in net.op: + if op.type == MaceOp.Conv2D.name \ + or op.type == MaceOp.Deconv2D.name \ + or op.type == MaceOp.DepthwiseConv2d.name: + filter = self._consts[op.input[1]] + filter_data = np.array(filter.float_data).reshape( + filter.dims) + # transpose filter to HWOI/HWIM for + # tensorflow and caffe (OIHW/MIHW) + if filter_format == FilterFormat.HWIO \ + and (op.type == MaceOp.Conv2D.name + or op.type == MaceOp.Deconv2D.name): + filter_data = filter_data.transpose(0, 1, 3, 2) + filter.float_data[:] = filter_data.flat + filter.dims[:] = filter_data.shape + elif filter_format == FilterFormat.OIHW: + if op.type == MaceOp.Conv2D.name \ + or op.type == MaceOp.Deconv2D.name: + filter_data = filter_data.transpose(2, 3, 0, 1) + filter.float_data[:] = filter_data.flat + filter.dims[:] = filter_data.shape + elif op.type == MaceOp.DepthwiseConv2d.name: + filter_data = filter_data.transpose(2, 3, 1, 0) + filter.float_data[:] = filter_data.flat + filter.dims[:] = filter_data.shape + + if op.type == MaceOp.FullyConnected.name: + weight = self._consts[op.input[1]] + input_shape = list(self._producer[op.input[0]] + .output_shape[0].dims) + weight_shape = [weight.dims[0]] + input_shape[1:] + # OCHW -> OHWC + weight_data = np.array(weight.float_data).reshape( + weight_shape) + weight_data = weight_data.transpose(0, 2, 3, 1) + weight.float_data[:] = weight_data.flat + self.set_filter_format(FilterFormat.HWOI) + + return False + + def buffer_to_image(self, op, input_idx, input_type): + net = self._model + input_name = op.input[input_idx] + op_def = net.op.add() + op_def.name = input_name.replace(':', 
'_') + "_b2i" + output_name = op_def.name + op_def.type = MaceKeyword.mace_buffer_to_image + op_def.input.extend([input_name]) + op_def.output.extend([output_name]) + + arg = op_def.arg.add() + arg.name = MaceKeyword.mace_buffer_type + arg.i = input_type.value + arg = op_def.arg.add() + arg.name = MaceKeyword.mace_mode + arg.i = 0 + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self._option.data_type + + op.input[input_idx] = output_name + + def transform_buffer_image(self): + if self._option.device != mace_pb2.GPU: + return False + + print("Transform buffer to image") + + net = self._model + for op in net.op: + if op.type == MaceOp.Conv2D.name \ + or op.type == MaceOp.Deconv2D.name: + self.buffer_to_image(op, 1, OpenCLBufferType.CONV2D_FILTER) + if len(op.input) >= 3: + self.buffer_to_image(op, 2, OpenCLBufferType.ARGUMENT) + elif op.type == MaceOp.DepthwiseConv2d.name: + self.buffer_to_image(op, 1, OpenCLBufferType.DW_CONV2D_FILTER) + if len(op.input) >= 3: + self.buffer_to_image(op, 2, OpenCLBufferType.ARGUMENT) + elif op.type == MaceOp.BiasAdd.name: + self.buffer_to_image(op, 1, OpenCLBufferType.ARGUMENT) + elif op.type == MaceOp.FoldedBatchNorm.name: + self.buffer_to_image(op, 1, OpenCLBufferType.ARGUMENT) + self.buffer_to_image(op, 2, OpenCLBufferType.ARGUMENT) + if len(op.input) >= 4: + self.buffer_to_image(op, 3, OpenCLBufferType.ARGUMENT) + elif op.type == MaceOp.MatMul.name and \ + ConverterUtil.get_arg(op, + MaceKeyword.mace_winograd_filter_transformed) is not None: # noqa + self.buffer_to_image(op, 0, OpenCLBufferType.WINOGRAD_FILTER) + elif op.type == MaceOp.WinogradInverseTransform.name \ + and len(op.input) >= 2: + self.buffer_to_image(op, 1, OpenCLBufferType.ARGUMENT) + elif op.type == MaceOp.FullyConnected.name: + self.buffer_to_image(op, 1, OpenCLBufferType.WEIGHT_WIDTH) + if len(op.input) >= 3: + self.buffer_to_image(op, 2, OpenCLBufferType.ARGUMENT) + elif op.type == MaceOp.Activation.name: + if ConverterUtil.get_arg(op, + 
MaceKeyword.mace_activation_type_str).s == ActivationType.PRELU.name: # noqa + self.buffer_to_image(op, 1, OpenCLBufferType.ARGUMENT) + + for input_node in self._option.input_nodes.values(): + new_input_name = MaceKeyword.mace_input_node_name \ + + '_' + input_node.name + op_def = self._model.op.add() + + op_def.name = self.normalize_op_name(input_node.name) + op_def.type = MaceKeyword.mace_buffer_to_image + op_def.input.extend([new_input_name]) + op_def.output.extend([input_node.name]) + output_shape = op_def.output_shape.add() + output_shape.dims.extend(input_node.shape) + + arg = op_def.arg.add() + arg.name = MaceKeyword.mace_buffer_type + arg.i = OpenCLBufferType.IN_OUT_CHANNEL.value + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self._option.data_type + + for output_node in self._option.output_nodes.values(): + output_name = MaceKeyword.mace_output_node_name \ + + '_' + output_node.name + op_def = self._model.op.add() + op_def.name = self.normalize_op_name(output_name) + op_def.type = MaceKeyword.mace_image_to_buffer + op_def.input.extend([output_node.name]) + op_def.output.extend([output_name]) + output_shape = op_def.output_shape.add() + output_shape.dims.extend(output_node.shape) + + arg = op_def.arg.add() + arg.name = MaceKeyword.mace_buffer_type + arg.i = OpenCLBufferType.IN_OUT_CHANNEL.value + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self._option.data_type + + return False + + def fold_softmax(self): + changed = False + net = self._model + for op in net.op: + if op.type == MaceOp.Softmax.name: + print("Fold softmax: %s(%s)" % (op.name, op.type)) + if self.consumer_count(op.output[0]) == 1: + consumer = self._consumers[op.output[0]][0] + if consumer.type == MaceOp.Reshape.name: + shape = ConverterUtil.get_arg(consumer, + MaceKeyword.mace_shape_str).ints # noqa + del op.output_shape[0].dims[:] + op.output_shape[0].dims.extend(shape) + self.replace_output_node(consumer) + net.op.remove(consumer) + changed = True + + producer = 
self._producer[op.input[0]] + if producer.type == MaceOp.Reshape.name: + op.input[0] = producer.input[0] + self.replace_output_node(producer) + net.op.remove(producer) + changed = True + + if len(op.output_shape[0].dims) < 4: + shape = ([1, 1, 1, 1] + list(op.output_shape[0].dims))[-4:] + op.output_shape[0].dims[:] = shape[:] + changed = True + + if changed: + return True + + return False + + def transform_global_conv_to_fc(self): + """Transform global conv to fc should be placed after transposing + input/output and filter""" + if self._option.device == mace_pb2.GPU: + return False + + net = self._model + for op in net.op: + if op.type == MaceOp.Conv2D.name: + producer = self._producer[op.input[0]] + input_shape = producer.output_shape[0].dims + batch, height, width, channels = self.sort_feature_map_shape( + input_shape, ConverterUtil.data_format(producer)) + filter = self._consts[op.input[1]] + filter_shape = filter.dims + filter_height, filter_width, in_channels, out_channels = \ + self.sort_filter_shape(filter_shape, self.filter_format()) + zero_padding = True + padding_arg = ConverterUtil.get_arg(op, + MaceKeyword.mace_padding_str) # noqa + if padding_arg is not None: + if padding_arg.i != PaddingMode.VALID.value: + zero_padding = False + else: + padding_value_arg = ConverterUtil.get_arg(op, + MaceKeyword.mace_padding_values_str) # noqa + if padding_value_arg is not None: + if not all(v == 0 for v in padding_value_arg.ints): + zero_padding = False + + if height == filter_height and width == filter_width \ + and zero_padding: + print("transform global conv to fc %s(%s)" + % (op.name, op.type)) + op.type = MaceOp.FullyConnected.name + filter.dims[:] = [out_channels, + in_channels * filter_width + * filter_height][:] + + def sort_dfs(self, op, visited, sorted_nodes): + visited.update([op.name]) + if len(op.input) > 0: + for input_tensor in op.input: + producer_op = self._producer.get(input_tensor, None) + if producer_op is None: + pass + elif producer_op.name not 
in visited: + self.sort_dfs(producer_op, visited, sorted_nodes) + sorted_nodes.append(op) + + def sort_by_execution(self): + print("Sort by execution") + net = self._model + visited = set() + sorted_nodes = [] + + for output_node in self._option.output_nodes: + output_tensor = MaceKeyword.mace_output_node_name \ + + '_' + output_node + mace_check(output_tensor in self._producer, + "output_tensor %s not existed in model" % output_tensor) + self.sort_dfs(self._producer[output_tensor], visited, sorted_nodes) + + del net.op[:] + net.op.extend(sorted_nodes) + return False diff --git a/mace/python/tools/memory_optimizer.py b/mace/python/tools/memory_optimizer.py index 38e3a36b..9b0947e0 100644 --- a/mace/python/tools/memory_optimizer.py +++ b/mace/python/tools/memory_optimizer.py @@ -129,7 +129,7 @@ class MemoryOptimizer(object): self.idle_mem.remove(mem_id) if mem_id == -1: - mem_id = self.total_mem_count + mem_id = self.mem_id_base() + self.total_mem_count self.total_mem_count += 1 self.mem_block[mem_id] = op_mem_block @@ -147,10 +147,13 @@ class MemoryOptimizer(object): self.add_net_mem_blocks() - print('total op: %d', len(self.net_def.op)) - print('origin mem: %d, optimized mem: %d', + print("total op: %d" % len(self.net_def.op)) + print("origin mem: %d, optimized mem: %d" % ( self.get_total_origin_mem_size(), - self.get_total_optimized_mem_size()) + self.get_total_optimized_mem_size())) + + def mem_id_base(self): + return 0 class GPUMemoryOptimizer(MemoryOptimizer): @@ -189,6 +192,9 @@ class GPUMemoryOptimizer(MemoryOptimizer): block.x = self.mem_block[mem][0] block.y = self.mem_block[mem][1] + def mem_id_base(self): + return 20000 + def optimize_gpu_memory(net_def): mem_optimizer = GPUMemoryOptimizer(net_def) diff --git a/mace/python/tools/source_converter_lib.py b/mace/python/tools/source_converter_lib.py index 9e57d02b..8b08c11d 100644 --- a/mace/python/tools/source_converter_lib.py +++ b/mace/python/tools/source_converter_lib.py @@ -84,11 +84,20 @@ def 
obfuscate_name(net_def): op.output[i] = in_out_map[op.output[i]] +def normalize_op_name(op_name): + idx = op_name.rfind(':') + if idx == -1: + return op_name + else: + return op_name[:idx] + + def rename_tensor(net_def): tensor_map = {} for t in net_def.tensors: if t.name not in tensor_map: - tensor_map[t.name] = "_" + t.name[:-2].replace("/", "_") + tensor_map[t.name] = "_" + normalize_op_name(t.name).replace("/", + "_") t.name = tensor_map[t.name] for op in net_def.op: for i in range(len(op.input)): @@ -118,6 +127,8 @@ class TensorInfo: elif t.data_type == mace_pb2.DT_UINT8: self.data = bytearray( np.array(t.int32_data).astype(np.uint8).tolist()) + else: + raise Exception('Tensor data type %s not supported' % t.data_type) def stringfy(value): diff --git a/mace/python/tools/tf_converter_lib.py b/mace/python/tools/tf_converter_lib.py deleted file mode 100644 index 8647b246..00000000 --- a/mace/python/tools/tf_converter_lib.py +++ /dev/null @@ -1,1522 +0,0 @@ -# Copyright 2018 Xiaomi, Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from mace.proto import mace_pb2 -import tensorflow as tf -import numpy as np -import math -import copy -from tensorflow import gfile -from mace.python.tools import memory_optimizer -from tensorflow.core.framework import graph_pb2 -from tensorflow.core.framework import tensor_shape_pb2 - -padding_mode = {'VALID': 0, 'SAME': 1, 'FULL': 2} -pooling_type_mode = {'AvgPool': 1, 'MaxPool': 2} - -# the order should be the same as -# eltwise type's in mace/kernels/eltwise.h -# and also cwise type's in mace/kernels/cwise.h -# cuz these math ops should have compatible with "EltWise" and "CWise" -math_type_mode = { - 'ADD': 0, - 'SUB': 1, - 'MUL': 2, - 'DIV': 3, - 'MIN': 4, - 'MAX': 5, - 'NEG': 6, - 'ABS': 7, - 'SQR_DIFF': 8, - 'POW': 9, -} - -buffer_type_map = { - 'CONV2D_FILTER': 0, - 'IN_OUT_CHANNEL': 1, - 'ARGUMENT': 2, - 'IN_OUT_HEIGHT': 3, - 'IN_OUT_WIDTH': 4, - 'WINOGRAD_FILTER': 5, - 'DW_CONV2D_FILTER': 6, -} - -data_type_map = {'DT_HALF': mace_pb2.DT_HALF, 'DT_FLOAT': mace_pb2.DT_FLOAT} - -activation_name_map = { - 'Relu': 'RELU', - 'Sigmoid': 'SIGMOID', - 'Tanh': 'TANH', - 'Relu6': 'RELUX' -} - -BATCH_NORM_ORDER = ["Add", "Rsqrt", "Mul", "Mul", "Mul", "Sub", "Add"] - -MACE_INPUT_NODE_NAME = "mace_input_node" -MACE_OUTPUT_NODE_NAME = "mace_output_node" - -OPENCL_IMAGE_MAX_SIZE = 16384 - - -def get_input_tensor(op, index): - input_tensor = op.inputs[index] - if input_tensor.op.type == 'Reshape': - input_tensor = get_input_tensor(input_tensor.op, 0) - return input_tensor - - -class TFConverter(object): - def __init__(self, graph, tf_ops, net_def, dt, device, winograd): - self.graph = graph - self.net_def = net_def - self.tf_ops = tf_ops - self.dt = dt - self.device = device - self.winograd = winograd - self.tf_graph = {} - self.tf_parents = {} - self.resolved_ops = {} - self.unused_tensor = set() - self.transpose_filter_tensor = {} - self.reshape_tensor = {} - self.ops = {} - - for op in tf_ops: - self.ops[op.name] = op - - for op in tf_ops: - 
self.resolved_ops[op.name] = 0 - for input in op.inputs: - input_name = input.name[:-2] - if input_name not in self.tf_graph: - self.tf_graph[input_name] = [] - self.tf_graph[input_name].append(op) - if op.name not in self.tf_parents: - self.tf_parents[op.name] = [] - self.tf_parents[op.name].append(self.ops[input_name]) - - def add_buffer_to_image(self, input_name, input_type): - output_name = input_name[:-2] + "_b2i" + input_name[-2:] - op_def = self.net_def.op.add() - op_def.name = output_name[:-2] - op_def.type = 'BufferToImage' - op_def.input.extend([input_name]) - op_def.output.extend([output_name]) - - arg = op_def.arg.add() - arg.name = 'buffer_type' - arg.i = buffer_type_map[input_type] - arg = op_def.arg.add() - arg.name = 'mode' - arg.i = 0 - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - return output_name - - def add_image_to_buffer(self, input_name, input_type): - output_name = input_name[:-2] + "_i2b" + input_name[-2:] - op_def = self.net_def.op.add() - op_def.name = output_name[:-2] - op_def.type = 'ImageToBuffer' - op_def.input.extend([input_name]) - op_def.output.extend([output_name]) - - arg = op_def.arg.add() - arg.name = 'buffer_type' - arg.i = buffer_type_map[input_type] - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - return output_name - - def add_gpu_input_transform(self, names): - for name in names: - new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0" - op_def = self.net_def.op.add() - op_def.name = name - op_def.type = 'BufferToImage' - op_def.input.extend([new_input_name]) - op_def.output.extend([name + ':0']) - - epsilon_arg = op_def.arg.add() - epsilon_arg.name = 'buffer_type' - epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL'] - - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - - self.add_output_shape(self.ops[name].outputs, op_def) - - def add_cpu_input_transform(self, names): - for name in names: - new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0" - op_def = self.net_def.op.add() 
- op_def.name = name - op_def.type = 'Transpose' - op_def.input.extend([new_input_name]) - op_def.output.extend([name + ':0']) - - dims_arg = op_def.arg.add() - dims_arg.name = 'dims' - dims_arg.ints.extend([0, 3, 1, 2]) - - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - - self.add_output_shape(self.ops[name].outputs, op_def) - - def add_gpu_output_transform(self, names): - for name in names: - output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0" - op_def = self.net_def.op.add() - op_def.name = output_name[:-2] - op_def.type = 'ImageToBuffer' - op_def.input.extend([name + ':0']) - op_def.output.extend([output_name]) - - epsilon_arg = op_def.arg.add() - epsilon_arg.name = 'buffer_type' - epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL'] - - def add_cpu_output_transform(self, names): - for name in names: - output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0" - op_def = self.net_def.op.add() - op_def.name = output_name[:-2] - op_def.type = 'Transpose' - op_def.input.extend([name + ':0']) - op_def.output.extend([output_name]) - - dims_arg = op_def.arg.add() - dims_arg.name = 'dims' - dims_arg.ints.extend([0, 2, 3, 1]) - - output_shapes = [] - for output in self.ops[name].outputs: - old_shape = output.shape.as_list() - # NCHW -> NHWC - if len(old_shape) == 2: - new_shape = [old_shape[0], 1, 1, old_shape[1]] - else: - new_shape = [old_shape[0], old_shape[2], - old_shape[3], old_shape[1]] - output_shape = mace_pb2.OutputShape() - output_shape.dims.extend(new_shape) - output_shapes.append(output_shape) - op_def.output_shape.extend(output_shapes) - - def add_output_shape(self, outputs, op): - output_shapes = [] - for output in outputs: - old_shape = [] - if isinstance(output, list): - old_shape = output - elif isinstance(output, tf.Tensor): - if output.shape.num_elements() is not None: - old_shape = output.shape.as_list() - else: - raise ValueError('output type not supported: ', type(output)) - if len(old_shape) == 2: - old_shape = [old_shape[0], 
old_shape[1], 1, 1] - - if self.device == 'cpu': # NHWC -> NCHW - old_shape = [old_shape[0], old_shape[3], - old_shape[1], old_shape[2]] - output_shape = mace_pb2.OutputShape() - output_shape.dims.extend(old_shape) - output_shapes.append(output_shape) - op.output_shape.extend(output_shapes) - - def add_tensor(self, name, shape, tf_dt, value): - tensor = self.net_def.tensors.add() - tensor.name = name - - shape = list(shape) - tensor.dims.extend(shape) - - if tf_dt == tf.float32: - tensor.data_type = mace_pb2.DT_FLOAT - tensor.float_data.extend(value.flat) - elif tf_dt == tf.int32: - tensor.data_type = mace_pb2.DT_INT32 - tensor.int32_data.extend(value.flat) - else: - raise Exception("Not supported tensor type: " + tf_dt.name) - - def convert_reshape(self, op): - input_tensor = get_input_tensor(op, 0) - shape_tensor = get_input_tensor(op, 1) - shape_value = shape_tensor.eval().astype(np.int32) - self.unused_tensor.add(shape_tensor.name) - self.reshape_tensor[input_tensor.name] = shape_value - self.resolved_ops[op.name] = 1 - - def convert_tensor(self, op): - output_name = op.outputs[0].name - if output_name not in self.unused_tensor: - tensor = self.net_def.tensors.add() - tf_tensor = op.outputs[0].eval() - if output_name in self.transpose_filter_tensor: - tf_tensor = tf_tensor.transpose( - self.transpose_filter_tensor[output_name]) - if output_name in self.reshape_tensor: - tf_tensor = tf_tensor.reshape(self.reshape_tensor[output_name]) - tensor.name = op.outputs[0].name - - shape = list(tf_tensor.shape) - tensor.dims.extend(shape) - - tf_dt = op.get_attr('dtype') - if tf_dt == tf.float32: - tensor.data_type = mace_pb2.DT_FLOAT - tensor.float_data.extend(tf_tensor.astype(np.float32).flat) - elif tf_dt == tf.int32: - tensor.data_type = mace_pb2.DT_INT32 - tensor.int32_data.extend(tf_tensor.astype(np.int32).flat) - else: - raise Exception("Not supported tensor type: " + tf_dt.name) - self.resolved_ops[op.name] = 1 - - def check_winograd_conv(self, op): - filter_shape 
= get_input_tensor(op, 1).shape.as_list() - strides = op.get_attr('strides')[1:3] - output_shape = op.outputs[0].shape.as_list() - if len(output_shape) == 0 or output_shape[0] is None: - return False - width = output_shape[0] * ((output_shape[1] + 1) / 2) * (( - output_shape[2] + 1) / 2) - if self.winograd and op.type != 'DepthwiseConv2dNative' and \ - filter_shape[0] == 3 and \ - (filter_shape[0] == filter_shape[1]) and \ - (strides[0] == 1) and (strides[0] == strides[1]): - if self.device == 'gpu': - return (16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \ - (16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \ - (width < OPENCL_IMAGE_MAX_SIZE) - elif self.device == 'cpu': - return filter_shape[2] >= 8 and filter_shape[3] >= 8 - return False - - def convert_winograd_conv_gpu(self, op): - filter_tensor = get_input_tensor(op, 1) - filter_shape = filter_tensor.shape.as_list() - output_shape = op.outputs[0].shape.as_list() - - self.transpose_filter_tensor[filter_tensor.name] = (3, 2, 0, 1) - filter_name = self.add_buffer_to_image(op.inputs[1].name, - "WINOGRAD_FILTER") - - # Input transform - wt_op = mace_pb2.OperatorDef() - arg = wt_op.arg.add() - arg.name = 'T' - arg.i = self.dt - padding_arg = wt_op.arg.add() - padding_arg.name = 'padding' - padding_arg.i = padding_mode[op.get_attr('padding')] - wt_op.name = op.name + '_input_transform' - wt_op.type = 'WinogradTransform' - wt_op.input.extend([op.inputs[0].name]) - wt_output_name = wt_op.name + ":0" - wt_op.output.extend([wt_output_name]) - wt_output_shape = mace_pb2.OutputShape() - wt_output_width = output_shape[0] * ((output_shape[1] + 1) / 2) * (( - output_shape[2] + 1) / 2) - wt_output_shape.dims.extend([16, filter_shape[2], wt_output_width, 1]) - wt_op.output_shape.extend([wt_output_shape]) - - # MatMul - matmul_op = mace_pb2.OperatorDef() - arg = matmul_op.arg.add() - arg.name = 'T' - arg.i = self.dt - matmul_op.name = op.name + '_matmul' - matmul_op.type = 'MatMul' - matmul_op.input.extend([filter_name, 
wt_output_name]) - matmul_output_name = matmul_op.name + ":0" - matmul_op.output.extend([matmul_output_name]) - matmul_output_shape = mace_pb2.OutputShape() - matmul_output_shape.dims.extend( - [16, filter_shape[3], wt_output_width, 1]) - matmul_op.output_shape.extend([matmul_output_shape]) - - # Inverse transform - iwt_op = mace_pb2.OperatorDef() - arg = iwt_op.arg.add() - arg.name = 'T' - arg.i = self.dt - batch_arg = iwt_op.arg.add() - batch_arg.name = 'batch' - batch_arg.i = output_shape[0] - height_arg = iwt_op.arg.add() - height_arg.name = 'height' - height_arg.i = output_shape[1] - width_arg = iwt_op.arg.add() - width_arg.name = 'width' - width_arg.i = output_shape[2] - iwt_op.name = op.name + '_inverse_transform' - iwt_op.type = 'WinogradInverseTransform' - iwt_op.input.extend([matmul_output_name]) - - final_op = op - self.resolved_ops[op.name] = 1 - - if len(self.tf_graph[op.name] - ) == 1 and self.tf_graph[op.name][0].type == 'BiasAdd': - bias_add_op = self.tf_graph[op.name][0] - output_name = self.add_buffer_to_image( - get_input_tensor(bias_add_op, 1).name, "ARGUMENT") - iwt_op.input.extend([output_name]) - final_op = bias_add_op - self.resolved_ops[bias_add_op.name] = 1 - - if len(self.tf_graph[final_op.name]) == 1 and \ - self.tf_graph[final_op.name][0].type in activation_name_map: - activation_op = self.tf_graph[final_op.name][0] - fused_act_arg = iwt_op.arg.add() - fused_act_arg.name = 'activation' - fused_act_arg.s = activation_name_map[activation_op.type] - if activation_op.type == 'Relu6': - max_limit_arg = iwt_op.arg.add() - max_limit_arg.name = 'max_limit' - max_limit_arg.f = 6 - final_op = activation_op - self.resolved_ops[activation_op.name] = 1 - - iwt_op.output.extend([output.name for output in final_op.outputs]) - self.add_output_shape(final_op.outputs, iwt_op) - self.net_def.op.extend([wt_op, matmul_op, iwt_op]) - - def convert_conv_winograd_filter_cpu(self, op, op_def): - weight_tensor = get_input_tensor(op, 1) - weight_tensor_value = 
weight_tensor.eval().astype(np.float32) - input_shape = get_input_tensor(op, 0).shape.as_list() - output_channels = weight_tensor_value.shape[3] - input_channels = weight_tensor_value.shape[2] - # HWIO -> OIHW - weight_tensor_value = weight_tensor_value.transpose(3, 2, 0, 1) - if input_shape[1] > 16 and input_shape[2] > 16: - G = np.array([ - [1.0, 0.0, 0.0], - [-2.0 / 9, -2.0 / 9, -2.0 / 9], - [-2.0 / 9, 2.0 / 9, -2.0 / 9], - [1.0 / 90, 1.0 / 45, 2.0 / 45], - [1.0 / 90, -1.0 / 45, 2.0 / 45], - [1.0 / 45, 1.0 / 90, 1.0 / 180], - [1.0 / 45, -1.0 / 90, 1.0 / 180], - [0.0, 0.0, 1.0] - ], dtype=np.float32) - new_shape = [64, output_channels, input_channels] # TOC - else: - G = np.array([ - [1.0, 0.0, 0.0], - [0.5, 0.5, 0.5], - [0.5, -0.5, 0.5], - [0.0, 0.0, 1.0], - ], dtype=np.float32) - new_shape = [16, output_channels, input_channels] # TOC - new_weight_value = G.dot(weight_tensor_value).dot(G.T) # [t, O, I, t] - new_weight_value = new_weight_value.transpose(0, 3, 1, 2) - - new_weight_value = new_weight_value.reshape(new_shape) - new_tensor_name = weight_tensor.name[:-2] + '/winograd_transformed:0' - self.add_tensor(new_tensor_name, new_shape, - tf.float32, new_weight_value) - - winograd_transformed_arg = op_def.arg.add() - winograd_transformed_arg.name = 'is_filter_transformed' - winograd_transformed_arg.i = 1 - - self.unused_tensor.add(weight_tensor.name) - op_def.input.extend([op.inputs[0].name]) - op_def.input.extend([new_tensor_name]) - - def convert_conv2d(self, op): - use_winograd = False - if self.device == 'cpu': - use_winograd = self.check_winograd_conv(op) - - op_def = mace_pb2.OperatorDef() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - op_def.name = op.name - if op.type == 'DepthwiseConv2dNative': - op_def.type = 'DepthwiseConv2d' - else: - op_def.type = op.type - - if self.device == 'cpu' and not use_winograd: - self.transpose_filter_tensor[get_input_tensor( - op, 1).name] = (3, 2, 0, 1) - elif op.type == 'Conv2D': - 
self.transpose_filter_tensor[get_input_tensor( - op, 1).name] = (0, 1, 3, 2) - if self.device == 'gpu': - op_def.input.extend([op.inputs[0].name]) - if op_def.type == 'DepthwiseConv2d': - buffer_type = "DW_CONV2D_FILTER" - else: - buffer_type = "CONV2D_FILTER" - output_name = self.add_buffer_to_image( - get_input_tensor(op, 1).name, buffer_type) - op_def.input.extend([output_name]) - elif self.device == 'cpu' and use_winograd: - self.convert_conv_winograd_filter_cpu(op, op_def) - else: - op_def.input.extend( - [get_input_tensor(op, i).name for i in range(len(op.inputs))]) - - padding_arg = op_def.arg.add() - padding_arg.name = 'padding' - padding_arg.i = padding_mode[op.get_attr('padding')] - strides_arg = op_def.arg.add() - strides_arg.name = 'strides' - strides_arg.ints.extend(op.get_attr('strides')[1:3]) - data_format_arg = op_def.arg.add() - data_format_arg.name = 'data_format' - if self.device == 'cpu': - data_format_arg.s = 'NCHW' - else: - data_format_arg.s = 'NHWC' - final_op = op - self.resolved_ops[op.name] = 1 - - if len(self.tf_graph.get(op.name, [])) == 1 and \ - self.tf_graph[op.name][0].type == 'BiasAdd' or \ - (len(self.tf_graph[op.name]) == 1 and - self.tf_graph[op.name][0].type == 'Add' and - len(self.tf_graph[op.name][0].inputs) == 2 and - len(self.graph.get_tensor_by_name( - self.tf_graph[op.name][0].inputs[1].name).shape) == 1): - bias_add_op = self.tf_graph[op.name][0] - if self.device == 'gpu': - output_name = self.add_buffer_to_image( - get_input_tensor(bias_add_op, 1).name, "ARGUMENT") - op_def.input.extend([output_name]) - else: - op_def.input.extend([get_input_tensor(bias_add_op, 1).name]) - final_op = bias_add_op - self.resolved_ops[bias_add_op.name] = 1 - - if len(self.tf_graph.get(final_op.name, [])) == 1 and \ - self.tf_graph[final_op.name][0].type in activation_name_map: - activation_op = self.tf_graph[final_op.name][0] - fused_act_arg = op_def.arg.add() - fused_act_arg.name = 'activation' - fused_act_arg.s = 
activation_name_map[activation_op.type] - if activation_op.type == 'Relu6': - max_limit_arg = op_def.arg.add() - max_limit_arg.name = 'max_limit' - max_limit_arg.f = 6 - final_op = activation_op - self.resolved_ops[activation_op.name] = 1 - - op_def.output.extend([output.name for output in final_op.outputs]) - self.add_output_shape(final_op.outputs, op_def) - self.net_def.op.extend([op_def]) - - def convert_deconv2d(self, op): - op_def = mace_pb2.OperatorDef() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - op_def.name = op.name - op_def.type = 'Deconv2D' - - out_shape_value = None - if len(op.inputs) == 2: - out_shape_value = op.get_attr('output_shape') - if self.device == 'cpu': - self.transpose_filter_tensor[get_input_tensor( - op, 1).name] = (3, 2, 0, 1) - else: - self.transpose_filter_tensor[get_input_tensor( - op, 1).name] = (0, 1, 3, 2) - if self.device == 'gpu': - op_def.input.extend([op.inputs[0].name]) - buffer_type = "CONV2D_FILTER" - output_name = self.add_buffer_to_image( - get_input_tensor(op, 1).name, buffer_type) - op_def.input.extend([output_name]) - else: - op_def.input.extend( - [get_input_tensor(op, i).name - for i in range(len(op.inputs))]) - elif len(op.inputs) == 3: - out_shape_value = \ - get_input_tensor(op, 0).eval().astype(np.int32).flat - self.unused_tensor.add(op.inputs[0].name) - if self.device == 'cpu': - self.transpose_filter_tensor[get_input_tensor( - op, 1).name] = (2, 3, 0, 1) - else: - self.transpose_filter_tensor[get_input_tensor( - op, 1).name] = (0, 1, 2, 3) - if self.device == 'gpu': - op_def.input.extend([op.inputs[2].name]) - buffer_type = "CONV2D_FILTER" - output_name = self.add_buffer_to_image( - get_input_tensor(op, 1).name, buffer_type) - op_def.input.extend([output_name]) - else: - op_def.input.extend([op.inputs[2].name]) - op_def.input.extend([op.inputs[1].name]) - else: - raise Exception('Too many inputs. 
Op: %s, type: %s' % (op.name, - op.type)) - if out_shape_value is not None: - out_shape_arg = op_def.arg.add() - out_shape_arg.name = 'output_shape' - out_shape_arg.ints.extend(out_shape_value) - padding_arg = op_def.arg.add() - padding_arg.name = 'padding' - padding_arg.i = padding_mode[op.get_attr('padding')] - strides_arg = op_def.arg.add() - strides_arg.name = 'strides' - strides_arg.ints.extend(op.get_attr('strides')[1:3]) - data_format_arg = op_def.arg.add() - data_format_arg.name = 'data_format' - if self.device == 'cpu': - data_format_arg.s = 'NCHW' - else: - data_format_arg.s = 'NHWC' - final_op = op - self.resolved_ops[op.name] = 1 - - if len(self.tf_graph.get(op.name, [])) == 1 and \ - self.tf_graph[op.name][0].type == 'BiasAdd': - bias_add_op = self.tf_graph[op.name][0] - if self.device == 'gpu': - output_name = self.add_buffer_to_image( - get_input_tensor(bias_add_op, 1).name, "ARGUMENT") - op_def.input.extend([output_name]) - else: - op_def.input.extend([get_input_tensor(bias_add_op, 1).name]) - final_op = bias_add_op - self.resolved_ops[bias_add_op.name] = 1 - - if len(self.tf_graph.get(final_op.name, [])) == 1 and \ - self.tf_graph[final_op.name][0].type in activation_name_map: - activation_op = self.tf_graph[final_op.name][0] - fused_act_arg = op_def.arg.add() - fused_act_arg.name = 'activation' - fused_act_arg.s = activation_name_map[activation_op.type] - if activation_op.type == 'Relu6': - max_limit_arg = op_def.arg.add() - max_limit_arg.name = 'max_limit' - max_limit_arg.f = 6 - final_op = activation_op - self.resolved_ops[activation_op.name] = 1 - - op_def.output.extend([output.name for output in final_op.outputs]) - self.add_output_shape(final_op.outputs, op_def) - self.net_def.op.extend([op_def]) - - def check_conv_to_fc(self, op): - if self.device != 'cpu' or op.type != "Conv2D": - return False - filter_shape = get_input_tensor(op, 1).shape.as_list() - input_shape = get_input_tensor(op, 0).shape.as_list() - return input_shape[1] == 
filter_shape[0] \ - and input_shape[2] == filter_shape[1] \ - and (op.get_attr('padding') == 'VALID' or filter_shape[0] == 1 - and filter_shape[1] == 1) - - def convert_global_conv_to_fc(self, op): - op_def = mace_pb2.OperatorDef() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - op_def.name = op.name - op_def.type = 'FC' - self.transpose_filter_tensor[get_input_tensor(op, 1).name] = \ - (3, 2, 0, 1) - filter_shape = get_input_tensor(op, 1).shape.as_list() - self.reshape_tensor[get_input_tensor(op, 1).name] = \ - [filter_shape[3], - filter_shape[2] * filter_shape[1] * filter_shape[0], 1, 1] - op_def.input.extend( - [get_input_tensor(op, i).name for i in range(len(op.inputs))]) - - data_format_arg = op_def.arg.add() - data_format_arg.name = 'data_format' - data_format_arg.s = 'NCHW' - final_op = op - self.resolved_ops[op.name] = 1 - - if len(self.tf_graph.get(op.name, [])) == 1 and \ - self.tf_graph[op.name][0].type == 'BiasAdd': - bias_add_op = self.tf_graph[op.name][0] - if self.device == 'gpu': - output_name = self.add_buffer_to_image( - get_input_tensor(bias_add_op, 1).name, "ARGUMENT") - op_def.input.extend([output_name]) - else: - op_def.input.extend([get_input_tensor(bias_add_op, 1).name]) - final_op = bias_add_op - self.resolved_ops[bias_add_op.name] = 1 - - if len(self.tf_graph.get(final_op.name, [])) == 1 and \ - self.tf_graph[final_op.name][0].type in activation_name_map: - activation_op = self.tf_graph[final_op.name][0] - fused_act_arg = op_def.arg.add() - fused_act_arg.name = 'activation' - fused_act_arg.s = activation_name_map[activation_op.type] - if activation_op.type == 'Relu6': - max_limit_arg = op_def.arg.add() - max_limit_arg.name = 'max_limit' - max_limit_arg.f = 6 - final_op = activation_op - self.resolved_ops[activation_op.name] = 1 - - op_def.output.extend([output.name for output in final_op.outputs]) - self.add_output_shape(final_op.outputs, op_def) - self.net_def.op.extend([op_def]) - - def convert_fused_batchnorm(self, op): - 
op_def = mace_pb2.OperatorDef() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - data_format_arg = op_def.arg.add() - data_format_arg.name = 'data_format' - if self.device == 'cpu': - data_format_arg.s = 'NCHW' - else: - data_format_arg.s = 'NHWC' - op_def.name = op.name - op_def.type = 'FoldedBatchNorm' - - gamma_tensor = get_input_tensor(op, 1) - for i in range(1, 5): - input_tensor = get_input_tensor(op, i) - assert input_tensor.shape == gamma_tensor.shape - self.unused_tensor.add(input_tensor.name) - - gamma_value = get_input_tensor(op, 1).eval().astype(np.float32) - beta_value = get_input_tensor(op, 2).eval().astype(np.float32) - mean_value = get_input_tensor(op, 3).eval().astype(np.float32) - var_value = get_input_tensor(op, 4).eval().astype(np.float32) - epsilon_value = op.get_attr('epsilon') - - scale_value = ((1.0 / np.vectorize(math.sqrt) - (var_value + epsilon_value)) * gamma_value) - offset_value = (-mean_value * scale_value) + beta_value - idx = gamma_tensor.name.rfind('/') - name_prefix = gamma_tensor.name[:idx] + '/' - input_names = [name_prefix + 'scale:0', name_prefix + 'offset:0'] - self.add_tensor(input_names[0], gamma_value.shape, gamma_tensor.dtype, - scale_value) - self.add_tensor(input_names[1], gamma_value.shape, gamma_tensor.dtype, - offset_value) - - op_def.input.extend([op.inputs[0].name]) - if self.device == 'gpu': - for name in input_names: - output_name = self.add_buffer_to_image(name, "ARGUMENT") - op_def.input.extend([output_name]) - else: - op_def.input.extend([name for name in input_names]) - - self.resolved_ops[op.name] = 1 - final_op = op - - if len(self.tf_graph[op.name]) == 1 \ - and self.tf_graph[op.name][0].type in activation_name_map: - activation_op = self.tf_graph[op.name][0] - fused_act_arg = op_def.arg.add() - fused_act_arg.name = 'activation' - fused_act_arg.s = activation_name_map[activation_op.type] - if activation_op.type == 'Relu6': - max_limit_arg = op_def.arg.add() - max_limit_arg.name = 'max_limit' - 
max_limit_arg.f = 6 - final_op = activation_op - self.resolved_ops[activation_op.name] = 1 - - op_def.output.extend([final_op.outputs[0].name]) - self.add_output_shape([final_op.outputs[0]], op_def) - - self.net_def.op.extend([op_def]) - - def convert_batchnorm(self, op): - op_def = mace_pb2.OperatorDef() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - data_format_arg = op_def.arg.add() - data_format_arg.name = 'data_format' - if self.device == 'cpu': - data_format_arg.s = 'NCHW' - else: - data_format_arg.s = 'NHWC' - op_def.name = op.name - op_def.type = 'FoldedBatchNorm' - - add_op = self.tf_graph[op.name][0] - scale_tensor = get_input_tensor(op, 1) - offset_tensor = get_input_tensor(add_op, 1) - input_names = [scale_tensor.name, offset_tensor.name] - - op_def.input.extend([op.inputs[0].name]) - if self.device == 'gpu': - for name in input_names: - output_name = self.add_buffer_to_image(name, "ARGUMENT") - op_def.input.extend([output_name]) - else: - op_def.input.extend([name for name in input_names]) - - self.resolved_ops[op.name] = 1 - self.resolved_ops[add_op.name] = 1 - final_op = add_op - - if len(self.tf_graph[op.name]) == 1 \ - and self.tf_graph[op.name][0].type in activation_name_map: - activation_op = self.tf_graph[op.name][0] - fused_act_arg = op_def.arg.add() - fused_act_arg.name = 'activation' - fused_act_arg.s = activation_name_map[activation_op.type] - if activation_op.type == 'Relu6': - max_limit_arg = op_def.arg.add() - max_limit_arg.name = 'max_limit' - max_limit_arg.f = 6 - final_op = activation_op - self.resolved_ops[activation_op.name] = 1 - - op_def.output.extend([final_op.outputs[0].name]) - self.add_output_shape([final_op.outputs[0]], op_def) - self.net_def.op.extend([op_def]) - - def convert_pooling(self, op): - op_def = self.net_def.op.add() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - op_def.name = op.name - op_def.type = 'Pooling' - op_def.input.extend([input.name for input in op.inputs]) - 
op_def.output.extend([output.name for output in op.outputs]) - self.add_output_shape(op.outputs, op_def) - pooling_type_arg = op_def.arg.add() - pooling_type_arg.name = 'pooling_type' - pooling_type_arg.i = pooling_type_mode[op.type] - padding_arg = op_def.arg.add() - padding_arg.name = 'padding' - padding_arg.i = padding_mode[op.get_attr('padding')] - strides_arg = op_def.arg.add() - strides_arg.name = 'strides' - strides_arg.ints.extend(op.get_attr('strides')[1:3]) - kernels_arg = op_def.arg.add() - kernels_arg.name = 'kernels' - kernels_arg.ints.extend(op.get_attr('ksize')[1:3]) - data_format_arg = op_def.arg.add() - data_format_arg.name = 'data_format' - if self.device == 'cpu': - data_format_arg.s = 'NCHW' - else: - data_format_arg.s = 'NHWC' - self.resolved_ops[op.name] = 1 - - def convert_global_avg_pooling(self, op): - op_def = self.net_def.op.add() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - op_def.name = op.name - op_def.type = 'Pooling' - op_def.input.extend([op.inputs[0].name]) - op_def.output.extend([output.name for output in op.outputs]) - self.add_output_shape(op.outputs, op_def) - pooling_type_arg = op_def.arg.add() - pooling_type_arg.name = 'pooling_type' - pooling_type_arg.i = pooling_type_mode['AvgPool'] - padding_arg = op_def.arg.add() - padding_arg.name = 'padding' - padding_arg.i = padding_mode['VALID'] - strides_arg = op_def.arg.add() - strides_arg.name = 'strides' - strides_arg.ints.extend([1, 1]) - kernels_arg = op_def.arg.add() - kernels_arg.name = 'kernels' - kernels_arg.ints.extend(op.inputs[0].shape.as_list()[1:3]) - data_format_arg = op_def.arg.add() - data_format_arg.name = 'data_format' - if self.device == 'cpu': - data_format_arg.s = 'NCHW' - else: - data_format_arg.s = 'NHWC' - self.resolved_ops[op.name] = 1 - - def convert_activation(self, op): - op_def = self.net_def.op.add() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - op_def.name = op.name - op_def.type = 'Activation' - activation_arg = 
op_def.arg.add() - activation_arg.name = 'activation' - activation_arg.s = activation_name_map[op.type] - op_def.input.extend([input.name for input in op.inputs]) - op_def.output.extend([output.name for output in op.outputs]) - self.add_output_shape(op.outputs, op_def) - self.resolved_ops[op.name] = 1 - - def convert_relu6(self, op): - op_def = self.net_def.op.add() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - op_def.name = op.name - op_def.type = 'Activation' - op_def.input.extend([input.name for input in op.inputs]) - op_def.output.extend([output.name for output in op.outputs]) - self.add_output_shape(op.outputs, op_def) - activation_arg = op_def.arg.add() - activation_arg.name = 'activation' - activation_arg.s = "RELUX" - max_limit_arg = op_def.arg.add() - max_limit_arg.name = 'max_limit' - max_limit_arg.f = 6 - self.resolved_ops[op.name] = 1 - - def convert_add(self, op): - op_def = self.net_def.op.add() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - op_def.name = op.name - op_def.type = "AddN" - op_def.input.extend([input.name for input in op.inputs]) - op_def.output.extend([output.name for output in op.outputs]) - self.add_output_shape(op.outputs, op_def) - self.resolved_ops[op.name] = 1 - - def convert_concat(self, op): - op_def = self.net_def.op.add() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - op_def.name = op.name - op_def.type = "Concat" - op_def.input.extend([input.name for input in op.inputs[:-1]]) - op_def.output.extend([output.name for output in op.outputs]) - axis_arg = op_def.arg.add() - axis_arg.name = 'axis' - axis = get_input_tensor(op, len(op.inputs) - 1).eval().astype(np.int32) - if self.device == 'cpu' and axis == 3: - axis = 1 - axis_arg.i = axis - self.add_output_shape(op.outputs, op_def) - self.resolved_ops[op.name] = 1 - self.unused_tensor.add(get_input_tensor(op, len(op.inputs) - 1).name) - - def convert_resize_bilinear(self, op): - op_def = self.net_def.op.add() - arg = op_def.arg.add() 
- arg.name = 'T' - arg.i = self.dt - op_def.name = op.name - op_def.type = "ResizeBilinear" - op_def.input.extend([op.inputs[0].name]) - op_def.output.extend([output.name for output in op.outputs]) - size_arg = op_def.arg.add() - size_arg.name = 'size' - size_arg.ints.extend( - get_input_tensor(op, 1).eval().astype(np.int32).flat) - size_arg = op_def.arg.add() - size_arg.name = 'align_corners' - size_arg.i = op.get_attr('align_corners') - self.add_output_shape(op.outputs, op_def) - self.resolved_ops[op.name] = 1 - self.unused_tensor.add(get_input_tensor(op, 1).name) - - def convert_eltwise(self, op, math_type): - op_def = self.net_def.op.add() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - op_def.name = op.name - op_def.type = "Eltwise" - if len(op.inputs) == 2: - input_tensor0 = get_input_tensor(op, 0) - input_tensor1 = get_input_tensor(op, 1) - - x_value = None - if np.asarray(input_tensor1.shape).size == 0: - x_value = input_tensor1.eval() - op_def.input.extend([op.inputs[0].name]) - self.unused_tensor.add(input_tensor1.name) - elif np.asarray(input_tensor0.shape).size == 0: - x_value = input_tensor0.eval() - op_def.input.extend([op.inputs[1].name]) - self.unused_tensor.add(input_tensor0.name) - else: - if np.asarray(input_tensor0.shape).size == 1 \ - and input_tensor0.op.type == 'Const': - if self.device == 'gpu': - output_name = self.add_buffer_to_image( - input_tensor0.name, "ARGUMENT") - op_def.input.extend([output_name]) - else: - op_def.input.extend([input_tensor0.name]) - if np.asarray(input_tensor1.shape).size == 1 \ - and input_tensor1.op.type == 'Const': - if self.device == 'gpu': - output_name = self.add_buffer_to_image( - input_tensor1.name, "ARGUMENT") - op_def.input.extend([output_name]) - else: - op_def.input.extend([input_tensor1.name]) - if x_value is not None: - x_arg = op_def.arg.add() - x_arg.name = 'x' - x_arg.f = x_value - else: - op_def.input.extend([input.name for input in op.inputs]) - type_arg = op_def.arg.add() - 
type_arg.name = 'type' - type_arg.i = math_type_mode[math_type] - op_def.output.extend([output.name for output in op.outputs]) - self.add_output_shape(op.outputs, op_def) - self.resolved_ops[op.name] = 1 - - def convert_depth_to_space(self, op, d2s): - op_def = self.net_def.op.add() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - op_def.name = op.name - op_def.type = op.type - op_def.input.extend([op.inputs[0].name]) - op_def.output.extend([output.name for output in op.outputs]) - size_arg = op_def.arg.add() - size_arg.name = 'block_size' - size_arg.i = op.get_attr('block_size') - self.add_output_shape(op.outputs, op_def) - self.resolved_ops[op.name] = 1 - - def convert_bias_add(self, op): - op_def = mace_pb2.OperatorDef() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - op_def.name = op.name - op_def.type = "BiasAdd" - op_def.input.extend([op.inputs[0].name]) - if self.device == 'gpu': - output_name = self.add_buffer_to_image( - get_input_tensor(op, 1).name, "ARGUMENT") - op_def.input.extend([output_name]) - else: - op_def.input.extend([get_input_tensor(op, 1).name]) - op_def.output.extend([output.name for output in op.outputs]) - self.add_output_shape(op.outputs, op_def) - self.net_def.op.extend([op_def]) - self.resolved_ops[op.name] = 1 - - def convert_space_to_batch(self, op, b2s): - op_def = self.net_def.op.add() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - op_def.name = op.name - op_def.type = op.type - op_def.input.extend([op.inputs[0].name]) - op_def.output.extend([output.name for output in op.outputs]) - size_arg = op_def.arg.add() - size_arg.name = 'block_shape' - size_arg.ints.extend( - get_input_tensor(op, 1).eval().astype(np.int32).flat) - size_arg = op_def.arg.add() - if b2s: - size_arg.name = 'crops' - else: - size_arg.name = 'paddings' - size_arg.ints.extend( - get_input_tensor(op, 2).eval().astype(np.int32).flat) - self.add_output_shape(op.outputs, op_def) - self.resolved_ops[op.name] = 1 - 
self.unused_tensor.add(get_input_tensor(op, 1).name) - self.unused_tensor.add(get_input_tensor(op, 2).name) - - def is_atrous_conv2d(self, op): - return op.type == 'SpaceToBatchND' and \ - len(self.tf_graph[op.name]) == 1 and \ - (self.tf_graph[op.name][0].type == 'Conv2D' - or self.tf_graph[op.name][0].type == 'DepthwiseConv2dNative') - - def convert_atrous_conv2d(self, op): - op_def = mace_pb2.OperatorDef() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - conv_op = self.tf_graph[op.name][0] - op_def.name = conv_op.name - if conv_op.type == 'DepthwiseConv2dNative': - op_def.type = 'DepthwiseConv2d' - else: - op_def.type = conv_op.type - - if self.device == 'gpu': - op_def.input.extend([op.inputs[0].name]) - if op_def.type == 'DepthwiseConv2d': - buffer_type = "DW_CONV2D_FILTER" - else: - self.transpose_filter_tensor[get_input_tensor( - conv_op, 1).name] = (0, 1, 3, 2) - buffer_type = "CONV2D_FILTER" - output_name = self.add_buffer_to_image( - get_input_tensor(conv_op, 1).name, buffer_type) - op_def.input.extend([output_name]) - else: - self.transpose_filter_tensor[get_input_tensor( - conv_op, 1).name] = (3, 2, 0, 1) - op_def.input.extend([get_input_tensor(op, 0).name]) - op_def.input.extend([get_input_tensor(conv_op, 1).name]) - - dilation_arg = op_def.arg.add() - dilation_arg.name = 'dilations' - dilation_arg.ints.extend( - get_input_tensor(op, 1).eval().astype(np.int32).flat) - padding_arg = op_def.arg.add() - padding_arg.name = 'padding' - padding_values = get_input_tensor(op, 2).eval().astype(np.int32).flat - if len(padding_values) > 0 and padding_values[0] > 0: - padding_arg.i = padding_mode['SAME'] - else: - padding_arg.i = padding_mode['VALID'] - self.unused_tensor.add(get_input_tensor(op, 1).name) - self.unused_tensor.add(get_input_tensor(op, 2).name) - - strides_arg = op_def.arg.add() - strides_arg.name = 'strides' - strides_arg.ints.extend([1, 1]) - data_format_arg = op_def.arg.add() - data_format_arg.name = 'data_format' - if self.device == 
'cpu': - data_format_arg.s = 'NCHW' - else: - data_format_arg.s = 'NHWC' - final_op = conv_op - self.resolved_ops[op.name] = 1 - self.resolved_ops[conv_op.name] = 1 - - if len(self.tf_graph[final_op.name] - ) == 1 and self.tf_graph[final_op.name][0].type == 'BiasAdd': - bias_add_op = self.tf_graph[final_op.name][0] - if self.device == 'gpu': - output_name = self.add_buffer_to_image( - get_input_tensor(bias_add_op, 1).name, "ARGUMENT") - op_def.input.extend([output_name]) - else: - op_def.input.extend([get_input_tensor(bias_add_op, 1).name]) - final_op = bias_add_op - self.resolved_ops[bias_add_op.name] = 1 - - if len(self.tf_graph[final_op.name]) == 1 and \ - self.tf_graph[final_op.name][0].type == 'BatchToSpaceND': - final_op = self.tf_graph[final_op.name][0] - self.resolved_ops[final_op.name] = 1 - self.unused_tensor.add(get_input_tensor(final_op, 1).name) - self.unused_tensor.add(get_input_tensor(final_op, 2).name) - else: - raise Exception('Convert atrous conv error: no BatchToSpaceND op') - - if len(self.tf_graph[final_op.name]) == 1 and \ - self.tf_graph[final_op.name][0].type == 'Relu': - relu_op = self.tf_graph[final_op.name][0] - fused_relu_arg = op_def.arg.add() - fused_relu_arg.name = 'activation' - fused_relu_arg.s = "RELU" - final_op = relu_op - self.resolved_ops[relu_op.name] = 1 - - op_def.output.extend([output.name for output in final_op.outputs]) - self.add_output_shape(final_op.outputs, op_def) - self.net_def.op.extend([op_def]) - - def is_softmax(self, op): - return op.type == 'Softmax' and \ - len(self.tf_parents[op.name]) == 1 and \ - self.tf_parents[op.name][0].type == 'Reshape' and \ - len(self.tf_graph[op.name]) == 1 and \ - self.tf_graph[op.name][0].type == 'Reshape' - - def convert_softmax(self, softmax_op): - op_def = self.net_def.op.add() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - - # deal with first Reshape op - parent_reshape_op = self.tf_parents[softmax_op.name][0] - 
self.unused_tensor.add(get_input_tensor(parent_reshape_op, 1).name) - self.resolved_ops[parent_reshape_op.name] = 1 - - # FIXME: hardcode for inception_v3 - # remove squeeze if exist - squeeze_op = self.tf_parents[parent_reshape_op.name][0] - if squeeze_op.type == 'Squeeze': - op_def.input.extend([squeeze_op.inputs[0].name]) - self.resolved_ops[squeeze_op.name] = 1 - # remove shape if exist - children_ops = self.tf_graph[squeeze_op.name] - print children_ops - if len(children_ops) > 1 and children_ops[0].type == 'Shape': - self.unused_tensor.add( - get_input_tensor(children_ops[1], 0).name) - self.resolved_ops[children_ops[1].name] = 1 - else: - op_def.input.extend([parent_reshape_op.inputs[0].name]) - - # deal with Softmax op - op_def.name = softmax_op.name - op_def.type = softmax_op.type - self.resolved_ops[softmax_op.name] = 1 - - # deal with last Reshape op - reshape_op = self.tf_graph[softmax_op.name][0] - self.unused_tensor.add(get_input_tensor(reshape_op, 1).name) - - shape = [dim.value for dim in reshape_op.outputs[0].shape] - if len(shape) == 2: - shape = [1, 1, shape[0], shape[1]] - op_def.output.extend([output.name for output in reshape_op.outputs]) - self.add_output_shape([shape], op_def) - self.resolved_ops[reshape_op.name] = 1 - - def convert_pad(self, op): - op_def = self.net_def.op.add() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - op_def.name = op.name - op_def.type = "Pad" - op_def.input.extend([op.inputs[0].name]) - op_def.output.extend([output.name for output in op.outputs]) - paddings_arg = op_def.arg.add() - paddings_arg.name = 'paddings' - if self.device == 'gpu': - paddings_value = get_input_tensor(op, 1).eval().astype(np.int32) - else: - paddings_value = get_input_tensor(op, 1).eval().astype(np.int32) - paddings_value = paddings_value[[0, 3, 1, 2]] - paddings_arg.ints.extend(paddings_value.flat) - self.unused_tensor.add(get_input_tensor(op, 1).name) - if len(op.inputs) == 3: - constant_value_arg = op_def.arg.add() - 
constant_value_arg.name = 'constant_value' - constant_value_arg.i = \ - get_input_tensor(op, 2).eval().astype(np.int32).flat[0] - self.unused_tensor.add(get_input_tensor(op, 2).name) - self.add_output_shape(op.outputs, op_def) - self.resolved_ops[op.name] = 1 - - def convert_normal_op(self, op): - op_def = self.net_def.op.add() - arg = op_def.arg.add() - arg.name = 'T' - arg.i = self.dt - op_def.name = op.name - op_def.type = op.type - op_def.input.extend([input.name for input in op.inputs]) - op_def.output.extend([output.name for output in op.outputs]) - self.add_output_shape(op.outputs, op_def) - self.resolved_ops[op.name] = 1 - - def convert(self, input_nodes, output_nodes): - if self.device == 'gpu': - self.add_gpu_input_transform(input_nodes) - if self.device == 'cpu': - self.add_cpu_input_transform(input_nodes) - - for op in self.tf_ops: - if self.resolved_ops[op.name] == 1: - continue - if op.type in ['Placeholder', 'Identity']: - self.resolved_ops[op.name] = 1 - pass - elif op.type == 'Const': - pass - elif op.type == 'Reshape': - self.convert_reshape(op) - elif self.is_atrous_conv2d(op): - self.convert_atrous_conv2d(op) - elif self.check_conv_to_fc(op): - self.convert_global_conv_to_fc(op) - elif op.type == 'Conv2D' or op.type == 'DepthwiseConv2dNative': - if self.device == 'gpu' and self.check_winograd_conv(op): - self.convert_winograd_conv_gpu(op) - else: - self.convert_conv2d(op) - elif op.type == 'Conv2DBackpropInput': - self.convert_deconv2d(op) - elif op.type == 'FusedBatchNorm': - self.convert_fused_batchnorm(op) - elif op.type == 'Mul' and op.name.find('batchnorm/mul') != -1: - self.convert_batchnorm(op) - elif op.type == 'AvgPool' or op.type == 'MaxPool': - self.convert_pooling(op) - elif op.type == 'Relu6': - self.convert_relu6(op) - elif op.type == 'Add': - if len(op.inputs) > 2: - self.convert_add(op) - else: - self.convert_eltwise(op, 'ADD') - elif op.type == 'ConcatV2': - self.convert_concat(op) - elif op.type == 'ResizeBilinear': - 
self.convert_resize_bilinear(op) - elif op.type == 'BiasAdd': - self.convert_bias_add(op) - elif op.type == 'SpaceToBatchND': - self.convert_space_to_batch(op, False) - elif op.type == 'BatchToSpaceND': - self.convert_space_to_batch(op, True) - elif op.type == 'DepthToSpace': - self.convert_depth_to_space(op, True) - elif op.type == 'SpaceToDepth': - self.convert_depth_to_space(op, False) - elif op.type in ['Neg', 'neg', 'Negative', 'negative']: - self.convert_eltwise(op, 'NEG') - elif op.type in ['RealDiv', 'Div']: - self.convert_eltwise(op, 'DIV') - elif op.type in ['SquaredDifference']: - self.convert_eltwise(op, 'SQR_DIFF') - elif op.type in ['Pow']: - self.convert_eltwise(op, 'POW') - elif op.type == 'Mul': - self.convert_eltwise(op, 'MUL') - elif op.type == 'Sub': - self.convert_eltwise(op, 'SUB') - elif self.is_softmax(op): - self.convert_softmax(op) - elif op.type in ['Relu', 'Sigmoid', 'Tanh']: - self.convert_activation(op) - # FIXME: hardcode for inception_v3 - elif op.type in ['Squeeze', 'Shape']: - self.resolved_ops[op.name] = 1 - elif op.type == 'Mean': - # Global avg pooling - reduce_dims = op.inputs[1].eval() - if reduce_dims[0] == 1 and reduce_dims[1] == 2: - self.convert_global_avg_pooling(op) - self.unused_tensor.add(op.inputs[1].name) - else: - raise Exception('Unknown Op: %s, type: %s' % (op.name, - op.type)) - elif op.type == 'Pad': - self.convert_pad(op) - # elif op.type in ['']: - # self.convert_normal_op(op) - else: - raise Exception('Unknown Op: %s, type: %s' % (op.name, - op.type)) - - for op in self.tf_ops: - if self.resolved_ops[op.name] == 1: - continue - elif op.type == 'Const': - self.convert_tensor(op) - else: - raise Exception('Unknown Op: %s, type: %s' % (op.name, - op.type)) - - if self.device == 'gpu': - self.add_gpu_output_transform(output_nodes) - - if self.device == 'cpu': - self.add_cpu_output_transform(output_nodes) - - for key in self.resolved_ops: - if self.resolved_ops[key] != 1: - print 'Unresolve Op: %s' % key - - 
-class Optimizer: - def __init__(self, net_def, device): - self.net_def = net_def - self.device = device - self.mace_graph = {} - self.tensor_map = {} - for op in net_def.op: - for input_name in op.input: - if input_name not in self.mace_graph: - self.mace_graph[input_name] = [] - self.mace_graph[input_name].append(op) - - for tensor in net_def.tensors: - self.tensor_map[tensor.name] = tensor - - def get_buffer_tensor_name(self, name): - if self.device == 'gpu': - return name[:-6] + name[-2:] - else: - return name - - def fold_batch_norm(self): - unused_tensors = set() - new_tensors = [] - new_net = mace_pb2.NetDef() - resolved_ops = set() - - for op in self.net_def.op: - if op.name in resolved_ops: - pass - elif op.type == 'DepthwiseConv2d' and len(op.output) == 1 and \ - self.mace_graph[op.output[0]][0].type == 'FoldedBatchNorm': - depthwise_conv2d_op = op - folded_bn_op = self.mace_graph[op.output[0]][0] - weight_buffer_name = self.get_buffer_tensor_name( - depthwise_conv2d_op.input[1]) - weight_tensor = self.tensor_map[weight_buffer_name] - scale_buffer_name = self.get_buffer_tensor_name( - folded_bn_op.input[1]) - offset_buffer_name = self.get_buffer_tensor_name( - folded_bn_op.input[2]) - scale_tensor = self.tensor_map[scale_buffer_name] - weight_shape = weight_tensor.dims - idx = 0 - if self.device == 'cpu': # OIHW - for oc in range(weight_shape[0]): - for ic in range(weight_shape[1]): - for i in range(weight_shape[2]): - for j in range(weight_shape[3]): - weight_tensor.float_data[ - idx] *= scale_tensor.float_data[ - ic * weight_shape[0] + oc] - idx += 1 - else: # HWIO - for i in range(weight_shape[0]): - for j in range(weight_shape[1]): - for ic in range(weight_shape[2]): - for oc in range(weight_shape[3]): - weight_tensor.float_data[ - idx] *= scale_tensor.float_data[ - ic * weight_shape[3] + oc] - idx += 1 - - new_tensors.append(weight_tensor) - unused_tensors.add(weight_tensor.name) - unused_tensors.add(scale_tensor.name) - - if self.device == 'gpu': - 
scale_b2i_op = self.mace_graph[scale_buffer_name][0] - offset_b2i_op = self.mace_graph[offset_buffer_name][0] - resolved_ops.add(scale_b2i_op.name) - resolved_ops.add(offset_b2i_op.name) - new_net.op.extend([offset_b2i_op]) - - resolved_ops.add(depthwise_conv2d_op.name) - resolved_ops.add(folded_bn_op.name) - - offset_tensor_name = folded_bn_op.input[2] - depthwise_conv2d_op.input.extend([offset_tensor_name]) - - for arg in folded_bn_op.arg: - if arg.name == 'activation': - act_arg = depthwise_conv2d_op.arg.add() - act_arg.name = arg.name - act_arg.s = arg.s - elif arg.name == 'max_limit': - act_arg = depthwise_conv2d_op.arg.add() - act_arg.name = arg.name - act_arg.f = arg.f - - depthwise_conv2d_op.output[0] = folded_bn_op.output[0] - new_net.op.extend([depthwise_conv2d_op]) - else: - new_net.op.extend([op]) - - for tensor in self.net_def.tensors: - if tensor.name in unused_tensors: - pass - else: - new_net.tensors.extend([tensor]) - - for tensor in new_tensors: - new_net.tensors.extend([tensor]) - - return new_net - - def optimize(self): - new_net = self.fold_batch_norm() - return new_net - - -def add_shape_info(input_graph_def, input_nodes, input_shapes): - inputs_replaced_graph = graph_pb2.GraphDef() - for node in input_graph_def.node: - if node.name in input_nodes: - idx = input_nodes.index(node.name) - input_shape = input_shapes[idx] - placeholder_node = copy.deepcopy(node) - placeholder_node.attr.clear() - placeholder_node.attr['shape'].shape.dim.extend([ - tensor_shape_pb2.TensorShapeProto.Dim(size=i) - for i in input_shape - ]) - placeholder_node.attr['dtype'].CopyFrom(node.attr['dtype']) - inputs_replaced_graph.node.extend([placeholder_node]) - else: - inputs_replaced_graph.node.extend([copy.deepcopy(node)]) - return inputs_replaced_graph - - -def convert_to_mace_pb(model_file, input_node, input_shape, output_node, - data_type, device, winograd): - net_def = mace_pb2.NetDef() - dt = data_type_map[data_type] - - input_graph_def = tf.GraphDef() - with 
gfile.Open(model_file, "rb") as f: - data = f.read() - input_graph_def.ParseFromString(data) - - input_nodes = [x for x in input_node.split(',')] - input_shapes = [] - if input_shape != "": - input_shape_strs = [x for x in input_shape.split(':')] - for shape_str in input_shape_strs: - input_shapes.extend([[int(x) for x in shape_str.split(',')]]) - output_nodes = [x for x in output_node.split(',')] - assert len(input_nodes) == len(input_shapes) - - input_graph_def = add_shape_info(input_graph_def, input_nodes, - input_shapes) - with tf.Session() as session: - with session.graph.as_default() as graph: - tf.import_graph_def(input_graph_def, name="") - ops = graph.get_operations() - converter = TFConverter(graph, ops, net_def, dt, device, winograd) - converter.convert(input_nodes, output_nodes) - optimizer = Optimizer(net_def, device) - net_def = optimizer.optimize() - print "Model Converted." - if device == 'gpu': - print "start optimize memory." - memory_optimizer.optimize_gpu_memory(net_def) - print "Memory optimization done." - elif device == 'cpu': - print "start optimize memory." - memory_optimizer.optimize_cpu_memory(net_def) - print "Memory optimization done." 
- - return net_def diff --git a/mace/test/mace_api_mt_test.cc b/mace/test/mace_api_mt_test.cc index 5510afc2..4bd33add 100644 --- a/mace/test/mace_api_mt_test.cc +++ b/mace/test/mace_api_mt_test.cc @@ -152,7 +152,7 @@ void CheckOutputs(const NetDef &net_def, memcpy(input_data.data(), input.second.data().get(), data_size * sizeof(float)); std::string input_name = MakeString("mace_input_node_", - input.first, ":0"); + input.first); net.AddInputFromArray(input_name, input.second.shape(), input_data); } @@ -181,7 +181,7 @@ void CheckOutputs(const NetDef &net_def, float *data = tmp_tensor->mutable_data(); memcpy(data, output.second.data().get(), data_size * sizeof(float)); std::string output_name = MakeString("mace_output_node_", - output.first, ":0"); + output.first); ops::test::ExpectTensorNear(*tmp_tensor, *net.GetOutput(output_name.data()), 1e-5); @@ -265,7 +265,7 @@ void MaceRunFunc(const int in_out_size) { for (size_t i = 0; i < input_names.size(); ++i) { std::string input_name = MakeString("mace_input_node_", - input_names[i], ":0"); + input_names[i]); BufferToImage(input_name, input_names[i], mace::kernels::IN_OUT_CHANNEL, {mem_map[input_names[i]]}, @@ -281,7 +281,7 @@ void MaceRunFunc(const int in_out_size) { } for (size_t i = 0; i < output_names.size(); ++i) { std::string output_name = MakeString("mace_output_node_", - output_names[i], ":0"); + output_names[i]); ImageToBuffer(output_names[i], output_name, mace::kernels::IN_OUT_CHANNEL, &net_def); } diff --git a/mace/test/mace_api_test.cc b/mace/test/mace_api_test.cc index 73775520..be086426 100644 --- a/mace/test/mace_api_test.cc +++ b/mace/test/mace_api_test.cc @@ -162,7 +162,7 @@ void CheckOutputs(const NetDef &net_def, memcpy(input_data.data(), input.second.data().get(), data_size * sizeof(float)); std::string input_name = MakeString("mace_input_node_", - input.first, ":0"); + input.first); net.AddInputFromArray(input_name, input.second.shape(), input_data); } @@ -191,7 +191,7 @@ void CheckOutputs(const 
NetDef &net_def, float *data = tmp_tensor->mutable_data(); memcpy(data, output.second.data().get(), data_size * sizeof(float)); std::string output_name = MakeString("mace_output_node_", - output.first, ":0"); + output.first); ops::test::ExpectTensorNear(*tmp_tensor, *net.GetOutput(output_name.data()), 1e-5); @@ -275,7 +275,7 @@ void MaceRun(const int in_out_size, for (size_t i = 0; i < input_names.size(); ++i) { std::string input_name = MakeString("mace_input_node_", - input_names[i], ":0"); + input_names[i]); BufferToImage(input_name, input_names[i], mace::kernels::IN_OUT_CHANNEL, {mem_map[input_names[i]]}, @@ -291,7 +291,7 @@ void MaceRun(const int in_out_size, } for (size_t i = 0; i < output_names.size(); ++i) { std::string output_name = MakeString("mace_output_node_", - output_names[i], ":0"); + output_names[i]); ImageToBuffer(output_names[i], output_name, mace::kernels::IN_OUT_CHANNEL, &net_def); } -- GitLab