move distribute tools from mace

0e5ebc1f · yejianwu · 0e5ebc1f · 0e5ebc1f · 0e5ebc1f · 0e5ebc1f
20 changed files
--- a/hexagon/BUILD
+++ b/hexagon/BUILD
+# Exposes the prebuilt shared object libhexagon_controller.so (checked in next
+# to this BUILD file) as a cc_library target named "hexagon" that any package
+# may depend on.
+cc_library(
+    name = "hexagon",
+    srcs = [
+        "libhexagon_controller.so",
+    ],
+    visibility = ["//visibility:public"],
+)
--- a/hexagon/libhexagon_controller.so
+++ b/hexagon/libhexagon_controller.so
--- a/libmace_v7/README.md
+++ b/libmace_v7/README.md
+# Mace static libraries for GPU
--- a/libmace_v7_dsp/README.md
+++ b/libmace_v7_dsp/README.md
+# Mace static libraries for GPU and DSP
--- a/mace
+++ b/mace
+libmace_v7_dsp
\ No newline at end of file
--- a/proto/BUILD
+++ b/proto/BUILD
+# Description:
+#   Build rules for the MACE protobuf schema (mace.proto).
+#
+# Every target in this package is visible to all other packages.
+package(
+    default_visibility = ["//visibility:public"],
+)
+licenses(["notice"])  # Apache 2.0
+load("@com_google_protobuf//:protobuf.bzl", "py_proto_library")
+# Generates the Python protobuf bindings for mace.proto. The Python tools in
+# this change import the result as `from lib.proto import mace_pb2` and depend
+# on it through the label //lib/proto:mace_py.
+py_proto_library(
+    name = "mace_py",
+    srcs = ["mace.proto"],
+    default_runtime = "@com_google_protobuf//:protobuf_python",
+    protoc = "@com_google_protobuf//:protoc",
+    srcs_version = "PY2AND3",
+    deps = ["@com_google_protobuf//:protobuf_python"],
+)
--- a/proto/mace.proto
+++ b/proto/mace.proto
+syntax = "proto2";
+package mace;
+// Net mode selector with two values, INIT and NORMAL.
+// NOTE(review): no message in this file references this enum; the meaning of
+// each value is defined by its consumer, which is not visible here.
+enum NetMode {
+  INIT   = 0;
+  NORMAL = 1;
+}
+// Device kinds a model can target.
+// NOTE(review): no message in this file references this enum; it is
+// presumably consumed by the runtime - confirm.
+enum DeviceType {
+  CPU    = 0;                    // CPU is used by default.
+  NEON   = 1;
+  OPENCL = 2;
+}
+// Element type of a tensor or of an operator output.
+// The numeric values are not contiguous (10 is followed by 19 and 22), so
+// code must not assume a dense range.
+enum DataType {
+  DT_INVALID = 0;
+  // Data types that all computation devices are expected to be
+  // capable of supporting.
+  DT_FLOAT = 1;
+  DT_DOUBLE = 2;
+  DT_INT32 = 3;
+  DT_UINT8 = 4;
+  DT_INT16 = 5;
+  DT_INT8 = 6;
+  DT_STRING = 7;
+  DT_INT64 = 8;
+  DT_UINT16 = 9;
+  DT_BOOL = 10;
+  DT_HALF = 19;
+  DT_UINT32 = 22;
+}
+// A constant tensor: shape, element type and the element values. The values
+// live in whichever *_data field matches data_type (see the per-field notes).
+// Field numbers are not in declaration order (name = 7 is declared last but
+// one); they must stay as they are for wire compatibility.
+message TensorProto {
+  // The dimensions in the tensor.
+  repeated int64 dims = 1;
+  // Element type; DT_FLOAT when the field is not set.
+  optional DataType data_type = 2 [default = DT_FLOAT];
+  // For float
+  repeated float float_data = 3 [packed = true];
+  // For int32, uint8, int8, uint16, int16, bool, and float16
+  // Note about float16: in storage, float16 values are converted byte-wise
+  // to unsigned short and then stored in the int32_data field.
+  repeated int32 int32_data = 4 [packed = true];
+  // For bytes
+  optional bytes byte_data = 5;
+  // For strings
+  repeated bytes string_data = 6;
+  // For double
+  repeated double double_data = 9 [packed = true];
+  // For int64
+  repeated int64 int64_data = 10 [packed = true];
+  // Optionally, a name for the tensor.
+  optional string name = 7;
+  // NOTE(review): presumably the hexagon nnlib node id, as for
+  // OperatorDef.node_id - confirm.
+  optional uint32 node_id = 100;
+}
+// A named attribute attached to an operator (OperatorDef.arg) or to the whole
+// net (NetDef.arg). It offers one scalar slot and one repeated slot for each
+// of float, int64 and bytes; nothing in the schema restricts how many of the
+// slots are populated at once.
+message Argument {
+  optional string name = 1;
+  optional float f = 2;
+  optional int64 i = 3;
+  optional bytes s = 4;
+  repeated float floats = 5;
+  repeated int64 ints = 6;
+  repeated bytes strings = 7;
+}
+// For hexagon mace-nnlib.
+// Identifies one input of an operator as a (node id, output port) pair.
+// NOTE(review): presumably the producing node and which of its outputs is
+// consumed - confirm against the DSP converter.
+message NodeInput {
+  optional int32 node_id = 1;
+  optional int32 output_port = 2;
+}
+// Shape of a single operator output; OperatorDef holds a repeated list of
+// these in its output_shape field.
+message OutputShape {
+  repeated int64 dims = 1;
+}
+// One operator (node) of the network: its input and output tensor names,
+// its type and attributes, and per-output shape and type information.
+message OperatorDef {
+  repeated string input = 1;
+  repeated string output = 2;
+  optional string name = 3;
+  optional string type = 4;
+  repeated Argument arg = 5;
+  repeated OutputShape output_shape = 6;
+  repeated DataType output_type = 7;
+  // Memory optimization: only ops with a single output are supported.
+  // Refers to MemoryBlock.mem_id; stays -1 until memory_optimizer.py
+  // assigns a block.
+  optional int32 mem_id = 10 [default = -1];
+  // for hexagon mace-nnlib
+  optional uint32 node_id = 100;
+  optional uint32 op_id = 101;
+  optional uint32 padding = 102;
+  repeated NodeInput node_input = 103;
+  repeated int32 out_max_byte_size = 104; // only 32-bit lengths are supported
+}
+// For memory optimization.
+// One reusable memory block. memory_optimizer.py fills x with the largest
+// dims[2] * ceil(dims[3] / 4) and y with the largest dims[0] * dims[1] over
+// all operator outputs assigned to the block.
+message MemoryBlock {
+  optional int32 mem_id = 1;
+  optional uint32 x = 2;
+  optional uint32 y = 3;
+}
+// The set of memory blocks shared by the operators of a net; stored in
+// NetDef.mem_arena.
+message MemoryArena {
+  repeated MemoryBlock mem_block = 1;
+}
+// For hexagon mace-nnlib.
+// Describes one network input: name, node id, shape, maximum byte size and
+// element type. Has the same fields as OutputInfo.
+message InputInfo {
+  optional string name = 1;
+  optional int32 node_id = 2;
+  repeated int32 dims = 3;
+  optional int32 max_byte_size = 4; // only 32-bit lengths are supported
+  optional DataType data_type = 5 [default = DT_FLOAT];
+}
+// Describes one network output: name, node id, shape, maximum byte size and
+// element type. Has the same fields as InputInfo.
+message OutputInfo {
+  optional string name = 1;
+  optional int32 node_id = 2;
+  repeated int32 dims = 3;
+  optional int32 max_byte_size = 4; // only 32-bit lengths are supported
+  optional DataType data_type = 5 [default = DT_FLOAT];
+}
+// A whole network: its operators, constant tensors and net-level arguments,
+// plus optional memory-planning and hexagon-specific input/output metadata.
+message NetDef {
+  optional string name = 1;
+  repeated OperatorDef op = 2;
+  optional string version = 3;
+  repeated Argument arg = 4;
+  repeated TensorProto tensors = 5;
+  // For memory optimization; populated by memory_optimizer.py.
+  optional MemoryArena mem_arena = 10;
+  // for hexagon mace-nnlib
+  repeated InputInfo input_info = 100;
+  repeated OutputInfo output_info = 101;
+}
--- a/python/tools/BUILD
+++ b/python/tools/BUILD
+# Library with the TensorFlow -> MACE converters (CPU/GPU and DSP variants)
+# and the graph/dtype helpers they share.
+# NOTE(review): dsp_ops.py sits in this directory but is not listed in the
+# srcs of any target here; if tf_dsp_converter_lib.py imports it, it needs to
+# be added - confirm.
+# NOTE(review): ":memory_optimizer" is declared below as a py_binary, yet it
+# is used here as a library dependency - confirm this is intended.
+py_library(
+    name = "tf_converter_lib",
+    srcs = [
+        "convert_util.py",
+        "graph_util.py",
+        "tf_converter_lib.py",
+        "tf_dsp_converter_lib.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":memory_optimizer",
+        "//lib/proto:mace_py",
+    ],
+)
+# Library that renders a NetDef into C++ source files through a Jinja2
+# template (see source_converter_lib.py).
+# NOTE(review): the module also imports numpy, tensorflow and jinja2, none of
+# which are declared here; presumably they come from the host Python
+# environment - confirm.
+py_library(
+    name = "source_converter_lib",
+    srcs = [
+        "source_converter_lib.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//lib/proto:mace_py",
+    ],
+)
+# Command-line entry point of the converter (tf_converter.py); it pulls in
+# both converter libraries plus six.
+py_binary(
+    name = "tf_converter",
+    srcs = ["tf_converter.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":tf_converter_lib",
+        ":source_converter_lib",
+        "@six_archive//:six",
+    ],
+)
+# Memory planner for NetDef operators (memory_optimizer.py).
+# NOTE(review): memory_optimizer.py as added in this change has no
+# `if __name__ == '__main__'` entry point and :tf_converter_lib depends on
+# this target, so py_library may be the more fitting rule - confirm that
+# nothing runs this target directly before changing it.
+py_binary(
+    name = "memory_optimizer",
+    srcs = ["memory_optimizer.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//lib/proto:mace_py",
+    ],
+)
--- a/python/tools/binary_codegen.py
+++ b/python/tools/binary_codegen.py
+import argparse
+import os
+import sys
+import struct
+import jinja2
+import numpy as np
+# python mace/python/tools/binary_codegen.py \
+#     --binary_file=${BIN_FILE} --output_path=${CODE_GEN_PATH} --variable_name=kTuningParamsData
+FLAGS = None
+def generate_cpp_source():
+  data_map = {}
+  if not os.path.exists(FLAGS.binary_file):
+    env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0]))
+    return env.get_template('str2vec_maps.cc.tmpl').render(
+      maps=data_map,
+      data_type='unsigned int',
+      variable_name=FLAGS.variable_name
+    )
+  with open(FLAGS.binary_file, "rb") as binary_file:
+    binary_array = np.fromfile(binary_file, dtype=np.uint8)
+  idx = 0
+  size, = struct.unpack("Q", binary_array[idx:idx+8])
+  print size
+  idx += 8
+  for _ in xrange(size):
+    key_size, = struct.unpack("i", binary_array[idx:idx+4])
+    idx += 4
+    key, = struct.unpack(str(key_size) + "s", binary_array[idx:idx+key_size])
+    idx += key_size
+    params_size, = struct.unpack("i", binary_array[idx:idx+4])
+    idx += 4
+    data_map[key] = []
+    count = params_size / 4
+    params = struct.unpack(str(count) + "i", binary_array[idx:idx+params_size])
+    for i in params:
+      data_map[key].append(i)
+    idx += params_size
+  env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0]))
+  return env.get_template('str2vec_maps.cc.tmpl').render(
+    maps = data_map,
+    data_type = 'unsigned int',
+    variable_name = FLAGS.variable_name
+  )
+def main(unused_args):
+  cpp_binary_source = generate_cpp_source()
+  if os.path.isfile(FLAGS.output_path):
+    os.remove(FLAGS.output_path)
+  w_file = open(FLAGS.output_path, "w")
+  w_file.write(cpp_binary_source)
+  w_file.close()
+def parse_args():
+  """Parses command line arguments."""
+  parser = argparse.ArgumentParser()
+  parser.add_argument(
+      "--binary_file",
+      type=str,
+      default="",
+      help="The binaries file path.")
+  parser.add_argument(
+      "--output_path",
+      type=str,
+      default="",
+      help="The path of generated C++ source file which contains the binary.")
+  parser.add_argument(
+    "--variable_name",
+    type=str,
+    default="kTuningParamsData",
+    help="global variable name.")
+  return parser.parse_known_args()
+if __name__ == '__main__':
+  # parse_args() returns (namespace, unrecognised args). The namespace is
+  # stored in the module-level FLAGS that the functions above read; the
+  # leftovers are handed to main() together with the script name.
+  FLAGS, unparsed = parse_args()
+  main(unused_args=[sys.argv[0]] + unparsed)
--- a/python/tools/convert_util.py
+++ b/python/tools/convert_util.py
+import tensorflow as tf
+from lib.proto import mace_pb2
+# Lookup table from TensorFlow dtypes to mace_pb2 DataType values.
+# Each quantized dtype (qint*/quint*) maps to the same MACE type as the plain
+# integer dtype of equal width and signedness.
+TF_DTYPE_2_MACE_DTYPE_MAP = {
+    tf.float32: mace_pb2.DT_FLOAT,
+    tf.double: mace_pb2.DT_DOUBLE,
+    tf.half: mace_pb2.DT_HALF,
+    tf.int64: mace_pb2.DT_INT64,
+    tf.int32: mace_pb2.DT_INT32,
+    tf.qint32: mace_pb2.DT_INT32,
+    tf.int16: mace_pb2.DT_INT16,
+    tf.qint16: mace_pb2.DT_INT16,
+    tf.int8: mace_pb2.DT_INT8,
+    tf.qint8: mace_pb2.DT_INT8,
+    tf.quint16: mace_pb2.DT_UINT16,
+    tf.uint16: mace_pb2.DT_UINT16,
+    tf.quint8: mace_pb2.DT_UINT8,
+    tf.uint8: mace_pb2.DT_UINT8,
+    tf.string: mace_pb2.DT_STRING,
+    tf.bool: mace_pb2.DT_BOOL,
+}
+def tf_dtype_2_mace_dtype(tf_dtype):
+    mace_dtype = TF_DTYPE_2_MACE_DTYPE_MAP.get(tf_dtype, None)
+    if not mace_dtype:
+        raise Exception("Not supported tensorflow dtype: " + tf_dtype)
+    return mace_dtype
--- a/python/tools/dsp_ops.py
+++ b/python/tools/dsp_ops.py
+class DspOps(object):
+  def __init__(self):
+    self.dsp_ops = {
+      'INPUT': 'INPUT"',
+      'OUTPUT': 'OUTPUT',
+      'NoOp': 'Nop',
+      'FLATTEN': 'Flatten',
+      'Identity': 'Nop',
+      'Placeholder': 'INPUT',
+      'Const': 'Const',
+      'QuantizedConv2D': 'QuantizedConv2d_8x8to32',
+      'QuantizedMatMul': 'QuantizedMatMul_8x8to32',
+      'QuantizeDownAndShrinkRange': 'QuantizeDownAndShrinkRange_32to8',
+      'QuantizedRelu': 'QuantizedRelu_8',
+      'QuantizedReluX': 'QuantizedReluX_8',
+      'QuantizedMaxPool': 'QuantizedMaxPool_8',
+      'QuantizedAvgPool': 'QuantizedAvgPool_8',
+      'QuantizedConcat': 'QuantizedConcat_8',
+      'QuantizedBiasAdd': 'QuantizedBiasAdd_8p8to32',
+      'QuantizedResizeBilinear' : 'QuantizedResizeBilinear_8',
+      'QuantizedSpaceToBatchND': 'QuantizedSpaceToBatchND_8',
+      'QuantizedBatchToSpaceND': 'QuantizedBatchToSpaceND_8',
+      'Min': 'Min_f',
+      'Max': 'Max_f',
+      'QuantizeV2': 'Quantize',
+      'Dequantize': 'Dequantize',
+      'Softmax': 'Softmax_f',
+      'Reshape': 'Reshape',
+      'QuantizedReshape': 'QuantizedReshape',
+      'Sigmoid': 'Sigmoid_f',
+      'Slice': 'Slice_f',
+      'Add': 'Add_f',
+      'Mul': 'Mul_f',
+      'Requantize': 'Requantize_32to8',
+      'RequantizationRange': 'RequantizationRange_32',
+      'Sub': 'Sub_f',
+      'Pack': 'Pack_int32',
+      'StridedSlice': 'StridedSlice_f',
+      'ExpandDims': 'ExpandDims_f',
+      'QuantizedMul': 'QuantizedMul_8x8to32',
+      'QuantizedAdd': 'QuantizedAdd_8p8to32',
+      'Pad': 'Pad_f',
+      'SpaceToBatchND': 'SpaceToBatchND_f',
+      'BatchToSpaceND': 'BatchToSpaceND_f',
+      'ResizeBilinear': 'ResizeBilinear_f',
+      'ConcatV2': 'ConcatV2_f',
+      'Conv2DBackpropInput': 'Deconv_f',
+      'Tanh': 'Tanh_f',
+      'Split': 'Split_f',
+      'Transpose': 'Transpose_f',
+      'Concat': 'Concat_f',
+      'AddN': 'AddN_f',
+    }
+  def has_op(self, tf_op):
+    return tf_op in self.dsp_ops
+  def map_nn_op(self, tf_op):
+    if tf_op not in self.dsp_ops:
+      raise Exception('Could not map nn op for: ', tf_op)
+    return self.dsp_ops[tf_op]
--- a/python/tools/graph_util.py
+++ b/python/tools/graph_util.py
+import tensorflow as tf
+from lib.proto import mace_pb2
+from collections import OrderedDict
+def sort_tf_node(node, nodes_map, ordered_nodes_map):
+    if node.name not in ordered_nodes_map:
+        for input_tensor_name in node.input:
+            input_node_name = input_tensor_name.split(':')[
+                0] if ':' in input_tensor_name else input_tensor_name
+            if input_node_name not in nodes_map or input_node_name in ordered_nodes_map:
+                continue
+            input_node = nodes_map[input_node_name]
+            sort_tf_node(input_node, nodes_map, ordered_nodes_map)
+        ordered_nodes_map[node.name] = node
+def sort_tf_graph(graph_def):
+    nodes_map = {}
+    ordered_nodes_map = OrderedDict()
+    for node in graph_def.node:
+        nodes_map[node.name] = node
+    for node in graph_def.node:
+        sort_tf_node(node, nodes_map, ordered_nodes_map)
+    sorted_graph = tf.GraphDef()
+    sorted_graph.node.extend([node for node in ordered_nodes_map.values()])
+    return sorted_graph
+def sort_mace_node(node, nodes_map, ordered_nodes_map):
+    if node.name not in ordered_nodes_map:
+        for input_tensor_name in node.input:
+            input_node_name = input_tensor_name.split(':')[
+                0] if ':' in input_tensor_name else input_tensor_name
+            if input_node_name not in nodes_map or input_node_name in ordered_nodes_map:
+                continue
+            input_node = nodes_map[input_node_name]
+            sort_mace_node(input_node, nodes_map, ordered_nodes_map)
+        ordered_nodes_map[node.name] = node
+def sort_mace_graph(graph_def, output_name):
+    nodes_map = {}
+    ordered_nodes_map = OrderedDict()
+    for node in graph_def.op:
+        nodes_map[node.name] = node
+    sort_mace_node(nodes_map[output_name], nodes_map, ordered_nodes_map)
+    sorted_graph = mace_pb2.NetDef()
+    sorted_graph.tensors.extend(graph_def.tensors)
+    sorted_graph.op.extend([node for node in ordered_nodes_map.values()])
+    return sorted_graph
--- a/python/tools/memory_optimizer.py
+++ b/python/tools/memory_optimizer.py
+import sys
+import operator
+from lib.proto import mace_pb2
+class MemoryOptimizer(object):
+  def __init__(self, net_def):
+    self.net_def = net_def
+    self.idle_mem = set()
+    self.op_mem = {}    # op_name->mem_id
+    self.mem_block = {} # mem_id->[x, y]
+    self.total_mem_count = 0
+    self.ref_counter = {}
+    consumers = {}
+    for op in net_def.op:
+      if self.is_buffer_image_op(op):
+        continue
+      for ipt in op.input:
+        if ipt not in consumers:
+          consumers[ipt] = []
+        consumers[ipt].append(op)
+    # only ref op's output tensor
+    for op in net_def.op:
+      if self.is_buffer_image_op(op):
+        continue
+      tensor_name = op.output[0]
+      if tensor_name in consumers:
+        self.ref_counter[tensor_name] = len(consumers[tensor_name])
+      else:
+        self.ref_counter[tensor_name] = 0
+  def is_buffer_image_op(self, op):
+    return op.type == 'BufferToImage' or op.type == 'ImageToBuffer'
+  def optimize(self):
+    for op in self.net_def.op:
+      if self.is_buffer_image_op(op):
+        continue
+      if len(self.idle_mem) == 0:
+        # allocate new mem
+        mem_id = self.total_mem_count
+        self.total_mem_count += 1
+      else:
+        # reuse mem
+        mem_id = self.idle_mem.pop()
+      if not op.output_shape:
+        print('WARNING: There is no output shape information to do memory optimization.')
+        return
+      op.mem_id = mem_id
+      self.op_mem[op.output[0]] = mem_id
+      if mem_id not in self.mem_block:
+        self.mem_block[mem_id] = [0, 0]
+      mem_size = self.mem_block[mem_id]
+      mem_size[1] = max(mem_size[1], op.output_shape[0].dims[0] * op.output_shape[0].dims[1])
+      mem_size[0] = max(mem_size[0], op.output_shape[0].dims[2] * int((op.output_shape[0].dims[3]+3)/4))
+      # de-ref input tensor mem
+      for ipt in op.input:
+        if ipt in self.ref_counter:
+          self.ref_counter[ipt] -= 1
+          if self.ref_counter[ipt] == 0:
+            self.idle_mem.add(self.op_mem[ipt])
+          elif self.ref_counter[ipt] < 0:
+            raise Exception('ref count is less than 0')
+    for mem in self.mem_block:
+      arena = self.net_def.mem_arena
+      block = arena.mem_block.add()
+      block.mem_id = mem
+      block.x = self.mem_block[mem][0]
+      block.y = self.mem_block[mem][1]
+    print('total op: %d', len(self.net_def.op))
+    origin_mem_size = 0
+    optimized_mem_size = 0
+    for op in self.net_def.op:
+      if self.is_buffer_image_op(op):
+        continue
+      origin_mem_size += reduce(operator.mul, op.output_shape[0].dims, 1)
+    for mem in self.mem_block:
+      print mem, self.mem_block[mem]
+      optimized_mem_size += reduce(operator.mul, self.mem_block[mem], 4)
+    print('origin mem: %d, optimized mem: %d', origin_mem_size, optimized_mem_size)
+def optimize_memory(net_def):
+  mem_optimizer = MemoryOptimizer(net_def)
+  mem_optimizer.optimize()
--- a/python/tools/model.template
+++ b/python/tools/model.template
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+// Generated by the mace converter.  DO NOT EDIT!
+//
+{% if mode == 0 %}
+{# mode 0: one translation unit per constant tensor.
+   source_converter_lib.py renders this branch once for every tensor with
+   `tensor_info` (name, data_type name and raw bytes) and `tensor` (the
+   TensorProto itself). It emits the byte array plus a Create<name>()
+   function that appends the matching ConstTensor. The array is declared
+   alignas(4) unless the data type is DT_UINT8. The tensor name is used
+   verbatim as a C++ identifier, so it has to be a valid one.
+   The converter builds its environment with trim_blocks=True, so this
+   comment adds nothing to the rendered output. #}
+#include <vector>
+#include "mace/core/public/mace.h"
+namespace mace {
+namespace {{tag}} {
+{% if tensor_info.data_type != 'DT_UINT8' %} alignas(4) {% endif %} unsigned char {{ tensor_info.name }}[] = {
+{% for d in tensor_info.data %}{{"0x%02X, " % d }}{%endfor%}
+};
+void Create{{tensor.name}}(std::vector<mace::ConstTensor> &tensors) {
+  tensors.emplace_back(mace::ConstTensor(
+      {{ tensor.name|tojson }}, {{ tensor.name }},
+      { {{ tensor.dims|join(', ') }} }, {{ tensor.data_type }}, {{ tensor.node_id }}));
+}
+}  // namespace {{tag}}
+}  // namespace mace
+{% elif mode == 1 %}
+{# mode 1: one translation unit per batch of operators.
+   source_converter_lib.py renders this branch for ops [start, end) of `net`
+   (ten ops per file) and emits one CreateOperator<i>() per op. Each function
+   fills in the arguments, the optional mem_id, the output shapes, then
+   name/type/inputs/outputs/output types/node id through UpdateOp(). When
+   `runtime` is 'dsp' it also sets the padding, node inputs and
+   out_max_byte_size values. `stringfy` is the custom filter registered by
+   the converter; it renders a list as comma separated, double quoted
+   strings.
+   The converter builds its environment with trim_blocks=True, so this
+   comment adds nothing to the rendered output. #}
+#include <vector>
+#include <string>
+#include "mace/core/public/mace.h"
+namespace {
+void UpdateOp(mace::OperatorDef &op,
+              const std::string &name,
+              const std::string &type,
+              const std::vector<std::string> &inputs,
+              const std::vector<std::string> &outputs,
+              const std::vector<mace::DataType> &output_types,
+              uint32_t node_id) {
+  op.set_name(name);
+  op.set_type(type);
+  op.set_input(inputs);
+  op.set_output(outputs);
+  op.set_output_type(output_types);
+  op.set_node_id(node_id);
+}
+}
+namespace mace {
+namespace {{tag}} {
+{% for i in range(start, end) %}
+void CreateOperator{{i}}(mace::OperatorDef &op) {
+  mace::Argument *arg = nullptr;
+  {% for arg in net.op[i].arg %}
+  arg = op.add_arg();
+  arg->set_name({{ arg.name|tojson }});
+  {%- if arg.HasField('f') %}
+  arg->set_f({{ arg.f }});
+  {%- endif %}
+  {%- if arg.HasField('i') %}
+  arg->set_i({{ arg.i }});
+  {%- endif %}
+  {%- if arg.HasField('s') %}
+  arg->set_s({{ arg.s|tojson }});
+  {%- endif %}
+  {% if arg.floats|length != 0 %}
+  arg->set_floats({ {{ arg.floats|join(', ') }} });
+  {% endif %}
+  {% if arg.ints|length != 0 %}
+  arg->set_ints({ {{ arg.ints|join(', ') }} });
+  {% endif %}
+  {% if arg.strings|length != 0 %}
+  arg->set_strings({ {{ arg.strings|stringfy() }} });
+  {% endif %}
+  {% endfor %}
+  {% if net.op[i].HasField('mem_id') %}
+  op.set_mem_id({{net.op[i].mem_id}});
+  {% endif %}
+  {% for shape in net.op[i].output_shape %}
+	{% if shape.dims | length > 0 %}
+  op.add_output_shape(mace::OutputShape({ {{ shape.dims|join(', ') }} }));
+	{% endif %}
+  {% endfor %}
+  std::vector<int> output_types_int({ {{ net.op[i].output_type | join(', ') }} });
+  std::vector<mace::DataType> output_types({{ net.op[i].output_type | length }});
+  for (int k = 0; k < {{ net.op[i].output_type | length }}; ++k) {
+    output_types[k] = static_cast<mace::DataType>(output_types_int[k]);
+  }
+  UpdateOp(op, {{ net.op[i].name|tojson }}, {{ net.op[i].type|tojson}},
+          { {{ net.op[i].input|stringfy }} },
+          { {{ net.op[i].output|stringfy }} },
+          output_types,
+          {{ net.op[i].node_id }});
+  {% if runtime == 'dsp' %}
+    op.set_padding({{ net.op[i].padding }});
+    {% if net.op[i].node_input | length > 0 %}
+    std::vector<int> input_node_ids({ {{ net.op[i].node_input | map(attribute='node_id') | join(', ') }} });
+    std::vector<int> input_output_ports({ {{ net.op[i].node_input | map(attribute='output_port') | join(', ')}} });
+    for (size_t i = 0; i < {{ net.op[i].node_input | length }}; ++i) {
+      mace::NodeInput input(input_node_ids[i], input_output_ports[i]);
+      op.add_node_input(input);
+    }
+    {% endif %}
+    {% if net.op[i].out_max_byte_size | length > 0 %}
+    std::vector<int> out_max_byte_sizes {{ net.op[i].out_max_byte_size | replace('[', '{') | replace(']', '}') }};
+    for (size_t i = 0; i < {{ net.op[i].out_max_byte_size | length }}; ++i) {
+      op.add_out_max_byte_size(out_max_byte_sizes[i]);
+    }
+    {% endif %}
+  {% endif %}
+}
+{% endfor %}
+}  // namespace {{tag}}
+}  // namespace mace
+{% else %}
+{# any other mode (the converter passes mode 2): the model entry point.
+   Rendered once with `tensors` (TensorInfo objects, only their names are
+   used here) and `net` (the NetDef). It declares the Create<tensor>() and
+   CreateOperator<i>() functions emitted by the other two modes, defines
+   helpers that call them, and defines mace::<tag>::CreateNet(), which
+   assembles and returns the NetDef. The net-argument, output-info and
+   memory-arena helpers are emitted only when the NetDef holds such data.
+   The converter builds its environment with trim_blocks=True, so this
+   comment adds nothing to the rendered output. #}
+#include <vector>
+#include <string>
+#include "mace/core/public/mace.h"
+namespace mace {
+namespace {{tag}} {
+{% for tensor in tensors %}
+extern void Create{{ tensor.name }}(std::vector<mace::ConstTensor> &tensors);
+{% endfor %}
+{% for i in range(net.op|length) %}
+extern void CreateOperator{{i}}(mace::OperatorDef &op);
+{% endfor %}
+}  // namespace {{ tag }}
+}  // namespace mace
+namespace {
+{% if net.arg|length != 0 %}
+void CreateNetArg(mace::NetDef &net_def) {
+  net_def.mutable_arg().reserve({{ net.arg|length }});
+  mace::Argument *arg = nullptr;
+  {% for arg in net.arg %}
+  arg = net_def.add_arg();
+  arg->set_name({{ arg.name|tojson }});
+  {%- if arg.HasField('f') %}
+  arg->set_f({{ arg.f }});
+  {% endif %}
+  {%- if arg.HasField('i') %}
+  arg->set_i({{ arg.i }});
+  {% endif %}
+  {%- if arg.HasField('s') %}
+  arg->set_s({{ arg.s|tojson }});
+  {% endif %}
+  {% if arg.floats|length != 0 %}
+  arg->set_floats({ {{ arg.floats|join(', ') }} });
+  {% endif %}
+  {% if arg.ints|length != 0 %}
+  arg->set_ints({ {{ arg.ints|join(', ') }} });
+  {% endif %}
+  {% if arg.strings|length != 0 %}
+  arg->set_strings({ {{ arg.strings|stringfy() }} });
+  {% endif %}
+  {% endfor %}
+}
+{% endif %}
+{% if net.output_info | length > 0 %}
+void CreateOutputInfo(mace::NetDef &net_def) {
+	std::vector<std::vector<int>> dims { {{net.output_info | map(attribute='dims') | join(', ') | replace('[', '{') | replace(']', '}') }} };
+  std::vector<int> data_types_int { {{ net.output_info | map(attribute='data_type') | join(', ') }} };
+  std::vector<mace::DataType> data_types({{ net.output_info | length }});
+  for (int k = 0; k < {{ net.output_info | length }}; ++k) {
+    data_types[k] = static_cast<mace::DataType>(data_types_int[k]);
+  }
+  net_def.mutable_output_info().resize({{ net.output_info | length }});
+  for (int i = 0; i < {{ net.output_info | length }}; ++i) {
+    net_def.mutable_output_info()[i].set_data_type(data_types[i]);
+    net_def.mutable_output_info()[i].set_dims(dims[i]);
+  }
+}
+{% endif %}
+void CreateOperators(std::vector<mace::OperatorDef> &ops) {
+  ops.resize({{ net.op|length }});
+  {% for i in range(net.op|length) %}
+  mace::{{tag}}::CreateOperator{{i}}(ops[{{i}}]);
+  {% endfor %}
+}
+void CreateTensors(std::vector<mace::ConstTensor> &tensors) {
+  tensors.reserve({{ net.tensors|length }});
+  {% for tensor in net.tensors %}
+  mace::{{tag}}::Create{{tensor.name}}(tensors);
+  {% endfor %}
+}
+{% if net.mem_arena.mem_block|length != 0 %}
+void CreateMemoryArena(mace::MemoryArena &mem_arena) {
+  std::vector<mace::MemoryBlock> &mem_block = mem_arena.mutable_mem_block();
+  mem_block.reserve({{ net.mem_arena.mem_block|length }});
+  {% for mem_blk in net.mem_arena.mem_block %}
+  mem_block.emplace_back(mace::MemoryBlock({{ mem_blk.mem_id }},
+                                           {{mem_blk.x}},
+                                           {{mem_blk.y}}));
+  {% endfor %}
+}
+{% endif %}
+}
+namespace mace {
+namespace {{tag}} {
+NetDef CreateNet() {
+  NetDef net_def;
+  net_def.set_name("{{ net.name}}");
+  net_def.set_version("{{ net.version }}");
+  {% if net.arg|length != 0 %}
+  CreateNetArg(net_def);
+  {% endif %}
+  CreateOperators(net_def.mutable_op());
+  CreateTensors(net_def.mutable_tensors());
+  {% if net.mem_arena.mem_block|length != 0 %}
+  CreateMemoryArena(net_def.mutable_mem_arena());
+  {% endif %}
+  {% if net.output_info | length > 0 %}
+  CreateOutputInfo(net_def);
+  {% endif %}
+  return net_def;
+}
+}  // namespace {{tag}}
+}  // namespace mace
+{% endif %}
--- a/python/tools/opencl_codegen.py
+++ b/python/tools/opencl_codegen.py
+import argparse
+import os
+import sys
+import numpy as np
+import jinja2
+# python mace/python/tools/opencl_codegen.py \
+#     --cl_binary_dir=${CL_BIN_DIR} --output_path=${CL_HEADER_PATH}
+FLAGS = None
+def generate_cpp_source():
+  maps = {}
+  for file_name in os.listdir(FLAGS.cl_binary_dir):
+    file_path = os.path.join(FLAGS.cl_binary_dir, file_name)
+    if file_path[-4:] == ".bin":
+      # read binary
+      f = open(file_path, "rb")
+      binary_array = np.fromfile(f, dtype=np.uint8)
+      f.close()
+      maps[file_name[:-4]] = []
+      for ele in binary_array:
+        maps[file_name[:-4]].append(hex(ele))
+  env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0]))
+  return env.get_template('str2vec_maps.cc.tmpl').render(
+    maps = maps,
+    data_type = 'unsigned char',
+    variable_name = 'kCompiledProgramMap'
+  )
+def main(unused_args):
+  if not os.path.exists(FLAGS.cl_binary_dir):
+    print("Input cl_binary_dir " + FLAGS.cl_binary_dir + " doesn't exist!")
+  cpp_cl_binary_source = generate_cpp_source()
+  if os.path.isfile(FLAGS.output_path):
+    os.remove(FLAGS.output_path)
+  w_file = open(FLAGS.output_path, "w")
+  w_file.write(cpp_cl_binary_source)
+  w_file.close()
+def parse_args():
+  """Parses command line arguments."""
+  parser = argparse.ArgumentParser()
+  parser.add_argument(
+      "--cl_binary_dir",
+      type=str,
+      default="./cl_bin/",
+      help="The cl binaries directory.")
+  parser.add_argument(
+      "--output_path",
+      type=str,
+      default="./mace/examples/codegen/opencl/opencl_compiled_program.cc",
+      help="The path of generated C++ header file which contains cl binaries.")
+  return parser.parse_known_args()
+if __name__ == '__main__':
+  # parse_args() returns (namespace, unrecognised args). The namespace is
+  # stored in the module-level FLAGS that the functions above read; the
+  # leftovers are handed to main() together with the script name.
+  FLAGS, unparsed = parse_args()
+  main(unused_args=[sys.argv[0]] + unparsed)
--- a/python/tools/source_converter_lib.py
+++ b/python/tools/source_converter_lib.py
+import struct
+import os
+import uuid
+import numpy as np
+from tensorflow import gfile
+from lib.proto import mace_pb2
+from jinja2 import Environment, FileSystemLoader
+GENERATED_NAME = set()
+def generate_random_name():
+  name = '_' + uuid.uuid4().hex[:7].upper()
+  while name in GENERATED_NAME:
+    name = '_' + uuid.uuid4().hex[:7].upper()
+  GENERATED_NAME.add(name)
+  return name
+def generate_tensor_map(tensors):
+  tensor_map = {}
+  for t in tensors:
+    if not tensor_map.has_key(t.name):
+      tensor_map[t.name] = generate_random_name()
+  return tensor_map
+def generate_in_out_map(ops, tensor_map):
+  in_out_map = {}
+  for op in ops:
+    op.name = generate_random_name()
+    for input_name in op.input:
+        if not in_out_map.has_key(input_name):
+          if tensor_map.has_key(input_name):
+            in_out_map[input_name] = tensor_map[input_name]
+          else:
+            in_out_map[input_name] = generate_random_name()
+    for output_name in op.output:
+      if not in_out_map.has_key(output_name):
+        if tensor_map.has_key(output_name):
+          in_out_map[output_name] = tensor_map[output_name]
+        else:
+          in_out_map[output_name] = generate_random_name()
+  return in_out_map
+def obfuscate_name(net_def):
+  input_node = "mace_input_node"
+  output_node = "mace_output_node"
+  tensor_map = generate_tensor_map(net_def.tensors)
+  in_out_map = generate_in_out_map(net_def.op, tensor_map)
+  for t in net_def.tensors:
+    if input_node not in t.name and output_node not in t.name:
+      t.name = tensor_map[t.name]
+  for op in net_def.op:
+    for i in range(len(op.input)):
+      if input_node not in op.input[i]:
+        op.input[i] = in_out_map[op.input[i]]
+    for i in range(len(op.output)):
+      if output_node not in op.output[i]:
+        op.output[i] = in_out_map[op.output[i]]
+def rename_tensor(net_def):
+  tensor_map = {}
+  for t in net_def.tensors:
+    if not tensor_map.has_key(t.name):
+      tensor_map[t.name] = "_" + t.name[:-2].replace("/", "_")
+      t.name = tensor_map[t.name]
+  for op in net_def.op:
+    for i in range(len(op.input)):
+      if tensor_map.has_key(op.input[i]):
+        op.input[i] = tensor_map[op.input[i]]
+    for i in range(len(op.output)):
+      if tensor_map.has_key(op.output[i]):
+        op.output[i] = tensor_map[op.output[i]]
+class TensorInfo:
+  def __init__(self, t):
+    self.name = t.name
+    self.data_type = mace_pb2.DataType.Name(t.data_type)
+    if t.data_type == mace_pb2.DT_FLOAT:
+      self.data = bytearray(struct.pack('%sf' % len(t.float_data), *t.float_data))
+    elif t.data_type == mace_pb2.DT_INT32:
+      self.data = bytearray(struct.pack('%si' % len(t.int32_data), *t.int32_data))
+    elif t.data_type == mace_pb2.DT_UINT8:
+      self.data = bytearray(np.array(t.int32_data).astype(np.uint8).tolist())
+def stringfy(value):
+  return ', '.join('"{0}"'.format(w) for w in value)
+def convert_to_source(net_def, template, obfuscate, model_tag, output, runtime):
+  if obfuscate:
+    obfuscate_name(net_def)
+  else:
+    rename_tensor(net_def)
+  # Capture our current directory
+  template_dir = os.path.dirname(template)
+  template_name = os.path.basename(template)
+  print template_dir
+  # Create the jinja2 environment.
+  j2_env = Environment(loader=FileSystemLoader(template_dir),
+    trim_blocks=True)
+  j2_env.filters['stringfy'] = stringfy
+  counter = 0
+  output_dir = os.path.dirname(output) + '/'
+  # generate tensor source files
+  for t in net_def.tensors:
+    source = j2_env.get_template(template_name).render(
+      tensor_info = TensorInfo(t),
+      tensor = t,
+      tag = model_tag,
+      mode = 0,
+      runtime = runtime,
+    )
+    with gfile.GFile(output_dir + 'tensor' + str(counter) + '.cc', "wb") as f:
+      f.write(source)
+    counter += 1
+  # generate op source files
+  counter = 0
+  op_size = len(net_def.op)
+  for start in range(0, op_size, 10):
+    source = j2_env.get_template(template_name).render(
+      start = start,
+      end = min(start+10, op_size),
+      net = net_def,
+      tag = model_tag,
+      mode = 1,
+      runtime = runtime,
+    )
+    with gfile.GFile(output_dir + 'op' + str(counter) + '.cc', "wb") as f:
+      f.write(source)
+    counter += 1
+  # generate model source files
+  tensors = [TensorInfo(t) for t in net_def.tensors]
+  source = j2_env.get_template(template_name).render(
+    tensors = tensors,
+    net = net_def,
+    tag = model_tag,
+    mode = 2,
+    runtime = runtime,
+  )
+  with gfile.GFile(output, "wb") as f:
+    f.write(source)
--- a/python/tools/str2vec_maps.cc.tmpl
+++ b/python/tools/str2vec_maps.cc.tmpl
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+// This is a generated file, DO NOT EDIT
+#include <map>
+#include <string>
+#include <vector>
+namespace mace {
+extern const std::map<std::string, std::vector<{{data_type}}>> {{variable_name}}=
+{
+  {# Inputs: `maps` (dict of key -> list of values), `data_type` (C++ element
+     type) and `variable_name` (name of the emitted global). items() is used
+     because dict.iteritems() only exists on Python 2. This comment shares
+     its line with the for tag so it adds nothing to the output.
+  #}{% for key, value in maps.items() %}
+  {
+    "{{key}}",
+    {
+      {%- for ele in value -%}
+      {{ele}},
+      {%- endfor -%}
+    }
+  },  // {{key}}
+{% endfor %}
+};
+}  // namespace mace
--- a/python/tools/tf_converter.py
+++ b/python/tools/tf_converter.py
+import argparse
+import sys
+import tensorflow as tf
+from tensorflow import gfile
+from lib.proto import mace_pb2
+from lib.python.tools import tf_converter_lib
+from lib.python.tools import tf_dsp_converter_lib
+from lib.python.tools import source_converter_lib
+# ./bazel-bin/mace/python/tools/tf_converter --input quantized_test.pb --output quantized_test_dsp.pb --runtime dsp --input_dim input_node,1,28,28,3
+FLAGS = None
+def main(unused_args):
+  """Convert the TensorFlow GraphDef at FLAGS.input into a MACE model.
+  The DSP converter is used when FLAGS.runtime is 'dsp', the generic
+  converter otherwise. The result is emitted either as generated source
+  (FLAGS.output_type == 'source') or as a serialized protobuf at FLAGS.output
+  together with a text dump at FLAGS.output + '_txt'.
+  Args:
+    unused_args: ignored.
+  Returns:
+    -1 when the input file does not exist, None otherwise.
+  """
+  src_path = FLAGS.input
+  if not gfile.Exists(src_path):
+    print("Input graph file '" + src_path + "' does not exist!")
+    return -1
+  tf_graph_def = tf.GraphDef()
+  with gfile.Open(src_path, "rb") as graph_file:
+    tf_graph_def.ParseFromString(graph_file.read())
+  use_dsp = FLAGS.runtime == 'dsp'
+  if use_dsp:
+    mace_net_def = tf_dsp_converter_lib.convert_to_mace_pb(
+      tf_graph_def, FLAGS.input_node, FLAGS.output_node, FLAGS.prequantize)
+  else:
+    mace_net_def = tf_converter_lib.convert_to_mace_pb(
+      tf_graph_def, FLAGS.input_node, FLAGS.output_node, FLAGS.data_type,
+      FLAGS.runtime)
+  if FLAGS.output_type != 'source':
+    dst_path = FLAGS.output
+    # Binary protobuf first, then a human-readable dump next to it.
+    with gfile.GFile(dst_path, "wb") as binary_out:
+      binary_out.write(mace_net_def.SerializeToString())
+    with gfile.GFile(dst_path + '_txt', "wb") as text_out:
+      text_out.write(str(mace_net_def))
+  else:
+    source_converter_lib.convert_to_source(
+      mace_net_def, FLAGS.template, FLAGS.obfuscate, FLAGS.model_tag,
+      FLAGS.output, FLAGS.runtime)
+  print("Model conversion is completed.")
+def str2bool(v):
+  """Parse a command line string into a bool (case-insensitive).
+  Accepts yes/true/t/y/1 as True and no/false/f/n/0 as False.
+  Raises:
+    argparse.ArgumentTypeError: for any other text.
+  """
+  text = v.lower()
+  truthy = ('yes', 'true', 't', 'y', '1')
+  falsy = ('no', 'false', 'f', 'n', '0')
+  if text in truthy:
+    return True
+  if text in falsy:
+    return False
+  raise argparse.ArgumentTypeError('Boolean value expected.')
+def parse_args():
+  """Parses command line arguments.
+  Returns:
+    The (namespace, remaining_argv) pair produced by
+    ArgumentParser.parse_known_args(); flags the parser does not know end up
+    in the second element instead of raising an error.
+  """
+  parser = argparse.ArgumentParser()
+  # Registers a string-keyed "bool" type. No argument below refers to it; it
+  # is kept so the parser set-up stays as it was.
+  parser.register("type", "bool", lambda v: v.lower() == "true")
+  parser.add_argument(
+    "--input",
+    type=str,
+    default="",
+    help="TensorFlow \'GraphDef\' file to load.")
+  parser.add_argument(
+    "--output",
+    type=str,
+    default="",
+    help="File to save the output graph to.")
+  parser.add_argument(
+    "--runtime",
+    type=str,
+    default="cpu",
+    help="Runtime: cpu/gpu/dsp")
+  parser.add_argument(
+    "--input_node",
+    type=str,
+    default="input_node",
+    help="e.g., input_node")
+  parser.add_argument(
+    "--output_node",
+    type=str,
+    default="softmax",
+    help="e.g., softmax")
+  parser.add_argument(
+    "--prequantize",
+    # type=bool would turn every non-empty string, including "False", into
+    # True; parse the text explicitly like --obfuscate does.
+    type=str2bool,
+    default=True,
+    help="e.g., True")
+  parser.add_argument(
+    "--data_type",
+    type=str,
+    default='DT_FLOAT',
+    help="e.g., DT_HALF/DT_FLOAT")
+  parser.add_argument(
+    "--output_type",
+    type=str,
+    default="pb",
+    help="output type: source/pb")
+  parser.add_argument(
+    "--template",
+    type=str,
+    default="",
+    help="template path")
+  # NOTE(review): with const=False a bare `--obfuscate` (no value) yields
+  # False, same as omitting the flag -- confirm this is intended.
+  parser.add_argument(
+    "--obfuscate",
+    type=str2bool,
+    nargs='?',
+    const=False,
+    default=False,
+    help="obfuscate model names")
+  parser.add_argument(
+    "--model_tag",
+    type=str,
+    default="",
+    help="model tag for generated function and namespace")
+  return parser.parse_known_args()
+if __name__ == '__main__':
+  FLAGS, unparsed = parse_args()
+  # Hand main()'s status to the shell: it returns -1 when the input graph is
+  # missing, and sys.exit(None) exits with status 0 on success. Previously
+  # the return value was dropped and the tool always exited with 0.
+  sys.exit(main(unused_args=[sys.argv[0]] + unparsed))
--- a/python/tools/tf_converter_lib.py
+++ b/python/tools/tf_converter_lib.py
--- a/python/tools/tf_dsp_converter_lib.py
+++ b/python/tools/tf_dsp_converter_lib.py
+from lib.proto import mace_pb2
+import tensorflow as tf
+from operator import mul
+from dsp_ops import DspOps
+from lib.python.tools import graph_util
+from lib.python.tools.convert_util import tf_dtype_2_mace_dtype
+# converter --input ../libcv/quantized_icnet.pb --output quantized_icnet_dsp.pb \
+# --runtime dsp --input_node input_node --output_node output_node
+# Integer codes written to OperatorDef.padding, keyed by padding-mode name.
+# convert_ops() uses 'NA' as the default and looks up the value of a TF op's
+# 'padding' attribute here when the op has one.
+padding_mode = {
+  'NA': 0,
+  'SAME': 1,
+  'VALID': 2,
+  'MIRROR_REFLECT': 3,
+  'MIRROR_SYMMETRIC': 4,
+  'SAME_CAFFE': 5
+}
+def get_tensor_name_from_op(op_name, port):
+  """Return the tensor name '<op_name>:<port>' for an op's output port."""
+  port_text = str(port)
+  return op_name + ':' + port_text
+def get_node_from_map(op_map, op_or_tensor_name):
+  """Look up an op in op_map by op name or by '<op>:<port>' tensor name.
+  Everything from the first ':' onwards is ignored. Raises KeyError when the
+  op name is not a key of op_map.
+  """
+  op_name, _, _ = op_or_tensor_name.partition(':')
+  return op_map[op_name]
+def get_op_and_port_from_tensor(tensor_name):
+  """Split '<op>:<port>' into (op name, integer port).
+  Raises ValueError when the name does not consist of exactly two
+  ':'-separated parts or the port is not an integer.
+  """
+  op_name, port_text = tensor_name.split(':')
+  return op_name, int(port_text)
+def max_elem_size(tensor):
+  """Return the byte size of a tensor: product of its dims times dtype size.
+  An empty shape list (scalar) yields just tensor.dtype.size.
+  The product is accumulated in a plain loop because the builtin `reduce`
+  used before only exists on Python 2.
+  Args:
+    tensor: object exposing shape.as_list() and dtype.size.
+  Returns:
+    The size in bytes.
+  """
+  byte_size = tensor.dtype.size
+  for dim in tensor.shape.as_list():
+    byte_size *= dim
+  return byte_size
+def find_dtype(tensor_dtype):
+  """Map a TensorFlow dtype to the corresponding mace_pb2 data type.
+  float32 -> DT_FLOAT, uint8/quint8 -> DT_UINT8, int32/qint32 -> DT_INT32.
+  Raises:
+    Exception: for every other dtype.
+  """
+  dtype_table = (
+    ((tf.float32,), mace_pb2.DT_FLOAT),
+    ((tf.uint8, tf.quint8), mace_pb2.DT_UINT8),
+    ((tf.int32, tf.qint32), mace_pb2.DT_INT32),
+  )
+  for tf_dtypes, mace_dtype in dtype_table:
+    for tf_dtype in tf_dtypes:
+      if tensor_dtype == tf_dtype:
+        return mace_dtype
+  raise Exception('Unsupported data type: ', tensor_dtype)
+def has_padding_and_strides(op):
+  """Tell whether op's node_def carries both 'padding' and 'strides' attrs."""
+  attrs = op.node_def.attr
+  return all(key in attrs for key in ('padding', 'strides'))
+def is_node_flatten_reshape(op):
+  """Tell whether op is a Reshape whose first output has exactly one dim."""
+  if op.type != 'Reshape':
+    return False
+  first_output = op.outputs[0]
+  return len(first_output.shape) == 1
+def get_input_tensor(op, index):
+  """Return op.inputs[index], looking through any producing Reshape ops.
+  While the tensor is produced by a 'Reshape' op, it is replaced by that
+  op's first input, so the result is never the output of a Reshape.
+  """
+  tensor = op.inputs[index]
+  while tensor.op.type == 'Reshape':
+    tensor = tensor.op.inputs[0]
+  return tensor
+def add_shape_const_node(net_def, op, values, name):
+  """Add an INT32 tensor named '<op.name>/<name>:0' whose dims are `values`.
+  Only name, data type and dims are set; no tensor data is written.
+  Returns:
+    The name of the new tensor.
+  """
+  node_name = op.name + '/' + name
+  print ('Add const node: ', node_name)
+  const_tensor = net_def.tensors.add()
+  const_tensor.data_type = mace_pb2.DT_INT32
+  const_tensor.name = node_name + ':0'
+  const_tensor.dims.extend(values)
+  return const_tensor.name
+def convert_op_outputs(mace_op_def, tf_op):
+  """Append the type and shape of every output of tf_op to mace_op_def.
+  output_type receives the converted dtype of each output and output_shape
+  receives one OutputShape per output, in output order.
+  """
+  tf_outputs = tf_op.outputs
+  converted_types = [tf_dtype_2_mace_dtype(out.dtype) for out in tf_outputs]
+  mace_op_def.output_type.extend(converted_types)
+  converted_shapes = [mace_pb2.OutputShape(dims=out.shape.as_list())
+                      for out in tf_outputs]
+  mace_op_def.output_shape.extend(converted_shapes)
+def convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops):
+  """Convert the first op of unresolved_ops into net_def and drop it.
+  Exactly one op (unresolved_ops[0]) is handled per call; callers loop until
+  the list is empty.
+  Args:
+    unresolved_ops: list of TF ops still to convert; its head is removed.
+    resolved_ops: set of op names already emitted; names in it are skipped.
+    net_def: NetDef receiving new tensors (for Const) or ops.
+    output_node: not used in this function.
+    dsp_ops: object providing map_nn_op() and has_op().
+  Raises:
+    Exception: when the op matches none of the supported patterns.
+  """
+  first_op = unresolved_ops[0]
+  # NOTE(review): outputs[0] is read unconditionally here although a later
+  # branch checks len(first_op.outputs) > 0 -- confirm every op has an output.
+  print ('Op: ', first_op.name, first_op.type, first_op.outputs[0].shape)
+  if first_op.name in resolved_ops:
+    # Already emitted as part of a fused pattern below.
+    pass
+  elif first_op.type == 'Const':
+    # Constants become tensors: name, dtype, dims and (where handled) data.
+    print ('Add const node: ', first_op.name)
+    # eval() relies on a default session being active in the caller.
+    tf_tensor = first_op.outputs[0].eval()
+    tensor = net_def.tensors.add()
+    tensor.name = first_op.outputs[0].name
+    tensor.data_type = find_dtype(first_op.outputs[0].dtype)
+    shape = list(tf_tensor.shape)
+    if len(shape) > 0:
+      tensor.dims.extend(shape)
+    if first_op.outputs[0].dtype == tf.float32:
+      tensor.float_data.extend(tf_tensor.astype(float).flat)
+    # NOTE(review): find_dtype() above raises for int8/int16/quint16, so those
+    # comparisons look unreachable, while tf.uint8 and tf.qint32 pass
+    # find_dtype() but match neither data branch (no data is copied) -- verify.
+    elif first_op.outputs[0].dtype == tf.int32 or \
+            first_op.outputs[0].dtype == tf.int8 or \
+            first_op.outputs[0].dtype == tf.int16 or \
+            first_op.outputs[0].dtype == tf.quint8 or \
+            first_op.outputs[0].dtype == tf.quint16:
+      tensor.int32_data.extend(tf_tensor.astype(int).flat)
+  else:
+    # The op entry is added before the pattern checks, so it already exists
+    # in net_def if the final else branch raises.
+    op_def = net_def.op.add()
+    op_def.name = first_op.name
+    op_def.type = dsp_ops.map_nn_op(first_op.type)
+    op_def.padding = padding_mode['NA']
+    # Pattern 1: Dequantize whose first consumer is SpaceToBatchND or
+    # BatchToSpaceND. The chain Dequantize -> S2B/B2S -> reshape -> min/max
+    # -> quantize is collapsed into one 'Quantized<S2B/B2S>' op that takes the
+    # name of the trailing quantize op.
+    if len(first_op.outputs) > 0 and first_op.type == 'Dequantize' \
+        and len(first_op.outputs[0].consumers()) > 0 \
+        and (first_op.outputs[0].consumers()[0].type == 'SpaceToBatchND' \
+        or first_op.outputs[0].consumers()[0].type == 'BatchToSpaceND'):
+      input_tensor = first_op.inputs[0]
+      min_tensor = first_op.inputs[1]
+      max_tensor = first_op.inputs[2]
+      # The chain is walked positionally (first consumer, then consumers 0
+      # and 1); the op types of reshape/min/max/quantize are not checked.
+      s2b_op = first_op.outputs[0].consumers()[0]
+      reshape_op = s2b_op.outputs[0].consumers()[0]
+      min_op = reshape_op.outputs[0].consumers()[0]
+      max_op = reshape_op.outputs[0].consumers()[1]
+      quantize_op = min_op.outputs[0].consumers()[0]
+      # Mark the absorbed ops so later calls skip them.
+      resolved_ops.add(s2b_op.name)
+      resolved_ops.add(reshape_op.name)
+      resolved_ops.add(min_op.name)
+      resolved_ops.add(max_op.name)
+      resolved_ops.add(quantize_op.name)
+      op_def.name = quantize_op.name
+      op_def.type = dsp_ops.map_nn_op('Quantized' + s2b_op.type)
+      # Inputs: quantized data, the S2B/B2S inputs after the first, then the
+      # Dequantize min and max inputs.
+      op_def.input.append(input_tensor.name)
+      op_def.input.extend([t.name for t in s2b_op.inputs[1:]])
+      op_def.input.extend([min_tensor.name, max_tensor.name])
+      op_def.out_max_byte_size.extend([max_elem_size(out) for out in quantize_op.outputs])
+      convert_op_outputs(op_def, quantize_op)
+    # Pattern 2: ops with 'padding' and 'strides' attrs. ksize (if present)
+    # and strides are appended as extra inputs via shape-only const tensors.
+    elif has_padding_and_strides(first_op):
+      op_def.padding = padding_mode[first_op.get_attr('padding')]
+      op_def.input.extend([t.name for t in first_op.inputs])
+      if 'ksize' in first_op.node_def.attr:
+        ksize = first_op.get_attr('ksize')
+        ksize_tensor = add_shape_const_node(net_def, first_op, ksize, 'ksize')
+        op_def.input.extend([ksize_tensor])
+      strides = first_op.get_attr('strides')
+      strides_tensor = add_shape_const_node(net_def, first_op, strides, 'strides')
+      op_def.input.extend([strides_tensor])
+      op_def.out_max_byte_size.extend([max_elem_size(out) for out in first_op.outputs])
+      convert_op_outputs(op_def, first_op)
+    # Pattern 3: a Reshape to a single dimension is emitted as 'Flatten'.
+    elif is_node_flatten_reshape(first_op):
+      op_def.type = 'Flatten'
+      op_def.input.extend([t.name for t in first_op.inputs])
+      op_def.out_max_byte_size.extend([max_elem_size(out) for out in first_op.outputs])
+      convert_op_outputs(op_def, first_op)
+    # Pattern 4: any other op type that dsp_ops knows is copied one-to-one.
+    elif dsp_ops.has_op(first_op.type):
+      op_def.input.extend([t.name for t in first_op.inputs])
+      op_def.out_max_byte_size.extend([max_elem_size(out) for out in first_op.outputs])
+      convert_op_outputs(op_def, first_op)
+    else:
+      raise Exception('Unsupported op: ', first_op)
+    resolved_ops.add(first_op.name)
+  # Always consume the head so the caller's while-loop terminates.
+  del unresolved_ops[0]
+def add_output_node(net_def, output_node):
+  """Append an 'OUTPUT' op named '__output__' fed by port 0 of output_node."""
+  output_op = net_def.op.add()
+  output_op.name = '__output__'
+  output_op.type = 'OUTPUT'
+  source_tensor = get_tensor_name_from_op(output_node, 0)
+  output_op.input.extend([source_tensor])
+def reverse_batch_to_space_and_biasadd(net_def):
+  """Return a NetDef where BiasAdd is moved in front of BatchToSpace.
+  Looks for the chain (the type of the first requantize op is not checked)
+    QuantizedConv2d_8x8to32 -> <requantize> -> QuantizedBatchToSpaceND_8
+      -> QuantizedBiasAdd_8p8to32 -> Requantize_32to8
+  and re-wires it to
+    conv -> <requantize> -> BiasAdd -> Requantize_32to8 -> BatchToSpace,
+  pointing the former consumers of the final requantize at BatchToSpace.
+  Args:
+    net_def: NetDef to read; it is not modified.
+  Returns:
+    A new NetDef with the tensors of net_def, the untouched ops in their
+    original order, and the rewritten ops appended after them.
+  """
+  tensor_map = {tensor.name: tensor for tensor in net_def.tensors}
+  op_map = {op.name: op for op in net_def.op}
+  # Tensor name -> ops that read it.
+  consumers = {}
+  for op in net_def.op:
+    for ipt in op.input:
+      consumers.setdefault(ipt, []).append(op)
+  new_ops = []
+  skip_ops = set()
+  visited_ops = set()
+  for op in net_def.op:
+    # Ops already rewritten as a follow-up of an earlier match are skipped
+    # (the original `pass` here made this check a no-op).
+    if op.name in visited_ops:
+      continue
+    visited_ops.add(op.name)
+    # pattern: QConv -> RR -> R -> QB2S -> QBiasAdd -> RR -> R
+    if op.type != 'Requantize_32to8':
+      continue
+    biasadd_requantize_op = op
+    biasadd_op = get_node_from_map(op_map, biasadd_requantize_op.input[0])
+    if biasadd_op.type != 'QuantizedBiasAdd_8p8to32':
+      continue
+    b2s_op = get_node_from_map(op_map, biasadd_op.input[0])
+    if b2s_op.type != 'QuantizedBatchToSpaceND_8':
+      continue
+    conv_requantize_op = get_node_from_map(op_map, b2s_op.input[0])
+    conv_op = get_node_from_map(op_map, conv_requantize_op.input[0])
+    if conv_op.type != 'QuantizedConv2d_8x8to32':
+      continue
+    # BiasAdd now reads data/min/max straight from the conv requantize op.
+    new_biasadd_op = mace_pb2.OperatorDef()
+    new_biasadd_op.CopyFrom(biasadd_op)
+    new_biasadd_op.input[0] = get_tensor_name_from_op(conv_requantize_op.name, 0)
+    new_biasadd_op.input[2] = get_tensor_name_from_op(conv_requantize_op.name, 1)
+    new_biasadd_op.input[3] = get_tensor_name_from_op(conv_requantize_op.name, 2)
+    new_biasadd_op.out_max_byte_size[0] = conv_requantize_op.out_max_byte_size[0] * 4
+    new_biasadd_requantize_op = mace_pb2.OperatorDef()
+    new_biasadd_requantize_op.CopyFrom(biasadd_requantize_op)
+    # `//` keeps the value an int on Python 3 as well (same result as `/`
+    # on Python 2 ints).
+    new_biasadd_requantize_op.out_max_byte_size[0] = new_biasadd_op.out_max_byte_size[0] // 4
+    # BatchToSpace now reads data/min/max from the BiasAdd requantize op.
+    new_b2s_op = mace_pb2.OperatorDef()
+    new_b2s_op.CopyFrom(b2s_op)
+    new_b2s_op.input[0] = get_tensor_name_from_op(biasadd_requantize_op.name, 0)
+    new_b2s_op.input[3] = get_tensor_name_from_op(biasadd_requantize_op.name, 1)
+    new_b2s_op.input[4] = get_tensor_name_from_op(biasadd_requantize_op.name, 2)
+    new_ops.extend([new_biasadd_op, new_biasadd_requantize_op, new_b2s_op])
+    skip_ops.update([biasadd_op.name, biasadd_requantize_op.name, b2s_op.name])
+    # Former readers of the requantize outputs must read BatchToSpace instead.
+    # A requantize output without readers simply has nothing to re-wire.
+    old_outputs = [get_tensor_name_from_op(biasadd_requantize_op.name, k)
+                   for k in range(3)]
+    for follow_op in consumers.get(old_outputs[0], []):
+      new_follow_op = mace_pb2.OperatorDef()
+      new_follow_op.CopyFrom(follow_op)
+      for i in range(len(follow_op.input)):
+        for k in range(3):
+          if new_follow_op.input[i] == old_outputs[k]:
+            new_follow_op.input[i] = get_tensor_name_from_op(b2s_op.name, k)
+      new_ops.append(new_follow_op)
+      skip_ops.add(follow_op.name)
+      visited_ops.add(follow_op.name)
+  new_net_def = mace_pb2.NetDef()
+  new_net_def.tensors.extend(tensor_map.values())
+  new_net_def.op.extend([op for op in net_def.op if op.name not in skip_ops])
+  new_net_def.op.extend(new_ops)
+  return new_net_def
+def add_node_id(net_def):
+  """Number all tensors and ops of net_def and link op inputs by node id.
+  Tensors get ids 0..T-1 in order, ops continue from T. For every op input
+  '<producer>:<port>' a node_input entry with the producer's id and the port
+  is appended, so a producer must appear before the ops that read it.
+  Args:
+    net_def: NetDef, modified in place.
+  Returns:
+    The same net_def object.
+  """
+  ids_by_name = {}
+  for tensor_id, tensor in enumerate(net_def.tensors):
+    tensor.node_id = tensor_id
+    producer_name, _ = get_op_and_port_from_tensor(tensor.name)
+    ids_by_name[producer_name] = tensor.node_id
+  first_op_id = len(net_def.tensors)
+  for op_id, op in enumerate(net_def.op, first_op_id):
+    op.node_id = op_id
+    ids_by_name[op.name] = op.node_id
+    for input_name in op.input:
+      producer_name, port = get_op_and_port_from_tensor(input_name)
+      producer_id = ids_by_name[producer_name]
+      link = op.node_input.add()
+      link.node_id = producer_id
+      link.output_port = int(port)
+  return net_def
+def add_input_output_info(net_def, input_node, output_node, graph, dtype):
+  """Record shape and data type of the model input and output in net_def.
+  The shapes are taken from port 0 of input_node / output_node in graph.
+  When dtype is DT_UINT8, two additional [1,1,1,1] DT_FLOAT entries are
+  appended after each of them (presumably the quantization min and max --
+  TODO confirm).
+  Args:
+    net_def: NetDef, modified in place.
+    input_node: name of the input op.
+    output_node: name of the output op.
+    graph: TF graph used to look the tensors up.
+    dtype: mace_pb2 data type recorded for the input and the output.
+  Returns:
+    The same net_def object.
+  """
+  # Both tensors are resolved first so a bad node name fails before net_def
+  # is touched.
+  input_tensor = graph.get_tensor_by_name(get_tensor_name_from_op(input_node, 0))
+  output_tensor = graph.get_tensor_by_name(get_tensor_name_from_op(output_node, 0))
+  _append_io_info(net_def.input_info, input_tensor, dtype)
+  _append_io_info(net_def.output_info, output_tensor, dtype)
+  return net_def
+def _append_io_info(info_list, tensor, dtype):
+  """Append the entry for tensor, plus two float scalars for DT_UINT8."""
+  info = info_list.add()
+  info.dims.extend(tensor.shape.as_list())
+  info.data_type = dtype
+  if dtype == mace_pb2.DT_UINT8:
+    # range() instead of the Python 2 only xrange().
+    for _ in range(2):
+      extra_info = info_list.add()
+      extra_info.dims.extend([1, 1, 1, 1])
+      extra_info.data_type = mace_pb2.DT_FLOAT
+def strip_input_quantize_and_output_dequantize(net_def, input_node, output_node):
+  """Return a NetDef without the input Quantize and output Dequantize stages.
+  For an INPUT op that feeds a Quantize op, the INPUT/Flatten/min/max/Quantize
+  group is replaced by a single INPUT op with three outputs, and the readers
+  of the Quantize op are re-wired to it. For an OUTPUT op fed by a Dequantize
+  op, the OUTPUT op takes over the Dequantize inputs.
+  Args:
+    net_def: NetDef to read; it is not modified.
+    input_node: not used in this function.
+    output_node: not used in this function.
+  Returns:
+    A new NetDef with the remaining tensors, the untouched ops in their
+    original order, and the replacement ops appended after them.
+  Raises:
+    Exception: when an INPUT op feeds a Quantize op but no Flatten op.
+  """
+  op_map = {op.name: op for op in net_def.op}
+  # Tensor name -> ops that read it.
+  consumers = {}
+  for op in net_def.op:
+    for ipt in op.input:
+      consumers.setdefault(ipt, []).append(op)
+  skip_ops = set()
+  new_ops = []
+  skip_tensors = set()
+  # INPUT->Flatten->Minf, Maxf->Quantize
+  for op in net_def.op:
+    if op.type == 'INPUT':
+      input_op = op
+      flatten_op = None
+      quantize_op = None
+      for o in consumers[get_tensor_name_from_op(input_op.name, 0)]:
+        if o.type == 'Flatten':
+          flatten_op = o
+        elif o.type == 'Quantize':
+          quantize_op = o
+      if quantize_op is not None:
+        if flatten_op is None:
+          # Previously this surfaced as an AttributeError on None.
+          raise Exception('No Flatten op found after input op: ', input_op.name)
+        # Relies on the Flatten output having exactly two readers, min first.
+        minf_op, maxf_op = consumers[get_tensor_name_from_op(flatten_op.name, 0)]
+        skip_ops.update([input_op.name, flatten_op.name, minf_op.name,
+                         maxf_op.name, quantize_op.name])
+        # The second input of each dropped op is no longer referenced.
+        skip_tensors.update([flatten_op.input[1], minf_op.input[1],
+                             maxf_op.input[1]])
+        new_input_op = mace_pb2.OperatorDef()
+        new_input_op.name = input_op.name
+        new_input_op.type = input_op.type
+        new_input_op.padding = input_op.padding
+        # `//` keeps the size an int on Python 3 as well (same result as `/`
+        # on Python 2 ints).
+        new_input_op.out_max_byte_size.extend(
+          [input_op.out_max_byte_size[0] // 4, 4, 4])
+        new_input_op.output_shape.extend([input_op.output_shape[0],
+                                          minf_op.output_shape[0],
+                                          maxf_op.output_shape[0]])
+        new_input_op.output_type.extend(
+          [input_op.output_type[0], mace_pb2.DT_FLOAT, mace_pb2.DT_FLOAT])
+        new_ops.append(new_input_op)
+        # Readers of the Quantize outputs now read the same port of INPUT.
+        for follow_op in consumers[get_tensor_name_from_op(quantize_op.name, 0)]:
+          new_follow_op = mace_pb2.OperatorDef()
+          new_follow_op.CopyFrom(follow_op)
+          for i in range(len(follow_op.input)):
+            for k in range(3):
+              if new_follow_op.input[i] == get_tensor_name_from_op(quantize_op.name, k):
+                new_follow_op.input[i] = get_tensor_name_from_op(input_op.name, k)
+          new_ops.append(new_follow_op)
+          skip_ops.add(follow_op.name)
+    elif op.type == 'OUTPUT':
+      output_op = op
+      dequantize_op = get_node_from_map(op_map, output_op.input[0])
+      if dequantize_op.type == 'Dequantize':
+        skip_ops.update([dequantize_op.name, output_op.name])
+        new_output_op = mace_pb2.OperatorDef()
+        new_output_op.name = output_op.name
+        new_output_op.type = output_op.type
+        new_output_op.input.extend(dequantize_op.input)
+        new_ops.append(new_output_op)
+  new_net_def = mace_pb2.NetDef()
+  new_net_def.tensors.extend([tensor for tensor in net_def.tensors if tensor.name not in skip_tensors])
+  new_net_def.op.extend([op for op in net_def.op if op.name not in skip_ops])
+  new_net_def.op.extend(new_ops)
+  return new_net_def
+def convert_to_mace_pb(input_graph_def, input_node, output_node, prequantize=False):
+  """Convert a TensorFlow GraphDef into a MACE NetDef for the DSP runtime.
+  nnlib does not have batch norm, so the tensorflow optimizer is used to
+  fold batch norm with convolution. That fold optimization reorders ops,
+  which is why the graph is sorted by topology before conversion.
+  Args:
+    input_graph_def: TF GraphDef to convert.
+    input_node: name of the input op.
+    output_node: name of the output op.
+    prequantize: when true, the input Quantize / output Dequantize stages
+      are stripped and DT_UINT8 is recorded as the input/output data type
+      instead of DT_FLOAT.
+  Returns:
+    The resulting NetDef.
+  """
+  sorted_graph_def = graph_util.sort_tf_graph(input_graph_def)
+  net_def = mace_pb2.NetDef()
+  with tf.Session() as session:
+    with session.graph.as_default() as graph:
+      tf.import_graph_def(sorted_graph_def, name="")
+      all_ops = graph.get_operations()
+      dsp_ops = DspOps()
+      resolved_ops = set()
+      # Const nodes are converted first, every other op afterwards.
+      for want_const in (True, False):
+        pending = [op for op in all_ops if (op.type == 'Const') == want_const]
+        while pending:
+          convert_ops(pending, resolved_ops, net_def, output_node, dsp_ops)
+      add_output_node(net_def, output_node)
+      if prequantize:
+        print('Prequantize ...')
+        net_def = strip_input_quantize_and_output_dequantize(
+          net_def, input_node, output_node)
+        io_dtype = mace_pb2.DT_UINT8
+      else:
+        io_dtype = mace_pb2.DT_FLOAT
+      ordered_net_def = graph_util.sort_mace_graph(net_def, '__output__')
+      numbered_net_def = add_node_id(ordered_net_def)
+      result = add_input_output_info(
+        numbered_net_def, input_node, output_node, graph, io_dtype)
+  return result