Commit a022085e authored by yejianwu

move tools to libmace

Parent 53db6a6c
......@@ -21,18 +21,6 @@ cc_library(
linkstatic = 1,
)
cc_library(
name = "generated_opencl_prod",
srcs = ["opencl/opencl_compiled_program.cc"],
linkstatic = 1,
)
cc_library(
name = "generated_tuning_params",
srcs = ["tuning/tuning_params.cc"],
linkstatic = 1,
)
cc_library(
name = "generated_version",
srcs = ["version/version.cc"],
......
......@@ -24,14 +24,3 @@ cc_test(
"//mace/core:test_benchmark_main",
],
)
cc_binary(
name = "mace_run",
srcs = ["mace_run.cc"],
linkopts = if_neon_enabled(["-fopenmp"]),
linkstatic = 1,
deps = [
"//mace/codegen:generated_models",
"//mace/utils:command_line_flags",
],
)
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
/**
* Usage:
* mace_run --model=mobi_mace.pb \
* --input=input_node \
* --output=MobilenetV1/Logits/conv2d/convolution \
* --input_shape=1,224,224,3 \
* --output_shape=1,224,224,2 \
* --input_file=input_data \
* --output_file=mace.out \
* --device=NEON
*/
#include <malloc.h>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <numeric>
#include "mace/utils/command_line_flags.h"
#include "mace/utils/env_time.h"
#include "mace/utils/logging.h"
#include "mace/core/public/mace.h"
using namespace std;
using namespace mace;
namespace mace {
namespace MACE_MODEL_TAG {
extern NetDef CreateNet();
}
}
void ParseShape(const string &str, vector<int64_t> *shape) {
string tmp = str;
while (!tmp.empty()) {
int dim = atoi(tmp.data());
shape->push_back(dim);
size_t next_offset = tmp.find(",");
if (next_offset == string::npos) {
break;
} else {
tmp = tmp.substr(next_offset + 1);
}
}
}
DeviceType ParseDeviceType(const string &device_str) {
if (device_str.compare("CPU") == 0) {
return DeviceType::CPU;
} else if (device_str.compare("NEON") == 0) {
return DeviceType::NEON;
} else if (device_str.compare("OPENCL") == 0) {
return DeviceType::OPENCL;
} else if (device_str.compare("HEXAGON") == 0) {
return DeviceType::HEXAGON;
} else {
return DeviceType::CPU;
}
}
struct mallinfo LogMallinfoChange(struct mallinfo prev) {
struct mallinfo curr = mallinfo();
if (prev.arena != curr.arena) {
LOG(INFO) << "Non-mmapped space allocated (bytes): " << curr.arena
<< ", diff: " << ((int64_t)curr.arena - (int64_t)prev.arena);
}
if (prev.ordblks != curr.ordblks) {
LOG(INFO) << "Number of free chunks: " << curr.ordblks
<< ", diff: " << ((int64_t)curr.ordblks - (int64_t)prev.ordblks);
}
if (prev.smblks != curr.smblks) {
LOG(INFO) << "Number of free fastbin blocks: " << curr.smblks
<< ", diff: " << ((int64_t)curr.smblks - (int64_t)prev.smblks);
}
if (prev.hblks != curr.hblks) {
LOG(INFO) << "Number of mmapped regions: " << curr.hblks
<< ", diff: " << ((int64_t)curr.hblks - (int64_t)prev.hblks);
}
if (prev.hblkhd != curr.hblkhd) {
LOG(INFO) << "Space allocated in mmapped regions (bytes): " << curr.hblkhd
<< ", diff: " << ((int64_t)curr.hblkhd - (int64_t)prev.hblkhd);
}
if (prev.usmblks != curr.usmblks) {
LOG(INFO) << "Maximum total allocated space (bytes): " << curr.usmblks
<< ", diff: " << ((int64_t)curr.usmblks - (int64_t)prev.usmblks);
}
if (prev.fsmblks != curr.fsmblks) {
LOG(INFO) << "Space in freed fastbin blocks (bytes): " << curr.fsmblks
<< ", diff: " << ((int64_t)curr.fsmblks - (int64_t)prev.fsmblks);
}
if (prev.uordblks != curr.uordblks) {
LOG(INFO) << "Total allocated space (bytes): " << curr.uordblks
<< ", diff: "
<< ((int64_t)curr.uordblks - (int64_t)prev.uordblks);
}
if (prev.fordblks != curr.fordblks) {
LOG(INFO) << "Total free space (bytes): " << curr.fordblks << ", diff: "
<< ((int64_t)curr.fordblks - (int64_t)prev.fordblks);
}
if (prev.keepcost != curr.keepcost) {
LOG(INFO) << "Top-most, releasable space (bytes): " << curr.keepcost
<< ", diff: "
<< ((int64_t)curr.keepcost - (int64_t)prev.keepcost);
}
return curr;
}
int main(int argc, char **argv) {
string input_shape;
string output_shape;
string input_file;
string output_file;
string device;
int round = 1;
int malloc_check_cycle = -1;
std::vector<Flag> flag_list = {
Flag("input_shape", &input_shape, "input shape, separated by comma"),
Flag("output_shape", &output_shape, "output shape, separated by comma"),
Flag("input_file", &input_file, "input file name"),
Flag("output_file", &output_file, "output file name"),
Flag("device", &device, "CPU/NEON/OPENCL/HEXAGON"),
Flag("round", &round, "round"),
Flag("malloc_check_cycle", &malloc_check_cycle,
"malloc debug check cycle, -1 to disable"),
};
string usage = Flags::Usage(argv[0], flag_list);
const bool parse_result = Flags::Parse(&argc, argv, flag_list);
if (!parse_result) {
LOG(ERROR) << usage;
return -1;
}
VLOG(0) << "mace version: " << MaceVersion() << std::endl
<< "mace git version: " << MaceGitVersion() << std::endl
<< "input_shape: " << input_shape << std::endl
<< "output_shape: " << output_shape << std::endl
<< "input_file: " << input_file << std::endl
<< "output_file: " << output_file << std::endl
<< "device: " << device << std::endl
<< "round: " << round << std::endl;
vector<int64_t> input_shape_vec;
vector<int64_t> output_shape_vec;
ParseShape(input_shape, &input_shape_vec);
ParseShape(output_shape, &output_shape_vec);
// load model
int64_t t0 = utils::NowMicros();
NetDef net_def = mace::MACE_MODEL_TAG::CreateNet();
int64_t t1 = utils::NowMicros();
LOG(INFO) << "CreateNetDef duration: " << t1 - t0 << " us";
int64_t init_micros = t1 - t0;
DeviceType device_type = ParseDeviceType(device);
VLOG(1) << "Device Type" << device_type;
int64_t input_size = std::accumulate(input_shape_vec.begin(),
input_shape_vec.end(), 1, std::multiplies<int64_t>());
int64_t output_size = std::accumulate(output_shape_vec.begin(),
output_shape_vec.end(), 1, std::multiplies<int64_t>());
std::unique_ptr<float[]> input_data(new float[input_size]);
std::unique_ptr<float[]> output_data(new float[output_size]);
// load input
ifstream in_file(input_file, ios::in | ios::binary);
if (in_file.is_open()) {
in_file.read(reinterpret_cast<char *>(input_data.get()),
input_size * sizeof(float));
in_file.close();
} else {
LOG(FATAL) << "Open input file failed";
}
// Init model
VLOG(0) << "Run init";
t0 = utils::NowMicros();
mace::MaceEngine engine(&net_def, device_type);
t1 = utils::NowMicros();
init_micros += t1 - t0;
LOG(INFO) << "Net init duration: " << t1 - t0 << " us";
LOG(INFO) << "Total init duration: " << init_micros << " us";
VLOG(0) << "Warm up";
t0 = utils::NowMicros();
engine.Run(input_data.get(), input_shape_vec, output_data.get());
t1 = utils::NowMicros();
LOG(INFO) << "1st warm up run duration: " << t1 - t0 << " us";
if (round > 0) {
VLOG(0) << "Run model";
t0 = utils::NowMicros();
struct mallinfo prev = mallinfo();
for (int i = 0; i < round; ++i) {
engine.Run(input_data.get(), input_shape_vec, output_data.get());
if (malloc_check_cycle >= 1 && i % malloc_check_cycle == 0) {
LOG(INFO) << "=== check malloc info change #" << i << " ===";
prev = LogMallinfoChange(prev);
}
}
t1 = utils::NowMicros();
LOG(INFO) << "Avg duration: " << (t1 - t0) / round << " us";
}
if (output_data != nullptr) {
ofstream out_file(output_file, ios::binary);
out_file.write((const char *) (output_data.get()),
output_size * sizeof(float));
out_file.flush();
out_file.close();
LOG(INFO) << "Write output file done.";
} else {
LOG(ERROR) << "output data is null";
}
}
syntax = "proto2";
package mace;
enum NetMode {
INIT = 0;
NORMAL = 1;
}
enum DeviceType {
CPU = 0; // CPU is used by default.
NEON = 1;
OPENCL = 2;
}
enum DataType {
DT_INVALID = 0;
// Data types that all computation devices are expected to be
// capable of supporting.
DT_FLOAT = 1;
DT_DOUBLE = 2;
DT_INT32 = 3;
DT_UINT8 = 4;
DT_INT16 = 5;
DT_INT8 = 6;
DT_STRING = 7;
DT_INT64 = 8;
DT_UINT16 = 9;
DT_BOOL = 10;
DT_HALF = 19;
DT_UINT32 = 22;
}
message TensorProto {
// The dimensions in the tensor.
repeated int64 dims = 1;
optional DataType data_type = 2 [default = DT_FLOAT];
// For float
repeated float float_data = 3 [packed = true];
// For int32, uint8, int8, uint16, int16, bool, and float16
// Note about float16: in storage we will basically convert float16 byte-wise
// to unsigned short and then store them in the int32_data field.
repeated int32 int32_data = 4 [packed = true];
// For bytes
optional bytes byte_data = 5;
// For strings
repeated bytes string_data = 6;
// For double
repeated double double_data = 9 [packed = true];
// For int64
repeated int64 int64_data = 10 [packed = true];
// Optionally, a name for the tensor.
optional string name = 7;
optional uint32 node_id = 100;
}
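As a minimal sketch of the float16 storage convention noted in TensorProto above, assuming numpy and the generated mace_pb2 bindings are available (the helper name pack_half_tensor is illustrative, not part of the codebase):

import numpy as np
from mace.proto import mace_pb2

def pack_half_tensor(values):
    # Reinterpret the float16 bytes as unsigned shorts and store them
    # in int32_data, per the TensorProto comment above.
    t = mace_pb2.TensorProto()
    t.data_type = mace_pb2.DT_HALF
    half = np.asarray(values, dtype=np.float16)
    t.dims.extend(half.shape)
    t.int32_data.extend(half.view(np.uint16).flatten().tolist())
    return t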
message Argument {
optional string name = 1;
optional float f = 2;
optional int64 i = 3;
optional bytes s = 4;
repeated float floats = 5;
repeated int64 ints = 6;
repeated bytes strings = 7;
}
// for hexagon mace-nnlib
message NodeInput {
optional int32 node_id = 1;
optional int32 output_port = 2;
}
message OutputShape {
repeated int64 dims = 1;
}
message OperatorDef {
repeated string input = 1;
repeated string output = 2;
optional string name = 3;
optional string type = 4;
repeated Argument arg = 5;
repeated OutputShape output_shape = 6;
repeated DataType output_type = 7;
// Memory optimization: only ops with a single output are supported
optional int32 mem_id = 10 [default = -1];
// for hexagon mace-nnlib
optional uint32 node_id = 100;
optional uint32 op_id = 101;
optional uint32 padding = 102;
repeated NodeInput node_input = 103;
repeated int32 out_max_byte_size = 104; // only support 32-bit len
}
// for memory optimization
message MemoryBlock {
optional int32 mem_id = 1;
optional uint32 x = 2;
optional uint32 y = 3;
}
message MemoryArena {
repeated MemoryBlock mem_block = 1;
}
// for hexagon mace-nnlib
message InputInfo {
optional string name = 1;
optional int32 node_id = 2;
repeated int32 dims = 3;
optional int32 max_byte_size = 4; // only support 32-bit len
optional DataType data_type = 5 [default = DT_FLOAT];
}
message OutputInfo {
optional string name = 1;
optional int32 node_id = 2;
repeated int32 dims = 3;
optional int32 max_byte_size = 4; // only support 32-bit len
optional DataType data_type = 5 [default = DT_FLOAT];
}
message NetDef {
optional string name = 1;
repeated OperatorDef op = 2;
optional string version = 3;
repeated Argument arg = 4;
repeated TensorProto tensors = 5;
// for mem optimization
optional MemoryArena mem_arena = 10;
// for hexagon mace-nnlib
repeated InputInfo input_info = 100;
repeated OutputInfo output_info = 101;
}
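For reference, a small sketch of building a NetDef through the generated Python bindings; the names and shapes are illustrative, not taken from a real model:

from mace.proto import mace_pb2

net = mace_pb2.NetDef()
net.name = 'demo_net'              # illustrative
op = net.op.add()
op.name = 'conv1'
op.type = 'Conv2D'
op.input.append('input_node:0')
op.output.append('conv1:0')
weight = net.tensors.add()
weight.name = 'conv1_w:0'
weight.data_type = mace_pb2.DT_FLOAT
weight.dims.extend([3, 3, 3, 8])   # illustrative filter shape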
py_library(
name = "tf_converter_lib",
srcs = [
"convert_util.py",
"graph_util.py",
"tf_converter_lib.py",
"tf_dsp_converter_lib.py",
],
srcs_version = "PY2AND3",
deps = [
":memory_optimizer",
"//mace/proto:mace_py",
],
)
py_library(
name = "source_converter_lib",
srcs = [
"source_converter_lib.py",
],
srcs_version = "PY2AND3",
deps = [
"//mace/proto:mace_py",
],
)
py_binary(
name = "tf_converter",
srcs = ["tf_converter.py"],
srcs_version = "PY2AND3",
deps = [
":tf_converter_lib",
":source_converter_lib",
"@six_archive//:six",
],
)
py_binary(
name = "memory_optimizer",
srcs = ["memory_optimizer.py"],
srcs_version = "PY2AND3",
deps = [
"//mace/proto:mace_py",
],
)
py_binary(
name = "caffe_ops_stats",
srcs = ["caffe_ops_stats.py"],
......
import argparse
import os
import sys
import struct
import jinja2
import numpy as np
# python mace/python/tools/binary_codegen.py \
# --binary_file=${BIN_FILE} --output_path=${CODE_GEN_PATH} --variable_name=kTuningParamsData
FLAGS = None
def generate_cpp_source():
data_map = {}
if not os.path.exists(FLAGS.binary_file):
env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0]))
return env.get_template('str2vec_maps.cc.tmpl').render(
maps=data_map,
data_type='unsigned int',
variable_name=FLAGS.variable_name
)
with open(FLAGS.binary_file, "rb") as binary_file:
binary_array = np.fromfile(binary_file, dtype=np.uint8)
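# Layout of the tuning binary, as parsed below: a uint64 entry count,
# then per entry an int32 key length, the key bytes, an int32 params
# byte size, and params_size / 4 int32 parameter values.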
idx = 0
size, = struct.unpack("Q", binary_array[idx:idx+8])
print size
idx += 8
for _ in xrange(size):
key_size, = struct.unpack("i", binary_array[idx:idx+4])
idx += 4
key, = struct.unpack(str(key_size) + "s", binary_array[idx:idx+key_size])
idx += key_size
params_size, = struct.unpack("i", binary_array[idx:idx+4])
idx += 4
data_map[key] = []
count = params_size / 4
params = struct.unpack(str(count) + "i", binary_array[idx:idx+params_size])
for i in params:
data_map[key].append(i)
idx += params_size
env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0]))
return env.get_template('str2vec_maps.cc.tmpl').render(
maps = data_map,
data_type = 'unsigned int',
variable_name = FLAGS.variable_name
)
def main(unused_args):
cpp_binary_source = generate_cpp_source()
if os.path.isfile(FLAGS.output_path):
os.remove(FLAGS.output_path)
w_file = open(FLAGS.output_path, "w")
w_file.write(cpp_binary_source)
w_file.close()
def parse_args():
"""Parses command line arguments."""
parser = argparse.ArgumentParser()
parser.add_argument(
"--binary_file",
type=str,
default="",
help="The binaries file path.")
parser.add_argument(
"--output_path",
type=str,
default="",
help="The path of generated C++ source file which contains the binary.")
parser.add_argument(
"--variable_name",
type=str,
default="kTuningParamsData",
help="global variable name.")
return parser.parse_known_args()
if __name__ == '__main__':
FLAGS, unparsed = parse_args()
main(unused_args=[sys.argv[0]] + unparsed)
import tensorflow as tf
from mace.proto import mace_pb2
TF_DTYPE_2_MACE_DTYPE_MAP = {
tf.float32: mace_pb2.DT_FLOAT,
tf.double: mace_pb2.DT_DOUBLE,
tf.half: mace_pb2.DT_HALF,
tf.int64: mace_pb2.DT_INT64,
tf.int32: mace_pb2.DT_INT32,
tf.qint32: mace_pb2.DT_INT32,
tf.int16: mace_pb2.DT_INT16,
tf.qint16: mace_pb2.DT_INT16,
tf.int8: mace_pb2.DT_INT8,
tf.qint8: mace_pb2.DT_INT8,
tf.quint16: mace_pb2.DT_UINT16,
tf.uint16: mace_pb2.DT_UINT16,
tf.quint8: mace_pb2.DT_UINT8,
tf.uint8: mace_pb2.DT_UINT8,
tf.string: mace_pb2.DT_STRING,
tf.bool: mace_pb2.DT_BOOL,
}
def tf_dtype_2_mace_dtype(tf_dtype):
mace_dtype = TF_DTYPE_2_MACE_DTYPE_MAP.get(tf_dtype, None)
if not mace_dtype:
raise Exception("Not supported tensorflow dtype: " + tf_dtype)
return mace_dtype
class DspOps(object):
def __init__(self):
self.dsp_ops = {
'INPUT': 'INPUT"',
'OUTPUT': 'OUTPUT',
'NoOp': 'Nop',
'FLATTEN': 'Flatten',
'Identity': 'Nop',
'Placeholder': 'INPUT',
'Const': 'Const',
'QuantizedConv2D': 'QuantizedConv2d_8x8to32',
'QuantizedMatMul': 'QuantizedMatMul_8x8to32',
'QuantizeDownAndShrinkRange': 'QuantizeDownAndShrinkRange_32to8',
'QuantizedRelu': 'QuantizedRelu_8',
'QuantizedReluX': 'QuantizedReluX_8',
'QuantizedMaxPool': 'QuantizedMaxPool_8',
'QuantizedAvgPool': 'QuantizedAvgPool_8',
'QuantizedConcat': 'QuantizedConcat_8',
'QuantizedBiasAdd': 'QuantizedBiasAdd_8p8to32',
'QuantizedResizeBilinear' : 'QuantizedResizeBilinear_8',
'QuantizedSpaceToBatchND': 'QuantizedSpaceToBatchND_8',
'QuantizedBatchToSpaceND': 'QuantizedBatchToSpaceND_8',
'Min': 'Min_f',
'Max': 'Max_f',
'QuantizeV2': 'Quantize',
'Dequantize': 'Dequantize',
'Softmax': 'Softmax_f',
'Reshape': 'Reshape',
'QuantizedReshape': 'QuantizedReshape',
'Sigmoid': 'Sigmoid_f',
'Slice': 'Slice_f',
'Add': 'Add_f',
'Mul': 'Mul_f',
'Requantize': 'Requantize_32to8',
'RequantizationRange': 'RequantizationRange_32',
'Sub': 'Sub_f',
'Pack': 'Pack_int32',
'StridedSlice': 'StridedSlice_f',
'ExpandDims': 'ExpandDims_f',
'QuantizedMul': 'QuantizedMul_8x8to32',
'QuantizedAdd': 'QuantizedAdd_8p8to32',
'Pad': 'Pad_f',
'SpaceToBatchND': 'SpaceToBatchND_f',
'BatchToSpaceND': 'BatchToSpaceND_f',
'ResizeBilinear': 'ResizeBilinear_f',
'ConcatV2': 'ConcatV2_f',
'Conv2DBackpropInput': 'Deconv_f',
'Tanh': 'Tanh_f',
'Split': 'Split_f',
'Transpose': 'Transpose_f',
'Concat': 'Concat_f',
'AddN': 'AddN_f',
}
def has_op(self, tf_op):
return tf_op in self.dsp_ops
def map_nn_op(self, tf_op):
if tf_op not in self.dsp_ops:
raise Exception('Could not map nn op for: ', tf_op)
return self.dsp_ops[tf_op]
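A short usage sketch for DspOps; the mapping shown in the comment comes from the table above:

ops = DspOps()
if ops.has_op('Softmax'):
    print(ops.map_nn_op('Softmax'))  # -> Softmax_f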
import tensorflow as tf
from mace.proto import mace_pb2
from collections import OrderedDict
def sort_tf_node(node, nodes_map, ordered_nodes_map):
if node.name not in ordered_nodes_map:
for input_tensor_name in node.input:
input_node_name = input_tensor_name.split(':')[
0] if ':' in input_tensor_name else input_tensor_name
if input_node_name not in nodes_map or input_node_name in ordered_nodes_map:
continue
input_node = nodes_map[input_node_name]
sort_tf_node(input_node, nodes_map, ordered_nodes_map)
ordered_nodes_map[node.name] = node
def sort_tf_graph(graph_def):
nodes_map = {}
ordered_nodes_map = OrderedDict()
for node in graph_def.node:
nodes_map[node.name] = node
for node in graph_def.node:
sort_tf_node(node, nodes_map, ordered_nodes_map)
sorted_graph = tf.GraphDef()
sorted_graph.node.extend([node for node in ordered_nodes_map.values()])
return sorted_graph
def sort_mace_node(node, nodes_map, ordered_nodes_map):
if node.name not in ordered_nodes_map:
for input_tensor_name in node.input:
input_node_name = input_tensor_name.split(':')[
0] if ':' in input_tensor_name else input_tensor_name
if input_node_name not in nodes_map or input_node_name in ordered_nodes_map:
continue
input_node = nodes_map[input_node_name]
sort_mace_node(input_node, nodes_map, ordered_nodes_map)
ordered_nodes_map[node.name] = node
def sort_mace_graph(graph_def, output_name):
nodes_map = {}
ordered_nodes_map = OrderedDict()
for node in graph_def.op:
nodes_map[node.name] = node
sort_mace_node(nodes_map[output_name], nodes_map, ordered_nodes_map)
sorted_graph = mace_pb2.NetDef()
sorted_graph.tensors.extend(graph_def.tensors)
sorted_graph.op.extend([node for node in ordered_nodes_map.values()])
return sorted_graph
\ No newline at end of file
import sys
import operator
from mace.proto import mace_pb2
class MemoryOptimizer(object):
def __init__(self, net_def):
self.net_def = net_def
self.idle_mem = set()
self.op_mem = {} # op_name->mem_id
self.mem_block = {} # mem_id->[x, y]
self.total_mem_count = 0
self.ref_counter = {}
consumers = {}
for op in net_def.op:
if self.is_buffer_image_op(op):
continue
for ipt in op.input:
if ipt not in consumers:
consumers[ipt] = []
consumers[ipt].append(op)
# only ref op's output tensor
for op in net_def.op:
if self.is_buffer_image_op(op):
continue
tensor_name = op.output[0]
if tensor_name in consumers:
self.ref_counter[tensor_name] = len(consumers[tensor_name])
else:
self.ref_counter[tensor_name] = 0
def is_buffer_image_op(self, op):
return op.type == 'BufferToImage' or op.type == 'ImageToBuffer'
def optimize(self):
for op in self.net_def.op:
if self.is_buffer_image_op(op):
continue
if len(self.idle_mem) == 0:
# allocate new mem
mem_id = self.total_mem_count
self.total_mem_count += 1
else:
# reuse mem
mem_id = self.idle_mem.pop()
if not op.output_shape:
print('WARNING: There is no output shape information to do memory optimization.')
return
op.mem_id = mem_id
self.op_mem[op.output[0]] = mem_id
if mem_id not in self.mem_block:
self.mem_block[mem_id] = [0, 0]
mem_size = self.mem_block[mem_id]
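# Blocks are sized as OpenCL images: y spans batch * height and x spans
# width * ceil(channels / 4), since four channels pack into one RGBA pixel.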
mem_size[1] = max(mem_size[1], op.output_shape[0].dims[0] * op.output_shape[0].dims[1])
mem_size[0] = max(mem_size[0], op.output_shape[0].dims[2] * int((op.output_shape[0].dims[3]+3)/4))
# de-ref input tensor mem
for ipt in op.input:
if ipt in self.ref_counter:
self.ref_counter[ipt] -= 1
if self.ref_counter[ipt] == 0:
self.idle_mem.add(self.op_mem[ipt])
elif self.ref_counter[ipt] < 0:
raise Exception('ref count is less than 0')
for mem in self.mem_block:
arena = self.net_def.mem_arena
block = arena.mem_block.add()
block.mem_id = mem
block.x = self.mem_block[mem][0]
block.y = self.mem_block[mem][1]
print('total op: %d' % len(self.net_def.op))
origin_mem_size = 0
optimized_mem_size = 0
for op in self.net_def.op:
if self.is_buffer_image_op(op):
continue
origin_mem_size += reduce(operator.mul, op.output_shape[0].dims, 1)
for mem in self.mem_block:
print mem, self.mem_block[mem]
optimized_mem_size += reduce(operator.mul, self.mem_block[mem], 4)
print('origin mem: %d, optimized mem: %d' % (origin_mem_size, optimized_mem_size))
def optimize_memory(net_def):
mem_optimizer = MemoryOptimizer(net_def)
mem_optimizer.optimize()
\ No newline at end of file
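A hedged usage sketch for the optimizer above; the model path is hypothetical:

from mace.proto import mace_pb2
from mace.python.tools import memory_optimizer

net_def = mace_pb2.NetDef()
with open('model.pb', 'rb') as f:   # hypothetical serialized NetDef
    net_def.ParseFromString(f.read())
memory_optimizer.optimize_memory(net_def)  # fills net_def.mem_arena in place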
//
// Copyright (c) 2017 XiaoMi All rights reserved.
// Generated by the mace converter. DO NOT EDIT!
//
{% if mode == 0 %}
#include <vector>
#include "mace/core/public/mace.h"
namespace mace {
namespace {{tag}} {
{% if tensor_info.data_type != 'DT_UINT8' %} alignas(4) {% endif %} unsigned char {{ tensor_info.name }}[] = {
{% for d in tensor_info.data %}{{"0x%02X, " % d }}{%endfor%}
};
void Create{{tensor.name}}(std::vector<mace::ConstTensor> &tensors) {
tensors.emplace_back(mace::ConstTensor(
{{ tensor.name|tojson }}, {{ tensor.name }},
{ {{ tensor.dims|join(', ') }} }, {{ tensor.data_type }}, {{ tensor.node_id }}));
}
} // namespace {{tag}}
} // namespace mace
{% elif mode == 1 %}
#include <vector>
#include <string>
#include "mace/core/public/mace.h"
namespace {
void UpdateOp(mace::OperatorDef &op,
const std::string &name,
const std::string &type,
const std::vector<std::string> &inputs,
const std::vector<std::string> &outputs,
const std::vector<mace::DataType> &output_types,
uint32_t node_id) {
op.set_name(name);
op.set_type(type);
op.set_input(inputs);
op.set_output(outputs);
op.set_output_type(output_types);
op.set_node_id(node_id);
}
}
namespace mace {
namespace {{tag}} {
{% for i in range(start, end) %}
void CreateOperator{{i}}(mace::OperatorDef &op) {
mace::Argument *arg = nullptr;
{% for arg in net.op[i].arg %}
arg = op.add_arg();
arg->set_name({{ arg.name|tojson }});
{%- if arg.HasField('f') %}
arg->set_f({{ arg.f }});
{%- endif %}
{%- if arg.HasField('i') %}
arg->set_i({{ arg.i }});
{%- endif %}
{%- if arg.HasField('s') %}
arg->set_s({{ arg.s|tojson }});
{%- endif %}
{% if arg.floats|length != 0 %}
arg->set_floats({ {{ arg.floats|join(', ') }} });
{% endif %}
{% if arg.ints|length != 0 %}
arg->set_ints({ {{ arg.ints|join(', ') }} });
{% endif %}
{% if arg.strings|length != 0 %}
arg->set_strings({ {{ arg.strings|stringfy() }} });
{% endif %}
{% endfor %}
{% if net.op[i].HasField('mem_id') %}
op.set_mem_id({{net.op[i].mem_id}});
{% endif %}
{% for shape in net.op[i].output_shape %}
{% if shape.dims | length > 0 %}
op.add_output_shape(mace::OutputShape({ {{ shape.dims|join(', ') }} }));
{% endif %}
{% endfor %}
std::vector<int> output_types_int({ {{ net.op[i].output_type | join(', ') }} });
std::vector<mace::DataType> output_types({{ net.op[i].output_type | length }});
for (int k = 0; k < {{ net.op[i].output_type | length }}; ++k) {
output_types[k] = static_cast<mace::DataType>(output_types_int[k]);
}
UpdateOp(op, {{ net.op[i].name|tojson }}, {{ net.op[i].type|tojson}},
{ {{ net.op[i].input|stringfy }} },
{ {{ net.op[i].output|stringfy }} },
output_types,
{{ net.op[i].node_id }});
{% if runtime == 'dsp' %}
op.set_padding({{ net.op[i].padding }});
{% if net.op[i].node_input | length > 0 %}
std::vector<int> input_node_ids({ {{ net.op[i].node_input | map(attribute='node_id') | join(', ') }} });
std::vector<int> input_output_ports({ {{ net.op[i].node_input | map(attribute='output_port') | join(', ')}} });
for (size_t i = 0; i < {{ net.op[i].node_input | length }}; ++i) {
mace::NodeInput input(input_node_ids[i], input_output_ports[i]);
op.add_node_input(input);
}
{% endif %}
{% if net.op[i].out_max_byte_size | length > 0 %}
std::vector<int> out_max_byte_sizes {{ net.op[i].out_max_byte_size | replace('[', '{') | replace(']', '}') }};
for (size_t i = 0; i < {{ net.op[i].out_max_byte_size | length }}; ++i) {
op.add_out_max_byte_size(out_max_byte_sizes[i]);
}
{% endif %}
{% endif %}
}
{% endfor %}
} // namespace {{tag}}
} // namespace mace
{% else %}
#include <vector>
#include <string>
#include "mace/core/public/mace.h"
namespace mace {
namespace {{tag}} {
{% for tensor in tensors %}
extern void Create{{ tensor.name }}(std::vector<mace::ConstTensor> &tensors);
{% endfor %}
{% for i in range(net.op|length) %}
extern void CreateOperator{{i}}(mace::OperatorDef &op);
{% endfor %}
} // namespace {{ tag }}
} // namespace mace
namespace {
{% if net.arg|length != 0 %}
void CreateNetArg(mace::NetDef &net_def) {
net_def.mutable_arg().reserve({{ net.arg|length }});
mace::Argument *arg = nullptr;
{% for arg in net.arg %}
arg = net_def.add_arg();
arg->set_name({{ arg.name|tojson }});
{%- if arg.HasField('f') %}
arg->set_f({{ arg.f }});
{% endif %}
{%- if arg.HasField('i') %}
arg->set_i({{ arg.i }});
{% endif %}
{%- if arg.HasField('s') %}
arg->set_s({{ arg.s|tojson }});
{% endif %}
{% if arg.floats|length != 0 %}
arg->set_floats({ {{ arg.floats|join(', ') }} });
{% endif %}
{% if arg.ints|length != 0 %}
arg->set_ints({ {{ arg.ints|join(', ') }} });
{% endif %}
{% if arg.strings|length != 0 %}
arg->set_strings({ {{ arg.strings|stringfy() }} });
{% endif %}
{% endfor %}
}
{% endif %}
{% if net.output_info | length > 0 %}
void CreateOutputInfo(mace::NetDef &net_def) {
std::vector<std::vector<int>> dims { {{net.output_info | map(attribute='dims') | join(', ') | replace('[', '{') | replace(']', '}') }} };
std::vector<int> data_types_int { {{ net.output_info | map(attribute='data_type') | join(', ') }} };
std::vector<mace::DataType> data_types({{ net.output_info | length }});
for (int k = 0; k < {{ net.output_info | length }}; ++k) {
data_types[k] = static_cast<mace::DataType>(data_types_int[k]);
}
net_def.mutable_output_info().resize({{ net.output_info | length }});
for (int i = 0; i < {{ net.output_info | length }}; ++i) {
net_def.mutable_output_info()[i].set_data_type(data_types[i]);
net_def.mutable_output_info()[i].set_dims(dims[i]);
}
}
{% endif %}
void CreateOperators(std::vector<mace::OperatorDef> &ops) {
ops.resize({{ net.op|length }});
{% for i in range(net.op|length) %}
mace::{{tag}}::CreateOperator{{i}}(ops[{{i}}]);
{% endfor %}
}
void CreateTensors(std::vector<mace::ConstTensor> &tensors) {
tensors.reserve({{ net.tensors|length }});
{% for tensor in net.tensors %}
mace::{{tag}}::Create{{tensor.name}}(tensors);
{% endfor %}
}
{% if net.mem_arena.mem_block|length != 0 %}
void CreateMemoryArena(mace::MemoryArena &mem_arena) {
std::vector<mace::MemoryBlock> &mem_block = mem_arena.mutable_mem_block();
mem_block.reserve({{ net.mem_arena.mem_block|length }});
{% for mem_blk in net.mem_arena.mem_block %}
mem_block.emplace_back(mace::MemoryBlock({{ mem_blk.mem_id }},
{{mem_blk.x}},
{{mem_blk.y}}));
{% endfor %}
}
{% endif %}
}
namespace mace {
namespace {{tag}} {
NetDef CreateNet() {
NetDef net_def;
net_def.set_name("{{ net.name}}");
net_def.set_version("{{ net.version }}");
{% if net.arg|length != 0 %}
CreateNetArg(net_def);
{% endif %}
CreateOperators(net_def.mutable_op());
CreateTensors(net_def.mutable_tensors());
{% if net.mem_arena.mem_block|length != 0 %}
CreateMemoryArena(net_def.mutable_mem_arena());
{% endif %}
{% if net.output_info | length > 0 %}
CreateOutputInfo(net_def);
{% endif %}
return net_def;
}
} // namespace {{tag}}
} // namespace mace
{% endif %}
import argparse
import os
import sys
import numpy as np
import jinja2
# python mace/python/tools/opencl_codegen.py \
# --cl_binary_dir=${CL_BIN_DIR} --output_path=${CL_HEADER_PATH}
FLAGS = None
def generate_cpp_source():
maps = {}
for file_name in os.listdir(FLAGS.cl_binary_dir):
file_path = os.path.join(FLAGS.cl_binary_dir, file_name)
if file_path[-4:] == ".bin":
# read binary
f = open(file_path, "rb")
binary_array = np.fromfile(f, dtype=np.uint8)
f.close()
maps[file_name[:-4]] = []
for ele in binary_array:
maps[file_name[:-4]].append(hex(ele))
env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0]))
return env.get_template('str2vec_maps.cc.tmpl').render(
maps = maps,
data_type = 'unsigned char',
variable_name = 'kCompiledProgramMap'
)
def main(unused_args):
if not os.path.exists(FLAGS.cl_binary_dir):
print("Input cl_binary_dir " + FLAGS.cl_binary_dir + " doesn't exist!")
cpp_cl_binary_source = generate_cpp_source()
if os.path.isfile(FLAGS.output_path):
os.remove(FLAGS.output_path)
w_file = open(FLAGS.output_path, "w")
w_file.write(cpp_cl_binary_source)
w_file.close()
def parse_args():
"""Parses command line arguments."""
parser = argparse.ArgumentParser()
parser.add_argument(
"--cl_binary_dir",
type=str,
default="./cl_bin/",
help="The cl binaries directory.")
parser.add_argument(
"--output_path",
type=str,
default="./mace/examples/codegen/opencl/opencl_compiled_program.cc",
help="The path of generated C++ header file which contains cl binaries.")
return parser.parse_known_args()
if __name__ == '__main__':
FLAGS, unparsed = parse_args()
main(unused_args=[sys.argv[0]] + unparsed)
import struct
import os
import uuid
import numpy as np
from tensorflow import gfile
from mace.proto import mace_pb2
from jinja2 import Environment, FileSystemLoader
GENERATED_NAME = set()
def generate_random_name():
name = '_' + uuid.uuid4().hex[:7].upper()
while name in GENERATED_NAME:
name = '_' + uuid.uuid4().hex[:7].upper()
GENERATED_NAME.add(name)
return name
def generate_tensor_map(tensors):
tensor_map = {}
for t in tensors:
if not tensor_map.has_key(t.name):
tensor_map[t.name] = generate_random_name()
return tensor_map
def generate_in_out_map(ops, tensor_map):
in_out_map = {}
for op in ops:
op.name = generate_random_name()
for input_name in op.input:
if not in_out_map.has_key(input_name):
if tensor_map.has_key(input_name):
in_out_map[input_name] = tensor_map[input_name]
else:
in_out_map[input_name] = generate_random_name()
for output_name in op.output:
if not in_out_map.has_key(output_name):
if tensor_map.has_key(output_name):
in_out_map[output_name] = tensor_map[output_name]
else:
in_out_map[output_name] = generate_random_name()
return in_out_map
def obfuscate_name(net_def):
input_node = "mace_input_node"
output_node = "mace_output_node"
tensor_map = generate_tensor_map(net_def.tensors)
in_out_map = generate_in_out_map(net_def.op, tensor_map)
for t in net_def.tensors:
if input_node not in t.name and output_node not in t.name:
t.name = tensor_map[t.name]
for op in net_def.op:
for i in range(len(op.input)):
if input_node not in op.input[i]:
op.input[i] = in_out_map[op.input[i]]
for i in range(len(op.output)):
if output_node not in op.output[i]:
op.output[i] = in_out_map[op.output[i]]
def rename_tensor(net_def):
tensor_map = {}
for t in net_def.tensors:
if not tensor_map.has_key(t.name):
tensor_map[t.name] = "_" + t.name[:-2].replace("/", "_")
t.name = tensor_map[t.name]
for op in net_def.op:
for i in range(len(op.input)):
if tensor_map.has_key(op.input[i]):
op.input[i] = tensor_map[op.input[i]]
for i in range(len(op.output)):
if tensor_map.has_key(op.output[i]):
op.output[i] = tensor_map[op.output[i]]
class TensorInfo:
def __init__(self, t):
self.name = t.name
self.data_type = mace_pb2.DataType.Name(t.data_type)
if t.data_type == mace_pb2.DT_FLOAT:
self.data = bytearray(struct.pack('%sf' % len(t.float_data), *t.float_data))
elif t.data_type == mace_pb2.DT_INT32:
self.data = bytearray(struct.pack('%si' % len(t.int32_data), *t.int32_data))
elif t.data_type == mace_pb2.DT_UINT8:
self.data = bytearray(np.array(t.int32_data).astype(np.uint8).tolist())
def stringfy(value):
return ', '.join('"{0}"'.format(w) for w in value)
def convert_to_source(net_def, template, obfuscate, model_tag, output, runtime):
if obfuscate:
obfuscate_name(net_def)
else:
rename_tensor(net_def)
# Capture our current directory
template_dir = os.path.dirname(template)
template_name = os.path.basename(template)
print template_dir
# Create the jinja2 environment.
j2_env = Environment(loader=FileSystemLoader(template_dir),
trim_blocks=True)
j2_env.filters['stringfy'] = stringfy
counter = 0
output_dir = os.path.dirname(output) + '/'
# generate tensor source files
for t in net_def.tensors:
source = j2_env.get_template(template_name).render(
tensor_info = TensorInfo(t),
tensor = t,
tag = model_tag,
mode = 0,
runtime = runtime,
)
with gfile.GFile(output_dir + 'tensor' + str(counter) + '.cc', "wb") as f:
f.write(source)
counter += 1
# generate op source files
counter = 0
op_size = len(net_def.op)
for start in range(0, op_size, 10):
source = j2_env.get_template(template_name).render(
start = start,
end = min(start+10, op_size),
net = net_def,
tag = model_tag,
mode = 1,
runtime = runtime,
)
with gfile.GFile(output_dir + 'op' + str(counter) + '.cc', "wb") as f:
f.write(source)
counter += 1
# generate model source files
tensors = [TensorInfo(t) for t in net_def.tensors]
source = j2_env.get_template(template_name).render(
tensors = tensors,
net = net_def,
tag = model_tag,
mode = 2,
runtime = runtime,
)
with gfile.GFile(output, "wb") as f:
f.write(source)
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
// This is a generated file, DO NOT EDIT
#include <map>
#include <string>
#include <vector>
namespace mace {
extern const std::map<std::string, std::vector<{{data_type}}>> {{variable_name}} =
{
{% for key, value in maps.iteritems() %}
{
"{{key}}",
{
{%- for ele in value -%}
{{ele}},
{%- endfor -%}
}
}, // {{key}}
{% endfor %}
};
} // namespace mace
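A minimal sketch of rendering this template the way binary_codegen.py above and opencl_codegen.py below do, assuming the template lives in mace/python/tools; the map contents are illustrative:

import jinja2

env = jinja2.Environment(loader=jinja2.FileSystemLoader('mace/python/tools'))
source = env.get_template('str2vec_maps.cc.tmpl').render(
    maps={'conv_2d': ['0x1F', '0x8B']},  # illustrative key and byte values
    data_type='unsigned char',
    variable_name='kCompiledProgramMap')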
import argparse
import sys
import tensorflow as tf
from tensorflow import gfile
from mace.proto import mace_pb2
from mace.python.tools import tf_converter_lib
from mace.python.tools import tf_dsp_converter_lib
from mace.python.tools import source_converter_lib
# ./bazel-bin/mace/python/tools/tf_converter --input quantized_test.pb --output quantized_test_dsp.pb --runtime dsp --input_dim input_node,1,28,28,3
FLAGS = None
def main(unused_args):
if not gfile.Exists(FLAGS.input):
print("Input graph file '" + FLAGS.input + "' does not exist!")
return -1
input_graph_def = tf.GraphDef()
with gfile.Open(FLAGS.input, "rb") as f:
data = f.read()
input_graph_def.ParseFromString(data)
if FLAGS.runtime == 'dsp':
output_graph_def = tf_dsp_converter_lib.convert_to_mace_pb(
input_graph_def, FLAGS.input_node, FLAGS.output_node, FLAGS.prequantize)
else:
output_graph_def = tf_converter_lib.convert_to_mace_pb(
input_graph_def, FLAGS.input_node, FLAGS.output_node, FLAGS.data_type, FLAGS.runtime)
if FLAGS.output_type == 'source':
source_converter_lib.convert_to_source(output_graph_def, FLAGS.template, FLAGS.obfuscate,
FLAGS.model_tag, FLAGS.output, FLAGS.runtime)
else:
with gfile.GFile(FLAGS.output, "wb") as f:
f.write(output_graph_def.SerializeToString())
with gfile.GFile(FLAGS.output + '_txt', "wb") as f:
# output_graph_def.ClearField('tensors')
f.write(str(output_graph_def))
print("Model conversion is completed.")
def str2bool(v):
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise argparse.ArgumentTypeError('Boolean value expected.')
def parse_args():
"""Parses command line arguments."""
parser = argparse.ArgumentParser()
parser.register("type", "bool", lambda v: v.lower() == "true")
parser.add_argument(
"--input",
type=str,
default="",
help="TensorFlow \'GraphDef\' file to load.")
parser.add_argument(
"--output",
type=str,
default="",
help="File to save the output graph to.")
parser.add_argument(
"--runtime",
type=str,
default="cpu",
help="Runtime: cpu/gpu/dsp")
parser.add_argument(
"--input_node",
type=str,
default="input_node",
help="e.g., input_node")
parser.add_argument(
"--output_node",
type=str,
default="softmax",
help="e.g., softmax")
parser.add_argument(
"--prequantize",
type=bool,
default=True,
help="e.g., True")
parser.add_argument(
"--data_type",
type=str,
default='DT_FLOAT',
help="e.g., DT_HALF/DT_FLOAT")
parser.add_argument(
"--output_type",
type=str,
default="pb",
help="output type: source/pb")
parser.add_argument(
"--template",
type=str,
default="",
help="template path")
parser.add_argument(
"--obfuscate",
type=str2bool,
nargs='?',
const=False,
default=False,
help="obfuscate model names")
parser.add_argument(
"--model_tag",
type=str,
default="",
help="model tag for generated function and namespace")
return parser.parse_known_args()
if __name__ == '__main__':
FLAGS, unparsed = parse_args()
main(unused_args=[sys.argv[0]] + unparsed)
This diff is collapsed.
from mace.proto import mace_pb2
import tensorflow as tf
from operator import mul
from dsp_ops import DspOps
from mace.python.tools import graph_util
from mace.python.tools.convert_util import tf_dtype_2_mace_dtype
# converter --input ../libcv/quantized_icnet.pb --output quantized_icnet_dsp.pb \
# --runtime dsp --input_node input_node --output_node output_node
padding_mode = {
'NA': 0,
'SAME': 1,
'VALID': 2,
'MIRROR_REFLECT': 3,
'MIRROR_SYMMETRIC': 4,
'SAME_CAFFE': 5
}
def get_tensor_name_from_op(op_name, port):
return op_name + ':' + str(port)
def get_node_from_map(op_map, op_or_tensor_name):
op_name = op_or_tensor_name.split(':')[0]
return op_map[op_name]
def get_op_and_port_from_tensor(tensor_name):
op, port = tensor_name.split(':')
port = int(port)
return op, port
def max_elem_size(tensor):
if len(tensor.shape.as_list()) == 0:
return tensor.dtype.size
else:
return reduce(mul, tensor.shape.as_list()) * tensor.dtype.size
def find_dtype(tensor_dtype):
if tensor_dtype == tf.float32:
return mace_pb2.DT_FLOAT
elif tensor_dtype == tf.uint8 or tensor_dtype == tf.quint8:
return mace_pb2.DT_UINT8
elif tensor_dtype == tf.int32 or tensor_dtype == tf.qint32:
return mace_pb2.DT_INT32
else:
raise Exception('Unsupported data type: ', tensor_dtype)
def has_padding_and_strides(op):
return 'padding' in op.node_def.attr and 'strides' in op.node_def.attr
def is_node_flatten_reshape(op):
return op.type == 'Reshape' and len(op.outputs[0].shape) == 1
def get_input_tensor(op, index):
input_tensor = op.inputs[index]
if input_tensor.op.type == 'Reshape':
input_tensor = get_input_tensor(input_tensor.op, 0)
return input_tensor
def add_shape_const_node(net_def, op, values, name):
print ('Add const node: ', op.name + '/' + name)
tensor = net_def.tensors.add()
node_name = op.name + '/' + name
tensor.name = node_name + ':0'
tensor.data_type = mace_pb2.DT_INT32
tensor.dims.extend(values)
return tensor.name
def convert_op_outputs(mace_op_def, tf_op):
mace_op_def.output_type.extend([tf_dtype_2_mace_dtype(output.dtype)
for output in tf_op.outputs])
output_shapes = []
for output in tf_op.outputs:
output_shape = mace_pb2.OutputShape()
output_shape.dims.extend(output.shape.as_list())
output_shapes.append(output_shape)
mace_op_def.output_shape.extend(output_shapes)
def convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops):
first_op = unresolved_ops[0]
print ('Op: ', first_op.name, first_op.type, first_op.outputs[0].shape)
if first_op.name in resolved_ops:
pass
elif first_op.type == 'Const':
print ('Add const node: ', first_op.name)
tf_tensor = first_op.outputs[0].eval()
tensor = net_def.tensors.add()
tensor.name = first_op.outputs[0].name
tensor.data_type = find_dtype(first_op.outputs[0].dtype)
shape = list(tf_tensor.shape)
if len(shape) > 0:
tensor.dims.extend(shape)
if first_op.outputs[0].dtype == tf.float32:
tensor.float_data.extend(tf_tensor.astype(float).flat)
elif first_op.outputs[0].dtype == tf.int32 or \
first_op.outputs[0].dtype == tf.int8 or \
first_op.outputs[0].dtype == tf.int16 or \
first_op.outputs[0].dtype == tf.quint8 or \
first_op.outputs[0].dtype == tf.quint16:
tensor.int32_data.extend(tf_tensor.astype(int).flat)
else:
op_def = net_def.op.add()
op_def.name = first_op.name
op_def.type = dsp_ops.map_nn_op(first_op.type)
op_def.padding = padding_mode['NA']
if len(first_op.outputs) > 0 and first_op.type == 'Dequantize' \
and len(first_op.outputs[0].consumers()) > 0 \
and (first_op.outputs[0].consumers()[0].type == 'SpaceToBatchND' \
or first_op.outputs[0].consumers()[0].type == 'BatchToSpaceND'):
input_tensor = first_op.inputs[0]
min_tensor = first_op.inputs[1]
max_tensor = first_op.inputs[2]
s2b_op = first_op.outputs[0].consumers()[0]
reshape_op = s2b_op.outputs[0].consumers()[0]
min_op = reshape_op.outputs[0].consumers()[0]
max_op = reshape_op.outputs[0].consumers()[1]
quantize_op = min_op.outputs[0].consumers()[0]
resolved_ops.add(s2b_op.name)
resolved_ops.add(reshape_op.name)
resolved_ops.add(min_op.name)
resolved_ops.add(max_op.name)
resolved_ops.add(quantize_op.name)
op_def.name = quantize_op.name
op_def.type = dsp_ops.map_nn_op('Quantized' + s2b_op.type)
op_def.input.append(input_tensor.name)
op_def.input.extend([t.name for t in s2b_op.inputs[1:]])
op_def.input.extend([min_tensor.name, max_tensor.name])
op_def.out_max_byte_size.extend([max_elem_size(out) for out in quantize_op.outputs])
convert_op_outputs(op_def, quantize_op)
elif has_padding_and_strides(first_op):
op_def.padding = padding_mode[first_op.get_attr('padding')]
op_def.input.extend([t.name for t in first_op.inputs])
if 'ksize' in first_op.node_def.attr:
ksize = first_op.get_attr('ksize')
ksize_tensor = add_shape_const_node(net_def, first_op, ksize, 'ksize')
op_def.input.extend([ksize_tensor])
strides = first_op.get_attr('strides')
strides_tensor = add_shape_const_node(net_def, first_op, strides, 'strides')
op_def.input.extend([strides_tensor])
op_def.out_max_byte_size.extend([max_elem_size(out) for out in first_op.outputs])
convert_op_outputs(op_def, first_op)
elif is_node_flatten_reshape(first_op):
op_def.type = 'Flatten'
op_def.input.extend([t.name for t in first_op.inputs])
op_def.out_max_byte_size.extend([max_elem_size(out) for out in first_op.outputs])
convert_op_outputs(op_def, first_op)
elif dsp_ops.has_op(first_op.type):
op_def.input.extend([t.name for t in first_op.inputs])
op_def.out_max_byte_size.extend([max_elem_size(out) for out in first_op.outputs])
convert_op_outputs(op_def, first_op)
else:
raise Exception('Unsupported op: ', first_op)
resolved_ops.add(first_op.name)
del unresolved_ops[0]
def add_output_node(net_def, output_node):
op_def = net_def.op.add()
op_def.name = '__output__'
op_def.type = 'OUTPUT'
op_def.input.extend([get_tensor_name_from_op(output_node, 0)])
def reverse_batch_to_space_and_biasadd(net_def):
tensor_map = {}
for tensor in net_def.tensors:
tensor_map[tensor.name] = tensor
op_map = {}
for op in net_def.op:
op_map[op.name] = op
consumers = {}
for op in net_def.op:
for ipt in op.input:
if ipt not in consumers:
consumers[ipt] = []
consumers[ipt].append(op)
new_ops = []
skip_ops = set()
visited_ops = set()
for op in net_def.op:
if op.name in visited_ops:
continue
# pattern: QConv -> RR -> R -> QB2S -> QBiasAdd -> RR -> R
success = False
if op.type == 'Requantize_32to8':
biasadd_requantize_op = op
biasadd_op = get_node_from_map(op_map, biasadd_requantize_op.input[0])
if biasadd_op.type == 'QuantizedBiasAdd_8p8to32':
b2s_op = get_node_from_map(op_map, biasadd_op.input[0])
if b2s_op.type == 'QuantizedBatchToSpaceND_8':
conv_requantize_op = get_node_from_map(op_map, b2s_op.input[0])
conv_op = get_node_from_map(op_map, conv_requantize_op.input[0])
if conv_op.type == 'QuantizedConv2d_8x8to32':
new_biasadd_op = mace_pb2.OperatorDef()
new_biasadd_op.CopyFrom(biasadd_op)
new_biasadd_op.input[0] = get_tensor_name_from_op(conv_requantize_op.name, 0)
new_biasadd_op.input[2] = get_tensor_name_from_op(conv_requantize_op.name, 1)
new_biasadd_op.input[3] = get_tensor_name_from_op(conv_requantize_op.name, 2)
new_biasadd_op.out_max_byte_size[0] = conv_requantize_op.out_max_byte_size[0] * 4
new_biasadd_requantize_op = mace_pb2.OperatorDef()
new_biasadd_requantize_op.CopyFrom(biasadd_requantize_op)
new_biasadd_requantize_op.out_max_byte_size[0] = new_biasadd_op.out_max_byte_size[0] / 4
new_b2s_op = mace_pb2.OperatorDef()
new_b2s_op.CopyFrom(b2s_op)
new_b2s_op.input[0] = get_tensor_name_from_op(biasadd_requantize_op.name, 0)
new_b2s_op.input[3] = get_tensor_name_from_op(biasadd_requantize_op.name, 1)
new_b2s_op.input[4] = get_tensor_name_from_op(biasadd_requantize_op.name, 2)
new_ops.extend([new_biasadd_op, new_biasadd_requantize_op, new_b2s_op])
skip_ops = skip_ops.union([biasadd_op.name, biasadd_requantize_op.name, b2s_op.name])
visited_ops.add(op.name)
follow_ops = consumers[get_tensor_name_from_op(biasadd_requantize_op.name, 0)]
for follow_op in follow_ops:
new_follow_op = mace_pb2.OperatorDef()
new_follow_op.CopyFrom(follow_op)
for i in xrange(len(follow_op.input)):
for k in xrange(3):
if new_follow_op.input[i] == get_tensor_name_from_op(biasadd_requantize_op.name, k):
new_follow_op.input[i] = get_tensor_name_from_op(b2s_op.name, k)
new_ops.append(new_follow_op)
skip_ops.add(follow_op.name)
visited_ops.add(follow_op.name)
visited_ops.add(op.name)
new_net_def = mace_pb2.NetDef()
new_net_def.tensors.extend(tensor_map.values())
new_net_def.op.extend([op for op in net_def.op if op.name not in skip_ops])
new_net_def.op.extend(new_ops)
return new_net_def
def add_node_id(net_def):
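# Assign consecutive node ids to tensors first and ops second, then record
# each op input as a (producer node_id, output_port) pair for hexagon nnlib.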
node_id_counter = 0
node_id_map = {}
for tensor in net_def.tensors:
tensor.node_id = node_id_counter
node_id_counter += 1
tensor_op, port = get_op_and_port_from_tensor(tensor.name)
node_id_map[tensor_op] = tensor.node_id
for op in net_def.op:
op.node_id = node_id_counter
node_id_counter += 1
node_id_map[op.name] = op.node_id
for ipt in op.input:
op_name, port = get_op_and_port_from_tensor(ipt)
node_id = node_id_map[op_name]
node_input = op.node_input.add()
node_input.node_id = node_id
node_input.output_port = int(port)
return net_def
def add_input_output_info(net_def, input_node, output_node, graph, dtype):
input_tensor = graph.get_tensor_by_name(get_tensor_name_from_op(input_node, 0))
output_tensor = graph.get_tensor_by_name(get_tensor_name_from_op(output_node, 0))
input_info = net_def.input_info.add()
input_info.dims.extend(input_tensor.shape.as_list())
input_info.data_type = dtype
if dtype == mace_pb2.DT_UINT8:
for i in xrange(2):
input_info = net_def.input_info.add()
input_info.dims.extend([1,1,1,1])
input_info.data_type = mace_pb2.DT_FLOAT
output_info = net_def.output_info.add()
output_info.dims.extend(output_tensor.shape.as_list())
output_info.data_type = dtype
if dtype == mace_pb2.DT_UINT8:
for i in xrange(2):
output_info = net_def.output_info.add()
output_info.dims.extend([1,1,1,1])
output_info.data_type = mace_pb2.DT_FLOAT
return net_def
def strip_input_quantize_and_output_dequantize(net_def, input_node, output_node):
tensor_map = {}
for tensor in net_def.tensors:
tensor_map[tensor.name] = tensor
op_map = {}
for op in net_def.op:
op_map[op.name] = op
consumers = {}
for op in net_def.op:
for ipt in op.input:
if ipt not in consumers:
consumers[ipt] = []
consumers[ipt].append(op)
skip_ops = set()
new_ops = []
skip_tensors = set()
# INPUT->Flatten->Minf, Maxf->Quantize
for op in net_def.op:
if op.type == 'INPUT':
input_op = op
flatten_op = None
quantize_op = None
for o in consumers[get_tensor_name_from_op(input_op.name, 0)]:
if o.type == 'Flatten':
flatten_op = o
elif o.type == 'Quantize':
quantize_op = o
if quantize_op is not None:
minf_op, maxf_op = consumers[get_tensor_name_from_op(flatten_op.name, 0)]
skip_ops = skip_ops.union([input_op.name, flatten_op.name, minf_op.name, maxf_op.name, quantize_op.name])
skip_tensors = skip_tensors.union([flatten_op.input[1], minf_op.input[1], maxf_op.input[1]])
new_input_op = mace_pb2.OperatorDef()
new_input_op.name = input_op.name
new_input_op.type = input_op.type
new_input_op.padding = input_op.padding
new_input_op.out_max_byte_size.extend([input_op.out_max_byte_size[0]/4, 4, 4])
new_ops.append(new_input_op)
new_input_op.output_shape.extend([input_op.output_shape[0],
minf_op.output_shape[0],
maxf_op.output_shape[0]])
new_input_op.output_type.extend([input_op.output_type[0], mace_pb2.DT_FLOAT, mace_pb2.DT_FLOAT])
for follow_op in consumers[get_tensor_name_from_op(quantize_op.name, 0)]:
new_follow_op = mace_pb2.OperatorDef()
new_follow_op.CopyFrom(follow_op)
for i in xrange(len(follow_op.input)):
for k in xrange(3):
if new_follow_op.input[i] == get_tensor_name_from_op(quantize_op.name, k):
new_follow_op.input[i] = get_tensor_name_from_op(input_op.name, k)
new_ops.append(new_follow_op)
skip_ops.add(follow_op.name)
elif op.type == 'OUTPUT':
output_op = op
dequantize_op = get_node_from_map(op_map, output_op.input[0])
if dequantize_op.type == 'Dequantize':
skip_ops = skip_ops.union([dequantize_op.name, output_op.name])
new_output_op = mace_pb2.OperatorDef()
new_output_op.name = output_op.name
new_output_op.type = output_op.type
new_output_op.input.extend(dequantize_op.input)
new_ops.append(new_output_op)
new_net_def = mace_pb2.NetDef()
new_net_def.tensors.extend([tensor for tensor in net_def.tensors if tensor.name not in skip_tensors])
new_net_def.op.extend([op for op in net_def.op if op.name not in skip_ops])
new_net_def.op.extend(new_ops)
return new_net_def
def convert_to_mace_pb(input_graph_def, input_node, output_node, prequantize=False):
"""
nnlib does not have batch norm, so we use the TensorFlow optimizer to fold
batch norm into convolution. The fold optimization reorders ops, so we
first sort the ops topologically.
"""
input_graph_def = graph_util.sort_tf_graph(input_graph_def)
net_def = mace_pb2.NetDef()
with tf.Session() as session:
with session.graph.as_default() as graph:
tf.import_graph_def(input_graph_def, name="")
ops = graph.get_operations()
dsp_ops = DspOps()
resolved_ops = set()
# convert const node
unresolved_ops = [op for op in ops if op.type == 'Const']
while len(unresolved_ops) > 0:
convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops)
# convert op node
unresolved_ops = [op for op in ops if op.type != 'Const']
while len(unresolved_ops) > 0:
convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops)
add_output_node(net_def, output_node)
# optimized_net_def = reverse_batch_to_space_and_biasadd(net_def)
if prequantize:
print('Prequantize ...')
net_def = strip_input_quantize_and_output_dequantize(net_def, input_node, output_node)
sorted_net_def = graph_util.sort_mace_graph(net_def, '__output__')
net_def_with_node_id = add_node_id(sorted_net_def)
if prequantize:
dtype = mace_pb2.DT_UINT8
else:
dtype = mace_pb2.DT_FLOAT
final_net_def = add_input_output_info(net_def_with_node_id, input_node, output_node, graph, dtype)
return final_net_def
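A hedged invocation sketch for convert_to_mace_pb above; the paths and node names are illustrative, following the comment near the top of this file:

import tensorflow as tf
from mace.python.tools import tf_dsp_converter_lib

graph_def = tf.GraphDef()
with open('quantized_icnet.pb', 'rb') as f:   # illustrative model path
    graph_def.ParseFromString(f.read())
net_def = tf_dsp_converter_lib.convert_to_mace_pb(
    graph_def, 'input_node', 'output_node', prequantize=True)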
#!/bin/bash
set +x
Usage() {
echo 'Usage: bash tools/create_mace_lib.sh tf_model_path image_size phone_version abi_version'
}
if [ $# -lt 4 ];then
Usage
exit -1
fi
IMAGE_SIZE=$2
PHONE_VERSION=$3
ABI_VERSION=$4
MACE_STATIC_LIB_DIR=libmace_${PHONE_VERSION}_gcn${IMAGE_SIZE}_${ABI_VERSION}
MACE_LIB_PATH=${MACE_STATIC_LIB_DIR}/lib/
MACE_INCLUDE_PATH=${MACE_STATIC_LIB_DIR}/include/mace/core/public/
rm -rf mace/codegen/models mace/codegen/opencl mace/codegen/opencl_bin mace/codegen/tuning mace/codegen/version
rm -rf ${MACE_STATIC_LIB_DIR}
mkdir -p ${MACE_LIB_PATH}
mkdir -p ${MACE_INCLUDE_PATH}
sh ./tools/validate_gcn.sh $1 $2
cp bazel-bin/mace/**/*.a ${MACE_LIB_PATH}
cp bazel-bin/mace/**/*.lo ${MACE_LIB_PATH}
cp mace/core/public/*.h ${MACE_INCLUDE_PATH}
#!/bin/bash
set -e
Usage() {
echo "Usage: ./tools/export_lib.sh android_abi[armeabi-v7a/arm64-v8a] runtime[gpu/dsp] export_include_dir export_lib_dir"
echo "eg: ./tools/export_lib.sh armeabi-v7a ../include ../lib/libmace_v7"
}
if [ $# -lt 4 ]; then
Usage
exit -1
fi
# ANDROID_ABI=arm64-v8a
# ANDROID_ABI=armeabi-v7a
ANDROID_ABI=$1
RUNTIME=$2
EXPORT_INCLUDE_DIR=$3
EXPORT_LIB_DIR=$4
if [ x"${RUNTIME}" = x"dsp" ]; then
DSP_MODE_BUILD_FLAGS="--define hexagon=true"
fi
MACE_SOURCE_DIR=`/bin/pwd`
CODEGEN_DIR=${MACE_SOURCE_DIR}/mace/codegen
CL_CODEGEN_DIR=${CODEGEN_DIR}/opencl
VERSION_CODEGEN_DIR=${CODEGEN_DIR}/version
STRIP="--strip always"
LIBMACE_NAME="libmace"
LIBMACE_DEV_NAME="libmace_dev"
LIBMACE_PROD_NAME="libmace_prod"
libmace_targets=(
"//mace/ops:ops"
"//mace/kernels:kernels"
"//mace/codegen:generated_version"
"//mace/core:core"
"//mace/utils:logging"
)
libmace_dev_targets=(
"//mace/codegen:generated_opencl_dev"
"//mace/core:opencl_dev"
"//mace/utils:tuner_dev"
)
libmace_prod_targets=(
"//mace/core:opencl_prod"
"//mace/utils:tuner_prod"
)
all_targets=(${libmace_targets[*]} ${libmace_dev_targets[*]} ${libmace_prod_targets[*]})
build_target()
{
BAZEL_TARGET=$1
bazel build --verbose_failures -c opt --strip always $BAZEL_TARGET \
--crosstool_top=//external:android/crosstool \
--host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
--cpu=$ANDROID_ABI \
--copt="-std=c++11" \
--copt="-D_GLIBCXX_USE_C99_MATH_TR1" \
--copt="-Werror=return-type" \
--copt="-DMACE_OBFUSCATE_LITERALS" \
$TUNING_MODE_BUILD_FLAGS \
$DSP_MODE_BUILD_FLAGS || exit -1
}
merge_libs()
{
CREATE_LIB_NAME=$1
LIBS_LIST=$2
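# Build an ar MRI script (create/addlib/save/end) and pipe it to the NDK
# ar so the listed static libs are merged into a single archive.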
echo "create /tmp/${CREATE_LIB_NAME}.a" > /tmp/${CREATE_LIB_NAME}.mri || exit -1
for lib_target in ${LIBS_LIST[*]}
do
lib_dir=`echo ${lib_target} | cut -d: -f1`
lib_dir=${lib_dir#//}
lib_name_prefix=lib`echo ${lib_target} | cut -d: -f2`
bin_path="${MACE_SOURCE_DIR}/bazel-bin/${lib_dir}/${lib_name_prefix}"
if [ -f "${bin_path}.a" ]; then
bin_path="${bin_path}.a"
else
bin_path="${bin_path}.lo"
fi
echo "addlib ${bin_path}" >> /tmp/${CREATE_LIB_NAME}.mri || exit -1
done
echo "save" >> /tmp/${CREATE_LIB_NAME}.mri || exit -1
echo "end" >> /tmp/${CREATE_LIB_NAME}.mri || exit -1
$ANDROID_NDK_HOME/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-ar \
-M < /tmp/${CREATE_LIB_NAME}.mri || exit -1
}
echo "Step 1: Generate encrypted opencl source"
python mace/python/tools/encrypt_opencl_codegen.py \
--cl_kernel_dir=./mace/kernels/opencl/cl/ \
--output_path=${CODEGEN_DIR}/opencl/opencl_encrypt_program.cc || exit -1
echo "Step 2: Generate version source"
rm -rf ${VERSION_CODEGEN_DIR}
mkdir ${VERSION_CODEGEN_DIR}
bash mace/tools/git/gen_version_source.sh ${CODEGEN_DIR}/version/version.cc || exit -1
echo "Step 3: Build libmace targets"
bazel clean
for target in ${all_targets[*]}
do
build_target ${target}
done
echo "Step 4: Create mri files and generate merged libs"
merge_libs "libmace" "${libmace_targets[*]}"
merge_libs "libmace_dev" "${libmace_dev_targets[*]}"
merge_libs "libmace_prod" "${libmace_prod_targets[*]}"
echo "Step 5: Export lib"
rm -rf ${EXPORT_INCLUDE_DIR}
mkdir -p ${EXPORT_INCLUDE_DIR}/mace/core/public
rm -rf ${EXPORT_LIB_DIR}
mkdir -p ${EXPORT_LIB_DIR}
cp ${MACE_SOURCE_DIR}/mace/core/public/* ${EXPORT_INCLUDE_DIR}/mace/core/public || exit -1
cp /tmp/libmace.a /tmp/libmace_dev.a /tmp/libmace_prod.a ${EXPORT_LIB_DIR}/ || exit -1
echo "Done!"
TF_INPUT_NODE=input
TF_OUTPUT_NODE=softmax/Reshape_1
TF_OUTPUT_BR_NODE=GCN/br_result_2/fcn_br
\ No newline at end of file
TF_INPUT_NODE=input_node
TF_OUTPUT_NODE=softmax/Reshape_1
\ No newline at end of file
#!/bin/bash
# Must run at root dir of mace project.
set +x
Usage() {
echo 'Usage: bash tools/validate_gcn.sh tools/gcn.config tf_model_path model_tag image_size runtime[gpu/dsp] [tuning]'
}
if [ $# -lt 5 ];then
Usage
exit -1
fi
source $1
TF_MODEL_FILE_PATH=$2
MODEL_TAG=$3
IMAGE_SIZE=$4
RUNTIME=$5
TUNING_OR_NOT=${6:-0}
if [ x"$RUNTIME" = x"dsp" ]; then
DATA_TYPE="DT_UINT8"
DEVICE_TYPE="HEXAGON"
TF_OUTPUT_NODE=${TF_OUTPUT_BR_NODE}
else
DATA_TYPE="DT_HALF"
DEVICE_TYPE="OPENCL"
fi
VLOG_LEVEL=0
MODEL_DIR=$(dirname ${TF_MODEL_FILE_PATH})
MACE_SOURCE_DIR=`/bin/pwd`
INPUT_FILE_NAME='model_input'
OUTPUT_FILE_NAME='gcn.out'
OUTPUT_LIST_FILE='gcn.list'
PHONE_DATA_DIR="/data/local/tmp/mace_gcn"
KERNEL_DIR="${PHONE_DATA_DIR}/cl/"
CODEGEN_DIR=${MACE_SOURCE_DIR}/mace/codegen
MODEL_CODEGEN_DIR=${CODEGEN_DIR}/models/${MODEL_TAG}
CL_CODEGEN_DIR=${CODEGEN_DIR}/opencl
CL_BIN_DIR=${CODEGEN_DIR}/opencl_bin
TUNING_CODEGEN_DIR=${CODEGEN_DIR}/tuning
VERSION_SOURCE_PATH=${CODEGEN_DIR}/version
build_and_run()
{
PRODUCTION_MODE=$1
if [ "$PRODUCTION_MODE" = true ]; then
PRODUCTION_MODE_BUILD_FLAGS="--define production=true"
fi
if [[ "${TUNING_OR_NOT}" != "0" && "$PRODUCTION_MODE" != true ]];then
tuning_flag=1
round=0 # only warm up
else
tuning_flag=0
round=2
fi
bazel build --verbose_failures -c opt --strip always mace/examples:mace_run \
--crosstool_top=//external:android/crosstool \
--host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
--cpu=armeabi-v7a \
--copt="-std=c++11" \
--copt="-D_GLIBCXX_USE_C99_MATH_TR1" \
--copt="-Werror=return-type" \
--copt="-DMACE_MODEL_TAG=${MODEL_TAG}" \
--copt="-DMACE_OBFUSCATE_LITERALS" \
$PRODUCTION_MODE_BUILD_FLAGS \
--define hexagon=true || exit -1
adb shell "mkdir -p ${PHONE_DATA_DIR}" || exit -1
if [ "$PRODUCTION_MODE" = false ]; then
adb shell "mkdir -p ${KERNEL_DIR}" || exit -1
fi
adb push ${MODEL_DIR}/${INPUT_FILE_NAME} ${PHONE_DATA_DIR} || exit -1
adb push bazel-bin/mace/examples/mace_run ${PHONE_DATA_DIR} || exit -1
adb push mace/core/runtime/hexagon/libhexagon_controller.so ${PHONE_DATA_DIR} || exit -1
adb </dev/null shell \
LD_LIBRARY_PATH=${PHONE_DATA_DIR} \
MACE_TUNING=${tuning_flag} \
MACE_CPP_MIN_VLOG_LEVEL=$VLOG_LEVEL \
MACE_RUN_PARAMETER_PATH=${PHONE_DATA_DIR}/mace_run.config \
MACE_KERNEL_PATH=$KERNEL_DIR \
${PHONE_DATA_DIR}/mace_run \
--input_shape="1,${IMAGE_SIZE},${IMAGE_SIZE},3"\
--output_shape="1,${IMAGE_SIZE},${IMAGE_SIZE},2"\
--input_file=${PHONE_DATA_DIR}/${INPUT_FILE_NAME} \
--output_file=${PHONE_DATA_DIR}/${OUTPUT_FILE_NAME} \
--device=${DEVICE_TYPE} \
--round=$round || exit -1
}
echo "Step 1: Generate input data"
rm -rf ${MODEL_DIR}/${INPUT_FILE_NAME}
python tools/validate.py --generate_data true \
--input_file=${MODEL_DIR}/${INPUT_FILE_NAME} \
--input_shape="${IMAGE_SIZE},${IMAGE_SIZE},3" || exit -1
echo "Step 2: Convert tf model to mace model and optimize memory"
bazel build //mace/python/tools:tf_converter || exit -1
rm -rf ${MODEL_CODEGEN_DIR}
mkdir -p ${MODEL_CODEGEN_DIR}
bazel-bin/mace/python/tools/tf_converter --input=${TF_MODEL_FILE_PATH} \
--output=${MODEL_CODEGEN_DIR}/model.cc \
--input_node=${TF_INPUT_NODE} \
--output_node=${TF_OUTPUT_NODE} \
--data_type=${DATA_TYPE} \
--runtime=${RUNTIME} \
--output_type=source \
--template=${MACE_SOURCE_DIR}/mace/python/tools/model.template \
--model_tag=${MODEL_TAG} \
--obfuscate=True || exit -1
echo "Step 3: Generate version source"
rm -rf ${VERSION_SOURCE_PATH}
mkdir -p ${VERSION_SOURCE_PATH}
bash mace/tools/git/gen_version_source.sh ${VERSION_SOURCE_PATH}/version.cc
echo "Step 4: Generate encrypted opencl source"
rm -rf ${CL_CODEGEN_DIR}
mkdir -p ${CL_CODEGEN_DIR}
python mace/python/tools/encrypt_opencl_codegen.py \
--cl_kernel_dir=./mace/kernels/opencl/cl/ --output_path=${CL_CODEGEN_DIR}/opencl_encrypt_program.cc
echo "Step 5: Run model on the phone with files"
build_and_run false
echo "Step 6: Generate OpenCL binary program and config code"
rm -rf ${CL_BIN_DIR}
mkdir -p ${CL_BIN_DIR}
adb pull ${KERNEL_DIR} ${CL_BIN_DIR}
python mace/python/tools/opencl_codegen.py \
--cl_binary_dir=${CL_BIN_DIR} --output_path=${CL_CODEGEN_DIR}/opencl_compiled_program.cc
echo "Step 7: Generate tuning source file"
adb pull ${PHONE_DATA_DIR}/mace_run.config ${CL_BIN_DIR}
rm -rf ${TUNING_CODEGEN_DIR}
mkdir -p ${TUNING_CODEGEN_DIR}
python mace/python/tools/binary_codegen.py \
--binary_file=${CL_BIN_DIR}/mace_run.config --output_path=${TUNING_CODEGEN_DIR}/tuning_params.cc
echo "Step 8: Run model on the phone using binary"
build_and_run true
echo "Step 9: Pull the mace run result."
rm -rf ${MODEL_DIR}/${OUTPUT_FILE_NAME}
adb </dev/null pull ${PHONE_DATA_DIR}/${OUTPUT_FILE_NAME} ${MODEL_DIR}
echo "Step 10: Validate the result"
python tools/validate.py --model_file ${TF_MODEL_FILE_PATH} \
--input_file ${MODEL_DIR}/${INPUT_FILE_NAME} \
--mace_out_file ${MODEL_DIR}/${OUTPUT_FILE_NAME} \
--input_node ${TF_INPUT_NODE} \
--output_node ${TF_OUTPUT_NODE} \
--input_shape "${IMAGE_SIZE},${IMAGE_SIZE},3" \
--output_shape "1,${IMAGE_SIZE},${IMAGE_SIZE},2"