Refactor mace_tools and yaml format for better usage.

ced4a49d · liuqi · fe0cdf27 · ced4a49d · ced4a49d · ced4a49d
30 changed file
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -68,4 +68,5 @@ platform_compitable_tests:
  stage: platform_compitable_tests
  script:
    - mkdir -p mace/codegen/version && bash mace/tools/git/gen_version_source.sh mace/codegen/version/version.cc
+    - mkdir -p mace/codegen/tuning && python mace/python/tools/binary_codegen.py --output_path=mace/codegen/tuning/tuning_params.cc
    - bazel build mace/core:core
--- a/docs/getting_started/create_a_model_deployment.rst
+++ b/docs/getting_started/create_a_model_deployment.rst
@@ -46,14 +46,14 @@ Configurations
      - The SHA256 checksum of the model file
    * - weight_sha256_checksum
      - The SHA256 checksum of the weight file, used by Caffe model
-    * - input_nodes
-      - The input node names, one or more strings
-    * - output_nodes
-      - The output node names, one or more strings
+    * - input_tensors
+      - The input tensor names (TensorFlow) or the top names of the input layers (Caffe); one or more strings
+    * - output_tensors
+      - The output tensor names (TensorFlow) or the top names of the output layers (Caffe); one or more strings
    * - input_shapes
-      - The shapes of the input nodes, in NHWC order
+      - The shapes of the input tensors, in NHWC order
    * - output_shapes
-      - The shapes of the output nodes, in NHWC order
+      - The shapes of the output tensors, in NHWC order
    * - runtime
      - The running device, one of CPU, GPU or DSP
    * - limit_opencl_kernel_time

--- a/docs/getting_started/how_to_build.rst
+++ b/docs/getting_started/how_to_build.rst
@@ -126,6 +126,7 @@ Tool <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/grap
            strip_unused_nodes(type=float, shape="1,64,64,3")
            remove_nodes(op=Identity, op=CheckNumerics)
            fold_constants(ignore_errors=true)
+            flatten_atrous_conv
            fold_batch_norms
            fold_old_batch_norms
            strip_unused_nodes
@@ -171,38 +172,110 @@ Caffe目前只支持最新版本，旧版本请使用Caffe的工具进行升级

 3.2 运行\ ``tools/mace_tools.py``\ 脚本

+**Commands**
+
+    **build**
+
+        .. note::
+
+            build模型静态库以及测试工具。
+
+        * *--config* (type=str,  default="",  required)：模型配置yaml文件路径.
+        * *--tuning* (optional)：是否为特定SOC调制GPU参数.
+        * *--enable_openmp* (optional)：是否启用openmp.
+
+    **run**
+
+        .. note::
+
+            命令行运行模型
+
+        * *--config* (type=str,  default="",  required)：模型配置yaml文件路径.
+        * *--round* (type=int, default=1,  optional)：模型运行次数。
+        * *--validate* (optional): 是否需要验证运行结果与框架运行结果是否一致。
+        * *--caffe_env* (type=local/docker, default=docker,  optional)：当validate时，可以选择指定caffe环境,local表示本地，docker表示使用docker容器.
+        * *--restart_round* (type=int, default=1,  optional)：模型重启次数。
+        * *--check_gpu_out_of_memory* (optional): 是否需要检查gpu内存越界。
+        * *--vlog_level* (type=int[0-5], default=0,  optional)：详细日志级别.
+
+        .. warning::
+
+            run依赖于build命令.build完成以后才可以执行run命令
+
+    **benchmark**
+        * *--config* (type=str,  default="",  required)：模型配置yaml文件路径.
+
+        .. warning::
+
+            benchmark依赖于build命令.
+
+    **通用参数**
+
+    .. list-table::
+        :widths: auto
+        :header-rows: 1
+        :align: left
+
+        * - argument(key)
+          - argument(value)
+          - default
+          - required
+          - commands
+          - explanation
+        * - --omp_num_threads
+          - int
+          - -1
+          - N
+          - run/benchmark
+          - number of threads
+        * - --cpu_affinity_policy
+          - int
+          - 1
+          - N
+          - run/benchmark
+          - 0:AFFINITY_NONE/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY
+        * - --gpu_perf_hint
+          - int
+          - 3
+          - N
+          - run/benchmark
+          - 0:DEFAULT/1:LOW/2:NORMAL/3:HIGH
+        * - --gpu_priority_hint
+          - int
+          - 3
+          - N
+          - run/benchmark
+          - 0:DEFAULT/1:LOW/2:NORMAL/3:HIGH
+
 .. code:: sh

    # print help message
-    # python tools/mace_tools.py --help
-    # --config 配置文件的路径
-    # --output_dir 编译结果的输出文件目录，默认为`./build`
-    # --round 调用`examples/mace_run`运行模型的次数，默认为`1`
-    # --tuning 对opencl的参数调参，该项通常只有开发人员用到，默认为`true`
-    # --mode 运行模式，包含build/run/validate/merge/all/benchmark，默认为`all`
+    python tools/mace_tools.py -h
+    python tools/mace_tools.py build -h
+    python tools/mace_tools.py run -h
+    python tools/mace_tools.py benchmark -h

    # 仅编译模型和生成静态库
-    python tools/mace_tools.py --config=models/config.yaml --mode=build
+    python tools/mace_tools.py build --config=models/config.yaml

    # 测试模型的运行时间
-    python tools/mace_tools.py --config=models/config.yaml --mode=run --round=1000
+    python tools/mace_tools.py run --config=models/config.yaml --round=100

    # 对比编译好的模型在mace上与直接使用tensorflow或者caffe运行的结果，相似度使用`余弦距离表示`
    # 其中使用OpenCL设备，默认相似度大于等于`0.995`为通过；DSP设备下，相似度需要达到`0.930`。
-    python tools/mace_tools.py --config=models/config.yaml --mode=run --round=1000
-
-    # 将已编译好的多个模型合并成静态库
-    # 比如编译了8个模型，决定使用其中2个模型，这时候可以不重新build，直接修改全局配置文件，合并生成静态库
-    python tools/mace_tools.py --config=models/config.yaml --mode=merge
-
-    # 运行以上所有项（可用于测试速度，建议 round=20）
-    python tools/mace_tools.py --config=models/config.yaml --mode=all --round=1000
+    python tools/mace_tools.py run --config=models/config.yaml --validate

    # 模型Benchmark：查看每个Op的运行时间
-    python tools/mace_tools.py --config=models/config.yaml --mode=benchmark
+    python tools/mace_tools.py benchmark --config=models/config.yaml

    # 查看模型运行时占用内存（如果有多个模型，可能需要注释掉一部分配置，只剩一个模型的配置）
-    python tools/mace_tools.py --config=models/config.yaml --mode=run --round=10000 &
+    python tools/mace_tools.py run --config=models/config.yaml --round=10000 &
    adb shell dumpsys meminfo | grep mace_run
    sleep 10
    kill %1
@@ -211,21 +284,34 @@ Caffe目前只支持最新版本，旧版本请使用Caffe的工具进行升级

 通过前面的步骤，我们得到了包含业务模型的库文件。在业务代码中，我们只需要引入下面3组文件（\ ``./build/``\ 是默认的编译结果输出目录）：

-头文件(包含mace.h和各个模型的头文件)： \*
-``./build/${project_name}/${target_abi}/include/mace/public/*.h``
+**头文件**
+    * ``./build/${library_name}/include/mace/public/*.h``
+
+**静态库**
+    * ``./build/${library_name}/library/${target_abi}/*.a``
+
+**动态库**
+    * ``./build/${library_name}/library/${target_abi}/libhexagon_controller.so``
+
+    .. note::
+
+        仅编译的模型中包含dsp模式时用到
+
+**模型文件**
+    * ``./build/${library_name}/model/${MODEL_TAG}.pb``
+    * ``./build/${library_name}/model/${MODEL_TAG}.data``
+
+    .. note::

-静态库（包含mace engine、opencl和模型相关库）： \*
-``./build/${project_name}/${target_abi}/*.a``
+        pb文件仅当模型build_type设置为proto时才会产生。

-动态库（仅编译的模型中包含dsp模式时用到）： \*
-``./build/${project_name}/${target_abi}/libhexagon_controller.so``

-模型数据文件（仅在EMBED\_MODEL\_DATA=0时产生）： \*
-``./build/${project_name}/data/${MODEL_TAG}.data``
+**库文件tar包**
+    * ``./build/${library_name}/libmace_${library_name}.tar.gz``

-编译过程中间文件： \* ``./build/${project_name}/build/``
+    .. note::

-库文件tar包： \* ``./build/${project_name}/${project_name}.tar.gz``
+        该文件包含了上述所有文件，可以发布使用。

 5. 使用


--- a/docs/getting_started/models/demo_app_models.yaml
+++ b/docs/getting_started/models/demo_app_models.yaml
-# 配置文件名会被用作生成库的名称：libmace-${filename}.a
+# 库的名字
+library_name: library_name
+# library_name会被用作生成库的名称：libmace-${library_name}.a
 target_abis: [armeabi-v7a, arm64-v8a]
 # 具体机型的soc编号，可以使用`adb shell getprop | grep ro.board.platform | cut -d [ -f3 | cut -d ] -f1`获取
 target_socs: [msm8998]
 embed_model_data: 1
+build_type: code # 模型build类型。code表示将模型转为代码，proto表示将模型转为protobuf文件
 models: # 一个配置文件可以包含多个模型的配置信息，最终生成的库中包含多个模型
-  first_net: # 模型的标签，在调度模型的时候，会用这个变量
+  first_net: # 模型的标签，在调度模型的时候，会用这个变量，必须唯一
    platform: tensorflow
    model_file_path: path/to/model64.pb # also support http:// and https://
    model_sha256_checksum: 7f7462333406e7dea87222737590ebb7d94490194d2f21a7d72bafa87e64e9f9
-    input_nodes: input_node
-    output_nodes: output_node
-    input_shapes: 1,64,64,3
-    output_shapes: 1,64,64,2
+    subgraphs:
+      - input_tensors: input_node
+        input_shapes: 1,64,64,3
+        output_tensors: output_node
+        output_shapes: 1,64,64,2
    runtime: gpu
+    data_type: fp16_fp32
    limit_opencl_kernel_time: 0
-    dsp_mode: 0
+    nnlib_graph_mode: 0
    obfuscate: 1
-    fast_conv: 0
+    winograd: 0
    input_files:
      - path/to/input_files # support http://
  second_net:
@@ -25,22 +30,23 @@ models: # 一个配置文件可以包含多个模型的配置信息，最终生
    weight_file_path: path/to/weight.caffemodel
    model_sha256_checksum: 05d92625809dc9edd6484882335c48c043397aed450a168d75eb8b538e86881a
    weight_sha256_checksum: 05d92625809dc9edd6484882335c48c043397aed450a168d75eb8b538e86881a
-    input_nodes:
-      - input_node0
-      - input_node1
-    output_nodes:
-      - output_node0
-      - output_node1
-    input_shapes:
-      - 1,256,256,3
-      - 1,128,128,3
-    output_shapes:
-      - 1,256,256,2
-      - 1,1,1,2
+    subgraphs:
+      - input_tensors:
+          - input_node0
+          - input_node1
+        input_shapes:
+          - 1,256,256,3
+          - 1,128,128,3
+        output_tensors:
+          - output_node0
+          - output_node1
+        output_shapes:
+          - 1,256,256,2
+          - 1,1,1,2
    runtime: cpu
    limit_opencl_kernel_time: 1
-    dsp_mode: 0
+    nnlib_graph_mode: 0
    obfuscate: 1
-    fast_conv: 0
+    winograd: 0
    input_files:
      - path/to/input_files # support http://
--- a/mace/benchmark/BUILD
+++ b/mace/benchmark/BUILD
@@ -2,8 +2,6 @@
 # Examples
 load(
    "//mace:mace.bzl",
-    "if_production_mode",
-    "if_not_production_mode",
    "if_hexagon_enabled",
    "if_openmp_enabled",
    "if_android",

--- a/mace/core/BUILD
+++ b/mace/core/BUILD
@@ -12,8 +12,6 @@ load(
    "if_android",
    "if_hexagon_enabled",
    "if_not_hexagon_enabled",
-    "if_production_mode",
-    "if_not_production_mode",
    "if_openmp_enabled",
    "if_neon_enabled",
 )
@@ -63,17 +61,13 @@ cc_library(
    ]),
    deps = [
        "//mace/codegen:generated_version",
+        "//mace/codegen:generated_tuning_params",
        "//mace/proto:mace_cc",
        "//mace/utils",
    ] + if_android([
        ":opencl_headers",
        "//mace/codegen:generated_opencl",
        "@half//:half",
-    ]) + if_production_mode([
-        "//mace/codegen:generated_tuning_params",
-        "//mace/utils:utils_prod",
-    ]) + if_not_production_mode([
-        "//mace/utils:utils_dev",
    ]) + if_hexagon_enabled([
        "//third_party/nnlib:libhexagon",
    ]),

--- a/mace/core/mace.cc
+++ b/mace/core/mace.cc
@@ -284,16 +284,16 @@ const unsigned char *LoadModelData(const std::string &model_data_file,
                                   const size_t &data_size) {
  int fd = open(model_data_file.c_str(), O_RDONLY);
  MACE_CHECK(fd >= 0, "Failed to open model data file ",
-             model_data_file, ", error code: ", errno);
+             model_data_file, ", error code: ", strerror(errno));

  const unsigned char *model_data = static_cast<const unsigned char *>(
      mmap(nullptr, data_size, PROT_READ, MAP_PRIVATE, fd, 0));
  MACE_CHECK(model_data != MAP_FAILED, "Failed to map model data file ",
-             model_data_file, ", error code: ", errno);
+             model_data_file, ", error code: ", strerror(errno));

  int ret = close(fd);
  MACE_CHECK(ret == 0, "Failed to close model data file ",
-             model_data_file, ", error code: ", errno);
+             model_data_file, ", error code: ", strerror(errno));

  return model_data;
 }
@@ -302,7 +302,8 @@ void UnloadModelData(const unsigned char *model_data,
                     const size_t &data_size) {
  int ret = munmap(const_cast<unsigned char *>(model_data),
                   data_size);
-  MACE_CHECK(ret == 0, "Failed to unmap model data file, error code: ", errno);
+  MACE_CHECK(ret == 0, "Failed to unmap model data file, error code: ",
+             strerror(errno));
 }

 MaceStatus CreateMaceEngineFromProto(

--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -215,6 +215,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
      (*kernel_error)->UnMap();
    }
    if (runtime->is_profiling_enabled()) {
+      event.wait();
      CallStats tmp_stats;
      runtime->GetCallStats(event, &tmp_stats);
      call_stats.start_micros =
@@ -223,8 +224,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
    }
  }
  if (future != nullptr) {
-    future->wait_fn = [runtime, event, call_stats](CallStats *stats) {
-      event.wait();
+    future->wait_fn = [runtime, call_stats](CallStats *stats) {
      if (stats != nullptr) {
        stats->start_micros = call_stats.start_micros;
        stats->end_micros = stats->start_micros + call_stats.end_micros;

--- a/mace/kernels/opencl/helper.cc
+++ b/mace/kernels/opencl/helper.cc
@@ -209,8 +209,9 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) {
 std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
                                       const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t base = cache_size / kBaseGPUMemCacheSize;
+  uint64_t cache_size =
+      OpenCLRuntime::Global()->device_global_mem_cache_size();
+  uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  lws[2] =
      std::min<uint32_t>(std::min<uint32_t>(gws[2], base), kwg_size / lws[1]);

--- a/mace/kernels/opencl/slice.cc
+++ b/mace/kernels/opencl/slice.cc
@@ -115,6 +115,7 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
      kernel_error_->UnMap();
    }
    if (runtime->is_profiling_enabled()) {
+      event.wait();
      CallStats tmp_stats;
      runtime->GetCallStats(event, &tmp_stats);
      call_stats.start_micros =
@@ -123,8 +124,7 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
    }
  }
  if (future != nullptr) {
-    future->wait_fn = [runtime, event, call_stats](CallStats *stats) {
-      event.wait();
+    future->wait_fn = [runtime, call_stats](CallStats *stats) {
      if (stats != nullptr) {
        stats->start_micros = call_stats.start_micros;
        stats->end_micros = stats->start_micros + call_stats.end_micros;

--- a/mace/mace.bzl
+++ b/mace/mace.bzl
@@ -24,18 +24,6 @@ def if_android_arm64(a):
      "//conditions:default": [],
  })

-def if_production_mode(a):
-  return select({
-      "//mace:production_mode": a,
-      "//conditions:default": [],
-  })
-
-def if_not_production_mode(a):
-  return select({
-      "//mace:production_mode": [],
-      "//conditions:default": a,
-  })
-
 def if_neon_enabled(a):
  return select({
      "//mace:neon_enabled": a,

--- a/mace/python/tools/BUILD
+++ b/mace/python/tools/BUILD
@@ -3,7 +3,6 @@ py_library(
    srcs = [
        "convert_util.py",
        "graph_util.py",
-        "tensor_util.py",
        "tf_dsp_converter_lib.py",
        "converter_tool/base_converter.py",
        "converter_tool/shape_inference.py",
@@ -20,9 +19,9 @@ py_library(
 )

 py_library(
-    name = "source_converter_lib",
+    name = "model_saver_lib",
    srcs = [
-        "source_converter_lib.py",
+        "model_saver.py",
    ],
    srcs_version = "PY2AND3",
    deps = [
@@ -45,7 +44,7 @@ py_binary(
    srcs_version = "PY2AND3",
    deps = [
        ":converter_lib",
-        ":source_converter_lib",
+        ":model_saver_lib",
        "@six_archive//:six",
    ],
 )
--- a/mace/python/tools/converter.py
+++ b/mace/python/tools/converter.py
@@ -21,8 +21,7 @@ import copy
 from mace.proto import mace_pb2
 from mace.python.tools import tf_dsp_converter_lib
 from mace.python.tools import memory_optimizer
-from mace.python.tools import source_converter_lib
-from mace.python.tools import tensor_util
+from mace.python.tools import model_saver
 from mace.python.tools.converter_tool import base_converter as cvt
 from mace.python.tools.converter_tool import tensorflow_converter
 from mace.python.tools.converter_tool import caffe_converter
@@ -42,6 +41,20 @@ device_type_map = {'cpu': cvt.DeviceType.CPU.value,
                   'dsp': cvt.DeviceType.HEXAGON.value}


+def parse_data_type(data_type, device_type):
+    if device_type == cvt.DeviceType.GPU.value:
+        if data_type == 'fp32_fp32':
+            return mace_pb2.DT_FLOAT
+        else:
+            return mace_pb2.DT_HALF
+    elif device_type == cvt.DeviceType.CPU.value:
+        return mace_pb2.DT_FLOAT
+    elif device_type == cvt.DeviceType.HEXAGON.value:
+        return mace_pb2.DT_UINT8
+    else:
+        print("Invalid device type: " + device_type)
+
+
 def file_checksum(fname):
    hash_func = hashlib.sha256()
    with open(fname, "rb") as f:
@@ -82,7 +95,7 @@ def main(unused_args):
    if FLAGS.platform not in ['tensorflow', 'caffe']:
        print ("platform %s is not supported." % FLAGS.platform)
        sys.exit(-1)
-    if FLAGS.runtime not in ['cpu', 'gpu', 'dsp', '']:
+    if FLAGS.runtime not in ['cpu', 'gpu', 'dsp', 'cpu+gpu']:
        print ("runtime %s is not supported." % FLAGS.runtime)
        sys.exit(-1)

@@ -114,7 +127,6 @@ def main(unused_args):
            output_node.name = output_node_names[i]
            option.add_output_node(output_node)

-        print("Convert model to mace model.")
        if FLAGS.platform == 'tensorflow':
            converter = tensorflow_converter.TensorflowConverter(
                option, FLAGS.model_file)
@@ -122,24 +134,18 @@ def main(unused_args):
            converter = caffe_converter.CaffeConverter(option,
                                                       FLAGS.model_file,
                                                       FLAGS.weight_file)
+        else:
+            print("Mace do not support platorm %s yet." & FLAGS.platform)
+            exit(1)

        output_graph_def = converter.run()

-        if FLAGS.gpu_data_type == 'half':
-            gpu_data_type = mace_pb2.DT_HALF
-        else:
-            gpu_data_type = mace_pb2.DT_FLOAT
-        device_data_type_map = {
-            cvt.DeviceType.CPU.value: mace_pb2.DT_FLOAT,
-            cvt.DeviceType.GPU.value: gpu_data_type,
-            cvt.DeviceType.HEXAGON.value: mace_pb2.DT_UINT8
-        }
-
        print("Transform model to one that can better run on device")
-        if not FLAGS.runtime:
+        if FLAGS.runtime == 'cpu+gpu':
            cpu_graph_def = copy.deepcopy(output_graph_def)
            option.device = cvt.DeviceType.CPU.value
-            option.data_type = device_data_type_map[cvt.DeviceType.CPU.value]
+            option.data_type = parse_data_type(
+                FLAGS.data_type, cvt.DeviceType.CPU.value)
            option.disable_transpose_filters()
            mace_cpu_transformer = transformer.Transformer(
                option, cpu_graph_def)
@@ -149,7 +155,8 @@ def main(unused_args):
            print "CPU memory optimization done."

            option.device = cvt.DeviceType.GPU.value
-            option.data_type = device_data_type_map[cvt.DeviceType.GPU.value]
+            option.data_type = parse_data_type(
+                FLAGS.data_type, cvt.DeviceType.GPU.value)
            option.enable_transpose_filters()
            mace_gpu_transformer = transformer.Transformer(
                option, output_graph_def)
@@ -165,7 +172,8 @@ def main(unused_args):
            print "Merge done"
        else:
            option.device = device_type_map[FLAGS.runtime]
-            option.data_type = device_data_type_map[option.device]
+            option.data_type = parse_data_type(
+                FLAGS.data_type, option.device)
            mace_transformer = transformer.Transformer(
                option, output_graph_def)
            output_graph_def = mace_transformer.run()
@@ -180,36 +188,13 @@ def main(unused_args):

            print "Memory optimization done."

-    if FLAGS.obfuscate:
-        tensor_util.obfuscate_name(output_graph_def)
-    else:
-        tensor_util.rename_tensor(output_graph_def)
-
-    tensor_infos, model_data = tensor_util.get_tensor_info_and_model_data(
-            output_graph_def, FLAGS.runtime, FLAGS.gpu_data_type)
-
-    source_converter_lib.convert_to_source(
-            output_graph_def, model_checksum, weight_checksum, FLAGS.template,
-            FLAGS.obfuscate, FLAGS.model_tag, FLAGS.codegen_output,
-            FLAGS.runtime, FLAGS.embed_model_data, FLAGS.winograd,
-            FLAGS.model_load_type, tensor_infos, model_data)
-
-    if not FLAGS.embed_model_data:
-        output_dir = os.path.dirname(FLAGS.codegen_output) + '/'
-        with open(output_dir + FLAGS.model_tag + '.data', "wb") as f:
-            f.write(bytearray(model_data))
-
-    if FLAGS.model_load_type == 'pb':
-        tensor_util.del_tensor_data(
-                output_graph_def, FLAGS.runtime, FLAGS.gpu_data_type)
-        tensor_util.update_tensor_data_type(
-                output_graph_def, FLAGS.runtime, FLAGS.gpu_data_type)
-        with open(FLAGS.pb_output, "wb") as f:
-            f.write(output_graph_def.SerializeToString())
-        # with open(FLAGS.pb_output + '_txt', "wb") as f:
-        #     # output_graph_def.ClearField('tensors')
-        #     f.write(str(output_graph_def))
-    print("Model conversion is completed.")
+    model_saver.save_model(
+        output_graph_def, model_checksum, weight_checksum,
+        FLAGS.template_dir, FLAGS.obfuscate, FLAGS.model_tag,
+        FLAGS.output_dir, FLAGS.runtime,
+        FLAGS.embed_model_data,
+        FLAGS.winograd, FLAGS.data_type,
+        FLAGS.model_build_type)


 def str2bool(v):
@@ -244,15 +229,10 @@ def parse_args():
        default="",
        help="Weight file sha256 checksum")
    parser.add_argument(
-        "--codegen_output",
+        "--output_dir",
        type=str,
        default="",
        help="File to save the output graph to.")
-    parser.add_argument(
-        "--pb_output",
-        type=str,
-        default="",
-        help="File to save the mace model to.")
    parser.add_argument(
        "--runtime", type=str, default="", help="Runtime: cpu/gpu/dsp")
    parser.add_argument(
@@ -263,7 +243,7 @@ def parse_args():
    parser.add_argument(
        "--output_node", type=str, default="softmax", help="e.g., softmax")
    parser.add_argument(
-        "--template", type=str, default="", help="template path")
+        "--template_dir", type=str, default="", help="template path")
    parser.add_argument(
        "--obfuscate",
        type=str2bool,
@@ -295,13 +275,16 @@ def parse_args():
        default=True,
        help="embed model data.")
    parser.add_argument(
-        "--model_load_type",
+        "--model_build_type",
        type=str,
-        default="source",
-        help="[source|pb] Load models in generated `source` code" +
-                "or `pb` file.")
+        default="code",
+        help="[proto|code] build models to code" +
+                "or `Protobuf` file.")
    parser.add_argument(
-        "--gpu_data_type", type=str, default="half", help="half/float")
+        "--data_type",
+        type=str,
+        default="fp16_fp32",
+        help="fp16_fp32/fp32_fp32")
    return parser.parse_known_args()



--- a/mace/python/tools/converter_tool/tensorflow_converter.py
+++ b/mace/python/tools/converter_tool/tensorflow_converter.py
@@ -395,12 +395,6 @@ class TensorflowConverter(base_converter.ConverterInterface):
        align_corners_arg.i = tf_op.get_attr(tf_align_corners)

    def convert_space_batch(self, tf_op):
-        print """You might want to try 'flatten_atrous_conv' in
-         transform graph to turn atrous conv2d into regular conv2d.
-         This may give you performance benefit on GPU.
-         (see https://github.com/tensorflow/tensorflow/blob/master/
-         tensorflow/tools/graph_transforms/README.md#flatten_atrous_conv)
-         """

        op = self.convert_general_op(tf_op)
        del op.input[1:]

--- a/mace/python/tools/converter_tool/transformer.py
+++ b/mace/python/tools/converter_tool/transformer.py
@@ -52,7 +52,7 @@ class Transformer(base_converter.ConverterInterface):
    """

    def __init__(self, option, model):
-        # DO NOT reorder the following transformers
+        # DO NOT change the order of the following transformers
        self._registered_transformers_order = [
            TransformerRule.REMOVE_USELESS_RESHAPE_OP,
            TransformerRule.REMOVE_IDENTITY_OP,
@@ -940,8 +940,9 @@ class Transformer(base_converter.ConverterInterface):
            op_def.type = MaceKeyword.mace_image_to_buffer
            op_def.input.extend([output_node.name])
            op_def.output.extend([output_name])
-            output_shape = op_def.output_shape.add()
-            output_shape.dims.extend(output_node.shape)
+            if output_node.shape:
+                output_shape = op_def.output_shape.add()
+                output_shape.dims.extend(output_node.shape)

            arg = op_def.arg.add()
            arg.name = MaceKeyword.mace_buffer_type

--- a/mace/python/tools/encrypt_opencl_codegen.py
+++ b/mace/python/tools/encrypt_opencl_codegen.py
@@ -73,8 +73,6 @@ def encrypt_opencl_codegen(cl_kernel_dir, output_path):
    with open(output_path, "w") as w_file:
        w_file.write(cpp_cl_encrypted_kernel)

-    print("Generate encrypted opencl source done!")
-

 def parse_args():
    """Parses command line arguments."""

--- a/mace/python/tools/mace_engine_factory.h.jinja2
+++ b/mace/python/tools/mace_engine_factory.h.jinja2
@@ -25,7 +25,7 @@

 namespace mace {

-{% if model_type == 'source' %}
+{% if model_type == 'code' %}
 {% for tag in model_tags %}
 namespace {{tag}} {


--- a/mace/python/tools/mace_engine_factory_codegen.py
+++ b/mace/python/tools/mace_engine_factory_codegen.py
@@ -25,7 +25,6 @@ def gen_mace_engine_factory(model_tags, template_dir, model_type, output_dir):
    j2_env = Environment(
        loader=FileSystemLoader(template_dir), trim_blocks=True)
    # generate mace_run BUILD file
-    print model_tags
    template_name = 'mace_engine_factory.h.jinja2'
    source = j2_env.get_template(template_name).render(
        model_tags=model_tags,

--- a/mace/python/tools/tensor_util.py
+++ b/mace/python/tools/tensor_util.py
@@ -12,13 +12,26 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import hashlib
+import datetime
+import os
+import uuid
 import numpy as np
+import hashlib
+from enum import Enum

 from mace.proto import mace_pb2
+from jinja2 import Environment, FileSystemLoader

 GENERATED_NAME = set()

+GPUDataTypeStrs = [
+    "fp16_fp32",
+    "fp32_fp32",
+]
+
+GPUDataType = \
+    Enum('GPUDataType', [(ele, ele) for ele in GPUDataTypeStrs], type=str)
+

 def generate_obfuscated_name(namespace, name):
    md5 = hashlib.md5()
@@ -104,70 +117,199 @@ def rename_tensor(net_def):
                op.output[i] = tensor_map[op.output[i]]


+def stringfy(value):  # Double-quote each item of value, join with ", "
+    return ', '.join('"{0}"'.format(w) for w in value)
+
+
 class TensorInfo:
-    def __init__(self, id, t, runtime, gpu_data_type):
+    def __init__(self, id, tensor):
        self.id = id
-        self.data_type = mace_pb2.DataType.Name(t.data_type)
-        if t.data_type == mace_pb2.DT_FLOAT:
-            if runtime == 'gpu' and gpu_data_type == 'half':
-                self.data_type = mace_pb2.DT_HALF
-                self.data = bytearray(
-                    np.array(t.float_data).astype(np.float16).tobytes())
-            else:
-                self.data_type = mace_pb2.DT_FLOAT
-                self.data = bytearray(
-                    np.array(t.float_data).astype(np.float32).tobytes())
-        elif t.data_type == mace_pb2.DT_INT32:
+        self.data_type = tensor.data_type
+        if tensor.data_type == mace_pb2.DT_HALF:
+            self.data_type = mace_pb2.DT_HALF
            self.data = bytearray(
-                np.array(t.int32_data).astype(np.int32).tobytes())
-        elif t.data_type == mace_pb2.DT_UINT8:
+                np.array(tensor.float_data).astype(np.float16).tobytes())
+        elif tensor.data_type == mace_pb2.DT_FLOAT:
+            self.data_type = mace_pb2.DT_FLOAT
            self.data = bytearray(
-                np.array(t.int32_data).astype(np.uint8).tolist())
+                np.array(tensor.float_data).astype(np.float32).tobytes())
+        elif tensor.data_type == mace_pb2.DT_INT32:
+            self.data = bytearray(
+                np.array(tensor.int32_data).astype(np.int32).tobytes())
+        elif tensor.data_type == mace_pb2.DT_UINT8:
+            self.data = bytearray(
+                np.array(tensor.int32_data).astype(np.uint8).tolist())
        else:
-            raise Exception('Tensor data type %s not supported' % t.data_type)
+            raise Exception('Tensor data type %s not supported' %
+                            tensor.data_type)


-def get_tensor_info_and_model_data(net_def, runtime, gpu_data_type):
-    model_data = []
+def update_tensor_infos(net_def, runtime, data_type):
    offset = 0
    counter = 0
    tensor_infos = []
-    for t in net_def.tensors:
-        tensor_info = TensorInfo(counter, t, runtime, gpu_data_type)
+    for tensor in net_def.tensors:
+        # update data_type
+        if tensor.data_type == mace_pb2.DT_FLOAT and runtime == 'gpu' \
+                and data_type == GPUDataType.fp16_fp32:
+            tensor.data_type = mace_pb2.DT_HALF
+
+        # Add offset and data_size
+        tensor_info = TensorInfo(counter, tensor)
        tensor_infos.append(tensor_info)
        # align
        if tensor_info.data_type != 'DT_UINT8' and offset % 4 != 0:
            padding = 4 - offset % 4
-            model_data.extend(bytearray([0] * padding))
            offset += padding

-        if t.data_type == mace_pb2.DT_FLOAT:
-            t.data_size = len(t.float_data)
-        elif t.data_type == mace_pb2.DT_INT32:
-            t.data_size = len(t.int32_data)
-        elif t.data_type == mace_pb2.DT_UINT8:
-            t.data_size = len(t.int32_data)
-        t.offset = offset
-
+        if tensor.data_type == mace_pb2.DT_FLOAT \
+                or tensor.data_type == mace_pb2.DT_HALF:
+            tensor.data_size = len(tensor.float_data)
+        elif tensor.data_type == mace_pb2.DT_INT32:
+            tensor.data_size = len(tensor.int32_data)
+        elif tensor.data_type == mace_pb2.DT_UINT8:
+            tensor.data_size = len(tensor.int32_data)
+        tensor.offset = offset
+        offset += len(tensor_info.data)
        counter += 1
+
+
+def extract_model_data(net_def):  # Concatenate every tensor's raw bytes
+    model_data = []  # byte values of all tensors, in net_def.tensors order
+    offset = 0  # current length of model_data, padding included
+    counter = 0  # running index, used as the TensorInfo id
+    for tensor in net_def.tensors:
+        tensor_info = TensorInfo(counter, tensor)
+        # Zero-pad so non-UINT8 tensor data starts on a 4-byte boundary
+        if tensor_info.data_type != mace_pb2.DT_UINT8 and offset % 4 != 0:
+            padding = 4 - offset % 4
+            model_data.extend(bytearray([0] * padding))
+            offset += padding
+        model_data.extend(tensor_info.data)
+        offset += len(tensor_info.data)
+        counter += 1
+    return model_data  # flat list, alignment padding included

-    return tensor_infos, model_data

+def save_model_data(net_def, model_tag, output_dir):  # Write <tag>.data
+    model_data = extract_model_data(net_def)
+    # output_dir is prepended as-is, with no path separator inserted
+    with open(output_dir + model_tag + '.data', "wb") as f:
+        f.write(bytearray(model_data))

-def del_tensor_data(net_def, runtime, gpu_data_type):
-    for t in net_def.tensors:
-        if t.data_type == mace_pb2.DT_FLOAT:
-            del t.float_data[:]
-        elif t.data_type == mace_pb2.DT_INT32:
-            del t.int32_data[:]
-        elif t.data_type == mace_pb2.DT_UINT8:
-            del t.int32_data[:]

+def save_model_to_proto(net_def, model_tag, output_dir):  # Write <tag>.pb
+    for tensor in net_def.tensors:  # clear tensor payloads in place
+        if tensor.data_type == mace_pb2.DT_FLOAT \
+                or tensor.data_type == mace_pb2.DT_HALF:
+            del tensor.float_data[:]
+        elif tensor.data_type == mace_pb2.DT_INT32:
+            del tensor.int32_data[:]
+        elif tensor.data_type == mace_pb2.DT_UINT8:
+            del tensor.int32_data[:]
+    proto_file_path = output_dir + model_tag + '.pb'  # no separator added
+    with open(proto_file_path, "wb") as f:  # serialized net_def
+        f.write(net_def.SerializeToString())
+    with open(proto_file_path + '_txt', "wb") as f:  # str() dump of net_def
+        f.write(str(net_def))

-def update_tensor_data_type(net_def, runtime, gpu_data_type):
-    for t in net_def.tensors:
-        if t.data_type == mace_pb2.DT_FLOAT and runtime == 'gpu' \
-                and gpu_data_type == 'half':
-            t.data_type = mace_pb2.DT_HALF
+
+def save_model_to_code(net_def, model_tag, runtime,
+                       template_dir, output_dir, embed_model_data,
+                       model_checksum, weight_checksum,
+                       obfuscate, winograd_conv):
+    """Render the jinja2 templates in template_dir into source files.
+
+    Files written into output_dir (concatenated as-is, so it must end
+    with a path separator):
+      * tensor<N>.cc   - one per tensor, from tensor_source.jinja2
+      * tensor_data.cc - from tensor_data.jinja2
+      * op<N>.cc       - one per group of 10 operators, operator.jinja2
+      * model.cc       - from model.jinja2
+      * <model_tag>.h  - from model_header.jinja2
+
+    The checksum passed to model.jinja2 is model_checksum, or
+    "model_checksum,weight_checksum" when weight_checksum is not None.
+    """
+    env = Environment(
+        loader=FileSystemLoader(template_dir), trim_blocks=True)
+    env.filters['stringfy'] = stringfy
+
+    def write_source(template_name, file_name, **context):
+        # Render one template and store the result under output_dir.
+        rendered = env.get_template(template_name).render(**context)
+        with open(output_dir + file_name, "wb") as f:
+            f.write(rendered)
+
+    # generate tensor source files
+    for index, tensor in enumerate(net_def.tensors):
+        write_source('tensor_source.jinja2',
+                     'tensor' + str(index) + '.cc',
+                     tensor_info=TensorInfo(index, tensor),
+                     tensor=tensor,
+                     tag=model_tag)
+
+    # generate tensor data
+    model_data = extract_model_data(net_def)
+    write_source('tensor_data.jinja2',
+                 'tensor_data' + '.cc',
+                 tag=model_tag,
+                 embed_model_data=embed_model_data,
+                 model_data_size=len(model_data),
+                 model_data=model_data)
+
+    # generate op source files, 10 operators per file
+    op_size = len(net_def.op)
+    for file_index, start in enumerate(range(0, op_size, 10)):
+        write_source('operator.jinja2',
+                     'op' + str(file_index) + '.cc',
+                     start=start,
+                     end=min(start + 10, op_size),
+                     net=net_def,
+                     tag=model_tag,
+                     runtime=runtime)
+
+    # generate model source files
+    build_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+    if weight_checksum is None:
+        checksum = model_checksum
+    else:
+        checksum = "{},{}".format(model_checksum, weight_checksum)
+    write_source('model.jinja2',
+                 'model.cc',
+                 net=net_def,
+                 tag=model_tag,
+                 runtime=runtime,
+                 obfuscate=obfuscate,
+                 embed_model_data=embed_model_data,
+                 winograd_conv=winograd_conv,
+                 checksum=checksum,
+                 build_time=build_time)
+
+    # generate model header file
+    write_source('model_header.jinja2', model_tag + '.h', tag=model_tag)
+
+
+def save_model(net_def, model_checksum, weight_checksum, template_dir,
+               obfuscate, model_tag, output_dir, runtime, embed_model_data,
+               winograd_conv, data_type, model_build_type):
+    """Rename the tensors of net_def and write the model to output_dir.
+
+    Tensor names are obfuscated when `obfuscate` is truthy and renamed
+    otherwise. The raw tensor data file is written when the build type is
+    'proto' or when the data is not embedded. The model itself is saved
+    as protobuf for build type 'proto' and as generated code for any
+    other build type.
+    """
+    rename = obfuscate_name if obfuscate else rename_tensor
+    rename(net_def)
+
+    output_dir = output_dir + '/'
+    # update tensor type
+    update_tensor_infos(net_def, runtime, data_type)
+
+    build_as_proto = model_build_type == 'proto'
+    if build_as_proto or not embed_model_data:
+        save_model_data(net_def, model_tag, output_dir)
+
+    if not build_as_proto:
+        save_model_to_code(net_def, model_tag, runtime,
+                           template_dir, output_dir, embed_model_data,
+                           model_checksum, weight_checksum,
+                           obfuscate, winograd_conv)
+        return
+    save_model_to_proto(net_def, model_tag, output_dir)
--- a/mace/python/tools/operator.jinja2
+++ b/mace/python/tools/operator.jinja2
@@ -94,10 +94,11 @@ void CreateOperator{{i}}(mace::OperatorDef *op) {

  {% endfor %}

+  {% if net.op[i].output_shape|length > 0 %}
  op->mutable_output_shape()->Reserve({{ net.op[i].output_shape|length }});
  mace::OutputShape * output_shape = nullptr;
  {% for shape in net.op[i].output_shape %}
-	{% if shape.dims|length > 0 %}
+  {% if shape.dims|length > 0 %}
  output_shape = op->add_output_shape();

  output_shape->mutable_dims()->Reserve({{ shape.dims|length }});
@@ -105,8 +106,9 @@ void CreateOperator{{i}}(mace::OperatorDef *op) {
  output_shape->add_dims({{ dim }});
  {% endfor %}

-	{% endif %}
+  {% endif %}
  {% endfor %}
+  {% endif %}

  std::vector<int> output_types_int({ {{ net.op[i].output_type | join(', ') }} });
  std::vector<mace::DataType> output_types({{ net.op[i].output_type | length }});

--- a/mace/python/tools/source_converter_lib.py
+++ b/mace/python/tools/source_converter_lib.py
-# Copyright 2018 Xiaomi, Inc.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import datetime
-import os
-
-from mace.proto import mace_pb2
-from jinja2 import Environment, FileSystemLoader
-
-
-def stringfy(value):
-    return ', '.join('"{0}"'.format(w) for w in value)
-
-
-def convert_to_source(net_def, model_checksum, weight_checksum, template_dir,
-                      obfuscate, model_tag, output, runtime, embed_model_data,
-                      winograd_conv, model_load_type, tensor_infos,
-                      model_data):
-    # Capture our current directory
-    print template_dir
-
-    # Create the jinja2 environment.
-    j2_env = Environment(
-        loader=FileSystemLoader(template_dir), trim_blocks=True)
-    j2_env.filters['stringfy'] = stringfy
-    output_dir = os.path.dirname(output) + '/'
-    # generate tensor source files
-    template_name = 'tensor_source.jinja2'
-    for i in range(len(net_def.tensors)):
-        if model_load_type == 'source':
-            source = j2_env.get_template(template_name).render(
-                tensor_info=tensor_infos[i],
-                tensor=net_def.tensors[i],
-                tag=model_tag,
-            )
-            with open(output_dir + 'tensor' + str(i) + '.cc', "wb") as f:
-                f.write(source)
-
-    if model_load_type == 'source':
-        # generate tensor data
-        template_name = 'tensor_data.jinja2'
-        source = j2_env.get_template(template_name).render(
-            tag=model_tag,
-            embed_model_data=embed_model_data,
-            model_data_size=len(model_data),
-            model_data=model_data)
-        with open(output_dir + 'tensor_data' + '.cc', "wb") as f:
-            f.write(source)
-
-        # generate op source files
-        template_name = 'operator.jinja2'
-        counter = 0
-        op_size = len(net_def.op)
-        for start in range(0, op_size, 10):
-            source = j2_env.get_template(template_name).render(
-                start=start,
-                end=min(start + 10, op_size),
-                net=net_def,
-                tag=model_tag,
-                runtime=runtime,
-            )
-            with open(output_dir + 'op' + str(counter) + '.cc', "wb") as f:
-                f.write(source)
-            counter += 1
-
-        # generate model source files
-        build_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
-        template_name = 'model.jinja2'
-        checksum = model_checksum
-        if weight_checksum is not None:
-            checksum = "{},{}".format(model_checksum, weight_checksum)
-        source = j2_env.get_template(template_name).render(
-            net=net_def,
-            tag=model_tag,
-            runtime=runtime,
-            obfuscate=obfuscate,
-            embed_model_data=embed_model_data,
-            winograd_conv=winograd_conv,
-            checksum=checksum,
-            build_time=build_time)
-        with open(output, "wb") as f:
-            f.write(source)
-
-        # generate model header file
-        template_name = 'model_header.jinja2'
-        source = j2_env.get_template(template_name).render(tag=model_tag, )
-        with open(output_dir + model_tag + '.h', "wb") as f:
-            f.write(source)
--- a/mace/utils/BUILD
+++ b/mace/utils/BUILD
@@ -31,28 +31,6 @@ cc_library(
    copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"],
    deps = [
        "//mace/public",
-    ],
-)
-
-cc_library(
-    name = "utils_dev",
-    srcs = [
-        "tuner_development.cc",
-    ],
-    copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"],
-    deps = [
-        ":utils",
-    ],
-)
-
-cc_library(
-    name = "utils_prod",
-    srcs = [
-        "tuner_production.cc",
-    ],
-    copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"],
-    deps = [
-        ":utils",
        "//mace/codegen:generated_tuning_params",
    ],
 )
@@ -70,7 +48,7 @@ cc_test(
    ]),
    linkstatic = 1,
    deps = [
-        ":utils_dev",
+        ":utils",
        "@gtest//:gtest",
        "@gtest//:gtest_main",
    ],

--- a/mace/utils/tuner.h
+++ b/mace/utils/tuner.h
@@ -29,10 +29,6 @@

 namespace mace {

-extern bool GetTuningParams(
-    const char *path,
-    std::unordered_map<std::string, std::vector<unsigned int>> *param_table);
-
 template <typename param_type>
 class Tuner {
 public:
@@ -74,9 +70,6 @@ class Tuner {
                        : "");
        return func(param_table_[obfucated_param_key], nullptr, nullptr);
      } else {
-#ifndef MACE_DISABLE_NO_TUNING_WARNING
-        LOG(WARNING) << "Fallback to default parameter: " << param_key;
-#endif
        return func(default_param, nullptr, nullptr);
      }
    }
@@ -124,9 +117,16 @@ class Tuner {
  }

  inline void ReadRunParamters() {
-    bool success = GetTuningParams(path_, &param_table_);
-    if (!success) {
-      LOG(WARNING) << "Get run parameter failed.";
+    extern const std::map<std::string, std::vector<unsigned int>>
+        kTuningParamsData;
+    if (!kTuningParamsData.empty()) {
+      for (auto it = kTuningParamsData.begin(); it != kTuningParamsData.end();
+           ++it) {
+        param_table_.emplace(it->first, std::vector<unsigned int>(
+            it->second.begin(), it->second.end()));
+      }
+    } else {
+      LOG(INFO) << "There is no tuned parameters.";
    }
  }


--- a/tools/bazel.rc
+++ b/tools/bazel.rc
+# Partially borrowed from tensorflow tools/bazel.rc
+
+# By default, we don't distinguish between target and host platforms.
+# When cross compiling, use --config=cross_compile to distinguish them.
+build --distinct_host_configuration=false
+build:cross_compile --distinct_host_configuration=true
+
+build --verbose_failures
+build --copt=-std=c++11
+build --copt=-D_GLIBCXX_USE_C99_MATH_TR1
+build --copt=-DMACE_OBFUSCATE_LITERALS
+
+# Usage example: bazel build --config android
+build:android --crosstool_top=//external:android/crosstool
+build:android --host_crosstool_top=@bazel_tools//tools/cpp:toolchain
+build:android --config=cross_compile
+
+# Usage example: bazel build --config optimization 
+build:optimization -c opt
+build:optimization --copt=-O3
+build:optimization --strip=always
+
+# Address sanitizer
+build:asan --strip=never
+build:asan --copt -fsanitize=address
+build:asan --copt -D_FORTIFY_SOURCE
+build:asan --copt -DADDRESS_SANITIZER
+build:asan --copt -O0
+build:asan --copt -g
+build:asan --copt -fno-omit-frame-pointer
+build:asan --linkopt -fsanitize=address
+
+# Thread sanitizer
+build:tsan --strip=never
+build:tsan --copt -fsanitize=thread
+build:tsan --copt -DTHREAD_SANITIZER
+build:tsan --copt -DDYNAMIC_ANNOTATIONS_ENABLED=1
+build:tsan --copt -DDYNAMIC_ANNOTATIONS_EXTERNAL_IMPL=1
+build:tsan --copt -O0
+build:tsan --copt -fno-omit-frame-pointer
+build:tsan --linkopt -fsanitize=thread
+
+# Memory sanitizer
+build:msan --strip=never
+build:msan --copt -fsanitize=memory
+build:msan --copt -DADDRESS_SANITIZER
+build:msan --copt -O0
+build:msan --copt -fno-omit-frame-pointer
+build:msan --linkopt -fsanitize=memory
+
+# Undefined Behavior Sanitizer
+build:ubsan --strip=never
+build:ubsan --copt -fsanitize=undefined
+build:ubsan --copt -O0
+build:ubsan --copt -fno-omit-frame-pointer
+build:ubsan --linkopt -fsanitize=undefined
+build:ubsan --linkopt -lubsan
--- a/tools/bazel_adb_run.py
+++ b/tools/bazel_adb_run.py
@@ -95,21 +95,6 @@ def parse_args():
        type=str2bool,
        default=False,
        help="Whether to run the target")
-    parser.add_argument(
-        "--valgrind",
-        type=bool,
-        default=False,
-        help="Whether to use valgrind to check memory error.")
-    parser.add_argument(
-        "--valgrind_path",
-        type=str,
-        default="/data/local/tmp/valgrind",
-        help="Valgrind install path.")
-    parser.add_argument(
-        "--valgrind_args",
-        type=str,
-        default="",
-        help="Valgrind command args.")
    parser.add_argument("--args", type=str, default="", help="Command args")
    parser.add_argument(
        "--stdout_processor",
@@ -121,6 +106,10 @@ def parse_args():
        type=str2bool,
        default=True,
        help="Whether to use neon optimization")
+    parser.add_argument(
+        '--address_sanitizer',
+        action="store_true",
+        help="Whether to enable AddressSanitizer")
    return parser.parse_known_args()


@@ -145,16 +134,17 @@ def main(unused_args):
    sh_commands.gen_encrypted_opencl_source()
    sh_commands.gen_compiled_opencl_source()
    sh_commands.gen_mace_version()
+    sh_commands.gen_tuning_param_code([])

    strip = "always"
    debug = False
-    if FLAGS.valgrind:
+    if FLAGS.address_sanitizer:
        strip = "never"
        debug = True
    for target_abi in target_abis:
-        sh_commands.bazel_build(target, strip=strip, abi=target_abi,
-                                disable_no_tuning_warning=True, debug=debug,
-                                enable_neon=FLAGS.enable_neon)
+        sh_commands.bazel_build(target, abi=target_abi,
+                                enable_neon=FLAGS.enable_neon,
+                                address_sanitizer=FLAGS.address_sanitizer)
        if FLAGS.run_target:
            for serialno in target_devices:
                if target_abi not in set(
@@ -162,28 +152,17 @@ def main(unused_args):
                    print("Skip device %s which does not support ABI %s" %
                          (serialno, target_abi))
                    continue
-                if FLAGS.valgrind:
-                    stdouts = sh_commands.adb_run_valgrind(
-                        serialno,
-                        host_bin_path,
-                        bin_name,
-                        valgrind_path=FLAGS.valgrind_path,
-                        valgrind_args=FLAGS.valgrind_args,
-                        args=FLAGS.args,
-                        opencl_profiling=1,
-                        vlog_level=0,
-                        device_bin_path="/data/local/tmp/mace",
-                        out_of_range_check=1)
-                else:
-                    stdouts = sh_commands.adb_run(
-                        serialno,
-                        host_bin_path,
-                        bin_name,
-                        args=FLAGS.args,
-                        opencl_profiling=1,
-                        vlog_level=0,
-                        device_bin_path="/data/local/tmp/mace",
-                        out_of_range_check=1)
+                stdouts = sh_commands.adb_run(
+                    target_abi,
+                    serialno,
+                    host_bin_path,
+                    bin_name,
+                    args=FLAGS.args,
+                    opencl_profiling=1,
+                    vlog_level=0,
+                    device_bin_path="/data/local/tmp/mace",
+                    out_of_range_check=1,
+                    address_sanitizer=FLAGS.address_sanitizer)
                device_properties = sh_commands.adb_getprop_by_serialno(
                    serialno)
                globals()[FLAGS.stdout_processor](stdouts, device_properties,

--- a/tools/common.py
+++ b/tools/common.py
@@ -13,23 +13,99 @@
 # limitations under the License.

 import enum
-import logging
 import re


 ################################
 # log
 ################################
-def init_logging():
-    logger = logging.getLogger('MACE')
-    logger.setLevel(logging.INFO)
+class CMDColors:
+    # ANSI escape sequences for terminal output. Prefix a message with a
+    # colour/style code and append ENDC to reset the terminal afterwards.
+    PURPLE = '\033[95m'
+    BLUE = '\033[94m'
+    GREEN = '\033[92m'
+    YELLOW = '\033[93m'
+    RED = '\033[91m'
+    # Resets all colours and styles.
+    ENDC = '\033[0m'
+    BOLD = '\033[1m'
+    UNDERLINE = '\033[4m'

-    ch = logging.StreamHandler()
-    ch.setLevel(logging.INFO)
-    formatter = logging.Formatter(
-        '%(asctime)s [%(name)s] [%(levelname)s]: %(message)s')
-    ch.setFormatter(formatter)
-    logger.addHandler(ch)
+
+class MaceLogger:
+    """Console logger that colours messages with CMDColors codes.
+
+    All methods print to stdout and expect `message` to be a string,
+    because it is concatenated with the colour codes.
+    """
+
+    @staticmethod
+    def header(message):
+        """Print message in purple."""
+        print(CMDColors.PURPLE + message + CMDColors.ENDC)
+
+    @staticmethod
+    def summary(message):
+        """Print message in green."""
+        print(CMDColors.GREEN + message + CMDColors.ENDC)
+
+    @staticmethod
+    def info(message):
+        """Print message without any colouring."""
+        print(message)
+
+    @staticmethod
+    def warning(message):
+        """Print message in yellow, prefixed with 'WARNING:'."""
+        print(CMDColors.YELLOW + 'WARNING:' + message + CMDColors.ENDC)
+
+    @staticmethod
+    def error(module, message):
+        """Print a red error tagged with module, then exit with status 1.
+
+        Raises:
+            SystemExit: always, with exit code 1.
+        """
+        # Imported here because sys is not among the file-level imports
+        # visible in this diff.
+        import sys
+        print(CMDColors.RED + 'ERROR: [' + module + '] '
+              + message + CMDColors.ENDC)
+        # sys.exit is always available; the bare exit() helper is only
+        # injected by the site module.
+        sys.exit(1)
+
+
+def mace_check(condition, module, message):
+    """Report `message` through MaceLogger.error if condition is falsy.
+
+    Does nothing when condition is truthy.
+    """
+    if condition:
+        return
+    MaceLogger.error(module, message)
+
+
+################################
+# String Formatter
+################################
+class StringFormatter:
+    """Helpers that lay out text as an ASCII table or a starred block."""
+
+    @staticmethod
+    def table(header, data, title, align="R"):
+        """Return header/data rendered as an ASCII table under title.
+
+        Every row of data must have as many elements as header. Each
+        column is as wide as its longest cell plus one. `align` selects
+        the cell alignment of the data rows: "R" (right), "L" (left) or
+        "C" (centre); any other value produces rows without cells.
+        """
+        column_size = len(header)
+        widths = [len(str(cell)) + 1 for cell in header]
+        for row in data:
+            assert(len(row) == column_size)
+            for i, cell in enumerate(row):
+                widths[i] = max(widths[i], len(str(cell)) + 1)
+
+        # Cell widths plus one '|' per column plus the closing '|'.
+        total_width = sum(widths) + column_size + 1
+        dash_line = '-' * total_width + '\n'
+        header_cells = [str(header[i]).center(widths[i])
+                        for i in range(column_size)]
+        pieces = [
+            dash_line,
+            str(title).center(total_width) + '\n',
+            dash_line,
+            '|' + '|'.join(header_cells) + '|\n',
+            '=' * total_width + '\n',
+        ]
+
+        justify = {
+            "R": str.rjust,
+            "L": str.ljust,
+            "C": str.center,
+        }.get(align)
+        for row in data:
+            if justify is None:
+                cells = []
+            else:
+                cells = [justify(str(cell), widths[i])
+                         for i, cell in enumerate(row)]
+            pieces.append('|' + '|'.join(cells) + "|\n" + dash_line)
+        return "".join(pieces)
+
+    @staticmethod
+    def block(message):
+        """Return message centred between two lines of asterisks."""
+        text = str(message)
+        line_length = 10 + len(text) + 10
+        star_line = '*' * line_length + '\n'
+        return star_line + text.center(line_length) + '\n' + star_line


 ################################

--- a/tools/example.yaml
+++ b/tools/example.yaml
-# example.yaml
-# Each yaml file describes a exported library (could be named [target_abi]/libmace-${filename}.a), 
-# which can contains more than one models
-# target_soc can get by `adb shell getprop | grep ro.board.platform | cut -d [ -f3 | cut -d ] -f1`
-target_abis: [armeabi-v7a, arm64-v8a]
-target_socs: [MSM8953]
-embed_model_data: 1
-models:
-  preview_net:
-    platform: tensorflow
-    model_file_path: path/to/model64.pb # also support http:// and https://
-    model_sha256_checksum: 05d92625809dc9edd6484882335c48c043397aed450a168d75eb8b538e86881a
-    input_nodes: input_node
-    output_nodes: output_node
-    input_shapes: 1,64,64,3
-    output_shapes: 1,64,64,2
-    runtime: gpu
-    limit_opencl_kernel_time: 0
-    dsp_mode: 0
-    obfuscate: 1
-    fast_conv: 0
-    validation_inputs_data:
-      - path/to/input_files
-  capture_net:
-    platform: caffe
-    model_file_path: path/to/model.prototxt
-    weight_file_path: path/to/weight.caffemodel
-    model_sha256_checksum: 05d92625809dc9edd6484882335c48c043397aed450a168d75eb8b538e86881a
-    weight_sha256_checksum: 05d92625809dc9edd6484882335c48c043397aed450a168d75eb8b538e86881a
-    input_nodes:
-      - input_node0
-      - input_node1
-    output_nodes:
-      - output_node0
-      - output_node1
-    input_shapes:
-      - 1,256,256,3
-      - 1,128,128,3
-    output_shapes:
-      - 1,256,256,2
-      - 1,1,1,2
-    runtime: cpu
-    limit_opencl_kernel_time: 1
-    dsp_mode: 0
-    obfuscate: 1
-    fast_conv: 0
--- a/tools/mace_tools.py
+++ b/tools/mace_tools.py
@@ -18,7 +18,6 @@
 #     --mode=all

 import argparse
-import enum
 import filelock
 import hashlib
 import os
@@ -28,48 +27,143 @@ import sys
 import urllib
 import yaml
 import re
+from enum import Enum

-import common
 import sh_commands
+from sh_commands import BuildType

-from ConfigParser import ConfigParser
+from common import CaffeEnvType
+from common import mace_check
+from common import MaceLogger
+from common import StringFormatter

-
-def get_target_socs(configs):
-    if "host" in configs["target_abis"]:
-        return [""]
-    else:
-        available_socs = sh_commands.adb_get_all_socs()
-        target_socs = available_socs
-        if "target_socs" in configs:
-            target_socs = set(configs["target_socs"])
-            target_socs = target_socs & available_socs
-
-        if FLAGS.target_socs != "all":
-            socs = set(FLAGS.target_socs.split(','))
-            target_socs = target_socs & socs
-            missing_socs = socs.difference(target_socs)
-            if len(missing_socs) > 0:
-                print(
-                    "Error: devices with SoCs are not connected %s" %
-                    missing_socs)
-                exit(1)
-
-        if not target_socs:
-            print("Error: no device to run")
-            exit(1)
-
-        return target_socs
+################################
+# common definitions
+################################
+# Directory and path name constants.
+# NOTE(review): presumably consumed by the build/run steps further down
+# in this file - confirm against the callers.
+BUILD_OUTPUT_DIR = 'build'
+PHONE_DATA_DIR = "/data/local/tmp/mace_run/"
+MODEL_OUTPUT_DIR_NAME = 'model'
+BUILD_TMP_DIR_NAME = '_tmp'
+BUILD_TMP_GENERAL_OUTPUT_DIR_NAME = 'general'
+OUTPUT_LIBRARY_DIR_NAME = 'library'
+
+ABITypeStrs = [
+    "armeabi-v7a",
+    "arm64-v8a",
+    "host",
+]
+# Built with the functional Enum API: each member's name and value are
+# both the string from ABITypeStrs, and type=str mixes in str.
+ABIType = Enum('ABIType', [(ele, ele) for ele in ABITypeStrs], type=str)
+
+
+PlatformTypeStrs = [
+    "tensorflow",
+    "caffe",
+]
+# Same construction as ABIType, for the entries of PlatformTypeStrs.
+PlatformType = Enum('PlatformType', [(ele, ele) for ele in PlatformTypeStrs],
+                    type=str)
+
+# Runtime names; the same strings are exposed as attributes of the
+# RuntimeType class.
+RuntimeTypeStrs = [
+    "cpu",
+    "gpu",
+    "dsp",
+    "cpu+gpu"
+]
+
+
+class RuntimeType(object):
+    # Plain string constants, one per entry of RuntimeTypeStrs.
+    cpu = 'cpu'
+    gpu = 'gpu'
+    dsp = 'dsp'
+    # 'cpu+gpu' is not a valid identifier, hence the cpu_gpu attribute name.
+    cpu_gpu = 'cpu+gpu'
+
+
+# Data type names listed for the CPU.
+CPUDataTypeStrs = [
+    "fp32",
+]
+
+# str-mixin Enum whose member names and values are the strings above.
+CPUDataType = Enum('CPUDataType', [(ele, ele) for ele in CPUDataTypeStrs],
+                   type=str)
+
+# Data type names listed for the GPU.
+GPUDataTypeStrs = [
+    "fp16_fp32",
+    "fp32_fp32",
+]
+
+# str-mixin Enum whose member names and values are the strings above.
+GPUDataType = Enum('GPUDataType', [(ele, ele) for ele in GPUDataTypeStrs],
+                   type=str)
+
+
+class DefaultValues(object):
+    """Default values, stored as plain ints.
+
+    NOTE(review): presumably the defaults of the matching command-line
+    options - confirm against the argument parser.
+    """
+    # No trailing commas here: `name = -1,` would create the tuple (-1,)
+    # instead of the int -1.
+    omp_num_threads = -1
+    cpu_affinity_policy = 1
+    gpu_perf_hint = 3
+    gpu_priority_hint = 3
+
+
+class YAMLKeyword(object):
+    # Key names of the deployment YAML file; every attribute's value is
+    # the literal key string, identical to the attribute name.
+    library_name = 'library_name'
+    target_abis = 'target_abis'
+    target_socs = 'target_socs'
+    build_type = 'build_type'
+    embed_model_data = 'embed_model_data'
+    models = 'models'
+    platform = 'platform'
+    model_file_path = 'model_file_path'
+    model_sha256_checksum = 'model_sha256_checksum'
+    weight_file_path = 'weight_file_path'
+    weight_sha256_checksum = 'weight_sha256_checksum'
+    subgraphs = 'subgraphs'
+    input_tensors = 'input_tensors'
+    input_shapes = 'input_shapes'
+    output_tensors = 'output_tensors'
+    output_shapes = 'output_shapes'
+    runtime = 'runtime'
+    data_type = 'data_type'
+    limit_opencl_kernel_time = 'limit_opencl_kernel_time'
+    nnlib_graph_mode = 'nnlib_graph_mode'
+    obfuscate = 'obfuscate'
+    winograd = 'winograd'
+    validation_inputs_data = 'validation_inputs_data'
+
+
+class ModuleName(object):
+    # Human-readable module labels.
+    # NOTE(review): presumably passed as the `module` argument of
+    # mace_check / MaceLogger.error - confirm against the callers.
+    YAML_CONFIG = 'YAML CONFIG'
+    MODEL_CONVERTER = 'Model Converter'
+
+
+# C++ keywords followed by preprocessor directive names. 'if' and 'else'
+# occur in both groups, so the list contains them twice.
+# NOTE(review): presumably used to reject names that would clash with
+# generated C++ code - confirm against the callers.
+CPP_KEYWORDS = [
+    'alignas', 'alignof', 'and', 'and_eq', 'asm', 'atomic_cancel',
+    'atomic_commit', 'atomic_noexcept', 'auto', 'bitand', 'bitor',
+    'bool', 'break', 'case', 'catch', 'char', 'char16_t', 'char32_t',
+    'class', 'compl', 'concept', 'const', 'constexpr', 'const_cast',
+    'continue', 'co_await', 'co_return', 'co_yield', 'decltype', 'default',
+    'delete', 'do', 'double', 'dynamic_cast', 'else', 'enum', 'explicit',
+    'export', 'extern', 'false', 'float', 'for', 'friend', 'goto', 'if',
+    'import', 'inline', 'int', 'long', 'module', 'mutable', 'namespace',
+    'new', 'noexcept', 'not', 'not_eq', 'nullptr', 'operator', 'or', 'or_eq',
+    'private', 'protected', 'public', 'register', 'reinterpret_cast',
+    'requires', 'return', 'short', 'signed', 'sizeof', 'static',
+    'static_assert', 'static_cast', 'struct', 'switch', 'synchronized',
+    'template', 'this', 'thread_local', 'throw', 'true', 'try', 'typedef',
+    'typeid', 'typename', 'union', 'unsigned', 'using', 'virtual', 'void',
+    'volatile', 'wchar_t', 'while', 'xor', 'xor_eq', 'override', 'final',
+    'transaction_safe', 'transaction_safe_dynamic', 'if', 'elif', 'else',
+    'endif', 'defined', 'ifdef', 'ifndef', 'define', 'undef', 'include',
+    'line', 'error', 'pragma',
+]


+################################
+# common functions
+################################
 def parse_device_type(runtime):
+    """Map a runtime name to its device type string.
+
+    Returns "HEXAGON" for dsp, "GPU" for gpu and "CPU" for cpu; any other
+    runtime (including 'cpu+gpu') yields an empty string.
+    """
     device_type = ""
 
-    if runtime == "dsp":
+    if runtime == RuntimeType.dsp:
         device_type = "HEXAGON"
-    elif runtime == "gpu":
+    elif runtime == RuntimeType.gpu:
         device_type = "GPU"
-    elif runtime == "cpu":
+    elif runtime == RuntimeType.cpu:
         device_type = "CPU"
 
     return device_type
@@ -81,224 +175,272 @@ def get_hexagon_mode(configs):
        model_runtime = configs["models"][model_name].get("runtime", "")
        runtime_list.append(model_runtime.lower())

-    global_runtime = ""
    if "dsp" in runtime_list:
        return True
    return False


-def gen_opencl_and_tuning_code(target_abi,
-                               serialno,
-                               model_output_dirs,
-                               pull_or_not):
-    cl_built_kernel_file_name = "mace_cl_compiled_program.bin"
-    cl_platform_info_file_name = "mace_cl_platform_info.txt"
-    if pull_or_not:
-        sh_commands.pull_binaries(target_abi, serialno, model_output_dirs,
-                                  cl_built_kernel_file_name,
-                                  cl_platform_info_file_name)
-
-    # generate opencl binary code
-    sh_commands.gen_opencl_binary_code(model_output_dirs,
-                                       cl_built_kernel_file_name,
-                                       cl_platform_info_file_name)
+def md5sum(str):
+    """Return the hexadecimal MD5 digest of ``str``.
+
+    ``str`` may be a byte string or a text string. Text is encoded as UTF-8
+    before hashing, because hashlib only accepts bytes-like input on
+    Python 3. Byte strings are hashed unchanged, so existing digests (and
+    the directory names derived from them) stay the same.
+    """
+    # The parameter name shadows the builtin ``str`` (kept so that keyword
+    # callers keep working), hence the check against ``bytes``.
+    data = str if isinstance(str, bytes) else str.encode("utf-8")
+    return hashlib.md5(data).hexdigest()

-    sh_commands.gen_tuning_param_code(model_output_dirs)

+def sha256_checksum(fname):
+    """Return the hexadecimal SHA-256 digest of the file at ``fname``."""
+    digest = hashlib.sha256()
+    with open(fname, "rb") as stream:
+        # Feed the hash 4096 bytes at a time until read() hits end of file.
+        block = stream.read(4096)
+        while block != b"":
+            digest.update(block)
+            block = stream.read(4096)
+    return digest.hexdigest()

-def model_benchmark_stdout_processor(stdout,
-                                     abi,
-                                     serialno,
-                                     model_name,
-                                     device_type):
-    metrics = [0] * 3
-    for line in stdout.split('\n'):
-        line = line.strip()
-        parts = line.split()
-        if len(parts) == 4 and parts[0].startswith("time"):
-            metrics[0] = str(float(parts[1]))
-            metrics[1] = str(float(parts[2]))
-            metrics[2] = str(float(parts[3]))
-            break

-    device_name = ""
-    target_soc = ""
-    if abi != "host":
-        props = sh_commands.adb_getprop_by_serialno(serialno)
-        device_name = props.get("ro.product.model", "")
-        target_soc = props.get("ro.board.platform", "")
+def format_model_config(config_file_path):
+    """Load, validate and normalize the YAML deployment configuration.
+
+    The file at ``config_file_path`` is parsed and checked section by
+    section (library name, target ABIs and SoCs, build type,
+    embed_model_data, then every model and its subgraphs). Optional
+    entries are filled with defaults and scalar entries that are consumed
+    as lists are wrapped in a list, so later stages can index the result
+    without further checks.
+
+    Problems are reported through mace_check / MaceLogger.error.
+
+    Returns the normalized configuration dictionary.
+    """
+    with open(config_file_path) as f:
+        # NOTE(review): yaml.load without an explicit Loader can construct
+        # arbitrary Python objects; only use this on trusted config files.
+        configs = yaml.load(f)

-    report_filename = FLAGS.output_dir + "/report.csv"
-    if not os.path.exists(report_filename):
-        with open(report_filename, 'w') as f:
-            f.write("model_name,device_name,soc,abi,runtime,"
-                    "init,warmup,run_avg\n")
+    library_name = configs.get(YAMLKeyword.library_name, "")
+    mace_check(len(library_name) > 0,
+               ModuleName.YAML_CONFIG, "library name should not be empty")
+
+    target_abis = configs.get(YAMLKeyword.target_abis, [])
+    mace_check((isinstance(target_abis, list) and len(target_abis) > 0),
+               ModuleName.YAML_CONFIG, "target_abis list is needed")
+    for abi in target_abis:
+        mace_check(abi in ABITypeStrs,
+                   ModuleName.YAML_CONFIG,
+                   "target_abis must be in " + str(ABITypeStrs))
+
+    # target_socs is optional: a missing value becomes an empty list and a
+    # single value becomes a one-element list.
+    target_socs = configs.get(YAMLKeyword.target_socs, "")
+    if not target_socs:
+        configs[YAMLKeyword.target_socs] = []
+    elif not isinstance(target_socs, list):
+        configs[YAMLKeyword.target_socs] = [target_socs]
+
+    # Every requested SoC must be present among the attached devices,
+    # unless the host ABI is among the targets.
+    if ABIType.host not in target_abis:
+        available_socs = sh_commands.adb_get_all_socs()
+        if YAMLKeyword.target_socs in configs:
+            target_socs = set(configs[YAMLKeyword.target_socs])
+            for soc in target_socs:
+                mace_check(soc in available_socs,
+                           ModuleName.YAML_CONFIG,
+                           "Build specified SOC library, "
+                           "you must plug in a phone using the SOC")
+
+    build_type = BuildType.code
+    build_type_str = configs.get(YAMLKeyword.build_type, "")
+    if build_type_str == BuildType.proto:
+        build_type = BuildType.proto
+    elif build_type_str == BuildType.code:
+        build_type = BuildType.code
+    else:
+        MaceLogger.error(ModuleName.YAML_CONFIG,
+                         "Invalid build type " + build_type_str
+                         + ". only support [proto|code] format, "
+                         + "proto for converting model to ProtoBuf file, "
+                         + "code for converting model to c++ code.")
+
+    embed_model_data = configs.get(YAMLKeyword.embed_model_data, "")
+    if embed_model_data == "" or not isinstance(embed_model_data, int) or \
+       embed_model_data < 0 or embed_model_data > 1:
+        # convert_model moves the model data file to the output directory
+        # only when the flag is 0, so 1 is the "embed" setting.
+        MaceLogger.error(ModuleName.YAML_CONFIG,
+                         "embed_model_data must be 0 or 1. "
+                         "1 for embed model data to code, 0 not.")
+    # proto builds never embed the model data.
+    if build_type == BuildType.proto:
+        configs[YAMLKeyword.embed_model_data] = 0
+
+    model_names = configs.get(YAMLKeyword.models, [])
+    mace_check(len(model_names) > 0, ModuleName.YAML_CONFIG,
+               "no model found in config file")
+
+    model_name_reg = re.compile(r'^[a-z0-9_]+$')
+    for model_name in model_names:
+        # check model_name legality
+        mace_check(model_name not in CPP_KEYWORDS,
+                   ModuleName.YAML_CONFIG,
+                   "model name should not be c++ keyword.")
+        mace_check((model_name[0] == '_' or model_name[0].isalpha())
+                   and bool(model_name_reg.match(model_name)),
+                   ModuleName.YAML_CONFIG,
+                   "model name should meet the c++ naming convention"
+                   " which start with '_' or alpha"
+                   " and only contain alpha, number and '_'")
+
+        model_config = configs[YAMLKeyword.models][model_name]
+        platform = model_config.get(YAMLKeyword.platform, "")
+        mace_check(platform in PlatformTypeStrs,
+                   ModuleName.YAML_CONFIG,
+                   "'platform' must be in " + str(PlatformTypeStrs))
+
+        for key in [YAMLKeyword.model_file_path,
+                    YAMLKeyword.model_sha256_checksum]:
+            value = model_config.get(key, "")
+            mace_check(value != "", ModuleName.YAML_CONFIG,
+                       "'%s' is necessary" % key)
+
+        # A weight file requires its checksum; without a weight file the
+        # checksum defaults to an empty string.
+        weight_file_path = model_config.get(YAMLKeyword.weight_file_path, "")
+        if weight_file_path:
+            weight_checksum =\
+                model_config.get(YAMLKeyword.weight_sha256_checksum, "")
+            mace_check(weight_checksum != "", ModuleName.YAML_CONFIG,
+                       "'%s' is necessary" %
+                       YAMLKeyword.weight_sha256_checksum)
+        else:
+            model_config[YAMLKeyword.weight_sha256_checksum] = ""
+
+        runtime = model_config.get(YAMLKeyword.runtime, "")
+        mace_check(runtime in RuntimeTypeStrs,
+                   ModuleName.YAML_CONFIG,
+                   "'runtime' must be in " + str(RuntimeTypeStrs))
+        if ABIType.host in target_abis:
+            mace_check(runtime == RuntimeType.cpu,
+                       ModuleName.YAML_CONFIG,
+                       "host only support cpu runtime now.")
+
+        # Validate data_type against the runtime, or fill in the default
+        # for that runtime when it is missing.
+        data_type = model_config.get(YAMLKeyword.data_type, "")
+        if runtime == RuntimeType.cpu_gpu and data_type not in GPUDataTypeStrs:
+            model_config[YAMLKeyword.data_type] = \
+                GPUDataType.fp16_fp32.value
+        elif runtime == RuntimeType.cpu:
+            if len(data_type) > 0:
+                mace_check(data_type in CPUDataTypeStrs,
+                           ModuleName.YAML_CONFIG,
+                           "'data_type' must be in " + str(CPUDataTypeStrs)
+                           + " for cpu runtime")
+            else:
+                model_config[YAMLKeyword.data_type] = \
+                    CPUDataType.fp32.value
+        elif runtime == RuntimeType.gpu:
+            if len(data_type) > 0:
+                mace_check(data_type in GPUDataTypeStrs,
+                           ModuleName.YAML_CONFIG,
+                           "'data_type' must be in " + str(GPUDataTypeStrs)
+                           + " for gpu runtime")
+            else:
+                model_config[YAMLKeyword.data_type] =\
+                    GPUDataType.fp16_fp32.value
+
+        subgraphs = model_config.get(YAMLKeyword.subgraphs, "")
+        mace_check(len(subgraphs) > 0, ModuleName.YAML_CONFIG,
+                   "at least one subgraph is needed")
+
+        # Tensor names and shapes are mandatory and always stored as lists.
+        for subgraph in subgraphs:
+            for key in [YAMLKeyword.input_tensors,
+                        YAMLKeyword.input_shapes,
+                        YAMLKeyword.output_tensors,
+                        YAMLKeyword.output_shapes]:
+                value = subgraph.get(key, "")
+                mace_check(value != "", ModuleName.YAML_CONFIG,
+                           "'%s' is necessary in subgraph" % key)
+                if not isinstance(value, list):
+                    subgraph[key] = [value]

-    data_str = "{model_name},{device_name},{soc},{abi},{device_type}," \
-               "{init},{warmup},{run_avg}\n" \
-        .format(
-            model_name=model_name,
-            device_name=device_name,
-            soc=target_soc,
-            abi=abi,
-            device_type=device_type,
-            init=metrics[0],
-            warmup=metrics[1],
-            run_avg=metrics[2]
-        )
-    with open(report_filename, 'a') as f:
-        f.write(data_str)
+        # Optional per-model switches default to 0.
+        for key in [YAMLKeyword.limit_opencl_kernel_time,
+                    YAMLKeyword.nnlib_graph_mode,
+                    YAMLKeyword.obfuscate,
+                    YAMLKeyword.winograd]:
+            value = model_config.get(key, "")
+            if value == "":
+                model_config[key] = 0

+        validation_inputs_data = model_config.get(
+            YAMLKeyword.validation_inputs_data, [])
+        model_config[YAMLKeyword.validation_inputs_data] = \
+            validation_inputs_data
+        if not isinstance(validation_inputs_data, list):
+            model_config[YAMLKeyword.validation_inputs_data] = [
+                validation_inputs_data]

-def tuning_run(target_abi,
-               serialno,
-               vlog_level,
-               embed_model_data,
-               model_output_dir,
-               input_nodes,
-               output_nodes,
-               input_shapes,
-               output_shapes,
-               mace_model_dir,
-               model_name,
-               device_type,
-               running_round,
-               restart_round,
-               out_of_range_check,
-               phone_data_dir,
-               tuning=False,
-               limit_opencl_kernel_time=0,
-               omp_num_threads=-1,
-               cpu_affinity_policy=1,
-               gpu_perf_hint=3,
-               gpu_priority_hint=3,
-               runtime_failure_ratio=0.0):
-    stdout = sh_commands.tuning_run(
-        target_abi,
-        serialno,
-        vlog_level,
-        embed_model_data,
-        model_output_dir,
-        input_nodes,
-        output_nodes,
-        input_shapes,
-        output_shapes,
-        mace_model_dir,
-        model_name,
-        device_type,
-        running_round,
-        restart_round,
-        limit_opencl_kernel_time,
-        tuning,
-        out_of_range_check,
-        phone_data_dir,
-        omp_num_threads,
-        cpu_affinity_policy,
-        gpu_perf_hint,
-        gpu_priority_hint,
-        runtime_failure_ratio,
-        valgrind=FLAGS.valgrind,
-        valgrind_path=FLAGS.valgrind_path,
-        valgrind_args=FLAGS.valgrind_args
-    )
+        # Make sure the key exists so later stages can index it directly.
+        weight_file_path = model_config.get(YAMLKeyword.weight_file_path, "")
+        model_config[YAMLKeyword.weight_file_path] = weight_file_path

-    if running_round > 0 and FLAGS.collect_report:
-        model_benchmark_stdout_processor(
-            stdout, target_abi, serialno, model_name, device_type)
+    return configs


-def build_mace_run_prod(hexagon_mode, runtime, target_abi, serialno,
-                        vlog_level, embed_model_data, model_load_type,
-                        model_output_dir, input_nodes, output_nodes,
-                        input_shapes, output_shapes, mace_model_dir,
-                        model_name, device_type, running_round, restart_round,
-                        tuning, limit_opencl_kernel_time, phone_data_dir,
-                        enable_openmp):
-    mace_run_target = "//mace/tools/validation:mace_run"
-    strip = "always"
-    debug = False
-    if FLAGS.valgrind:
-        strip = "never"
-        debug = True
-
-    if not runtime or runtime == "gpu":
-        gen_opencl_and_tuning_code(target_abi, serialno, [], False)
-        sh_commands.bazel_build(
-            mace_run_target,
-            abi=target_abi,
-            production_mode=False,
-            hexagon_mode=hexagon_mode,
-            enable_openmp=enable_openmp
-        )
-        sh_commands.update_mace_run_lib(model_output_dir, model_load_type,
-                                        model_name, embed_model_data)
-
-        device_type = parse_device_type("gpu")
-        tuning_run(target_abi, serialno, vlog_level, embed_model_data,
-                   model_output_dir, input_nodes, output_nodes, input_shapes,
-                   output_shapes, mace_model_dir, model_name, device_type,
-                   running_round=0, restart_round=1, out_of_range_check=False,
-                   phone_data_dir=phone_data_dir, tuning=tuning,
-                   limit_opencl_kernel_time=limit_opencl_kernel_time)
-
-        gen_opencl_and_tuning_code(target_abi, serialno, [model_output_dir],
-                                   True)
-        sh_commands.bazel_build(
-            mace_run_target,
-            strip,
-            abi=target_abi,
-            production_mode=True,
-            hexagon_mode=hexagon_mode,
-            debug=debug,
-            enable_openmp=enable_openmp
-        )
-        sh_commands.update_mace_run_lib(model_output_dir, model_load_type,
-                                        model_name, embed_model_data)
+def get_build_binary_dir(library_name, target_abi, target_soc,
+                         serial_num):
+    """Return the temporary directory that holds the built binaries.
+
+    The path is BUILD_OUTPUT_DIR/<library_name>/BUILD_TMP_DIR_NAME/<leaf>.
+    When either ``target_soc`` or ``serial_num`` is missing, <leaf> is the
+    MD5 digest of ``target_abi``. Otherwise it is
+    "<device name>_<target_soc>_<digest>", where the device name is looked
+    up by serial number (spaces removed) and the digest covers the ABI,
+    the SoC and the serial number.
+    """
+    if not target_soc or not serial_num:
+        binary_path_digest = md5sum(target_abi)
    else:
-        gen_opencl_and_tuning_code(target_abi, serialno, [], False)
-        sh_commands.bazel_build(
-            mace_run_target,
-            strip,
-            abi=target_abi,
-            production_mode=True,
-            hexagon_mode=hexagon_mode,
-            debug=debug,
-            enable_openmp=enable_openmp
-        )
-        sh_commands.update_mace_run_lib(model_output_dir, model_load_type,
-                                        model_name, embed_model_data)
-
-
-def merge_libs_and_tuning_results(target_soc,
-                                  target_abi,
-                                  serialno,
-                                  project_name,
-                                  output_dir,
-                                  model_output_dirs,
-                                  mace_model_dirs_kv,
-                                  model_load_type,
-                                  hexagon_mode,
-                                  embed_model_data):
-    gen_opencl_and_tuning_code(
-            target_abi, serialno, model_output_dirs, False)
-    sh_commands.build_production_code(model_load_type, target_abi)
+        device_name = sh_commands.adb_get_device_name_by_serialno(serial_num)\
+                .replace(' ', '')
+        binary_path_digest = md5sum(target_abi + target_soc + serial_num)
+        binary_path_digest = "%s_%s_%s" % \
+                             (device_name, target_soc, binary_path_digest)
+    return "%s/%s/%s/%s" % (
+        BUILD_OUTPUT_DIR, library_name, BUILD_TMP_DIR_NAME, binary_path_digest)
+
+
+def get_build_model_dirs(library_name, model_name, target_abi, target_soc,
+                         serial_num, model_file_path):
+    """Return the directories used while building one model.
+
+    Returns a tuple ``(model_output_base_dir, model_output_dir,
+    mace_model_dir)``:
+
+    * model_output_base_dir: per-model temporary directory, keyed by the
+      model name and the MD5 digest of ``model_file_path``.
+    * model_output_dir: sub-directory of the base directory. It is the ABI
+      name for the host ABI, the general output directory plus the ABI
+      when no SoC or serial number is given, and otherwise
+      "<device name>_<target_soc>/<target_abi>" (device name without
+      spaces).
+    * mace_model_dir: the library-wide model output directory.
+    """
+    model_path_digest = md5sum(model_file_path)
+    model_output_base_dir = "%s/%s/%s/%s/%s" % (
+        BUILD_OUTPUT_DIR, library_name, BUILD_TMP_DIR_NAME,
+        model_name, model_path_digest)
+
+    if target_abi == ABIType.host:
+        model_output_dir = "%s/%s" % (model_output_base_dir, target_abi)
+    elif not target_soc or not serial_num:
+        model_output_dir = "%s/%s/%s" % (
+            model_output_base_dir, BUILD_TMP_GENERAL_OUTPUT_DIR_NAME,
+            target_abi)
+    else:
+        device_name = \
+            sh_commands.adb_get_device_name_by_serialno(serial_num)
+        model_output_dir = "%s/%s_%s/%s" % (
+            model_output_base_dir, device_name.replace(' ', ''),
+            target_soc, target_abi)

-    sh_commands.merge_libs(target_soc,
-                           target_abi,
-                           project_name,
-                           output_dir,
-                           model_output_dirs,
-                           mace_model_dirs_kv,
-                           model_load_type,
-                           hexagon_mode,
-                           embed_model_data)
+    mace_model_dir = \
+        '%s/%s/%s' % (BUILD_OUTPUT_DIR, library_name, MODEL_OUTPUT_DIR_NAME)
+
+    return model_output_base_dir, model_output_dir, mace_model_dir
+
+
+################################
+# build
+################################
+def pull_opencl_binary_and_tuning_param(target_abi,
+                                        serialno,
+                                        model_output_dirs):
+    """Pull the OpenCL kernel binary and platform info files.
+
+    Delegates to sh_commands.pull_binaries with the fixed names of the
+    compiled OpenCL program file and the OpenCL platform info file.
+    """
+    sh_commands.pull_binaries(
+        target_abi, serialno, model_output_dirs,
+        "mace_cl_compiled_program.bin",
+        "mace_cl_platform_info.txt")
+
+
+def gen_opencl_and_tuning_code(model_output_dirs):
+    """Generate the OpenCL binary code and the tuning parameter code.
+
+    Both generators are run over ``model_output_dirs``.
+    """
+    kernel_file, platform_file = (
+        "mace_cl_compiled_program.bin", "mace_cl_platform_info.txt")
+
+    # The OpenCL binary code is generated first, the tuning params second.
+    sh_commands.gen_opencl_binary_code(
+        model_output_dirs, kernel_file, platform_file)
+    sh_commands.gen_tuning_param_code(model_output_dirs)
+
+
+def print_configuration(flags, configs):
+    """Log a key/value table of the common build settings.
+
+    The table lists the library name, target ABIs, target SoCs, build type
+    and embed_model_data from ``configs``, followed by ``flags.tuning``.
+    """
+    common_keys = (
+        YAMLKeyword.library_name,
+        YAMLKeyword.target_abis,
+        YAMLKeyword.target_socs,
+        YAMLKeyword.build_type,
+        YAMLKeyword.embed_model_data,
+    )
+    rows = [[key, configs[key]] for key in common_keys]
+    rows.append(["Tuning", flags.tuning])
+    MaceLogger.summary(
+        StringFormatter.table(["key", "value"], rows, "Common Configuration"))


 def download_model_files(model_file_path,
                         model_output_dir,
                         weight_file_path=""):
-    model_file = ""
-    weight_file = ""
    if model_file_path.startswith("http://") or \
            model_file_path.startswith("https://"):
        model_file = model_output_dir + "/model.pb"
@@ -313,8 +455,6 @@ def download_model_files(model_file_path,
 def get_model_files_path(model_file_path,
                         model_output_dir,
                         weight_file_path=""):
-    model_file = ""
-    weight_file = ""
    if model_file_path.startswith("http://") or \
            model_file_path.startswith("https://"):
        model_file = model_output_dir + "/model.pb"
@@ -330,10 +470,540 @@ def get_model_files_path(model_file_path,
    return model_file, weight_file


-def md5sum(str):
-    md5 = hashlib.md5()
-    md5.update(str)
-    return md5.hexdigest()
+def convert_model(configs):
+    """Convert every model in ``configs`` into generated model code.
+
+    For each model the function recreates its temporary build directory,
+    fetches the model (and weight) file, compares the SHA-256 checksums
+    with the configured ones and runs sh_commands.gen_model_code on the
+    first subgraph. When embed_model_data is 0 the generated model files
+    are moved to the library's model output directory, which is recreated
+    at the start of the call.
+    """
+    # Remove previous output dirs
+    library_name = configs[YAMLKeyword.library_name]
+    if not os.path.exists(BUILD_OUTPUT_DIR):
+        os.makedirs(BUILD_OUTPUT_DIR)
+    elif not os.path.exists(os.path.join(BUILD_OUTPUT_DIR, library_name)):
+        os.makedirs(os.path.join(BUILD_OUTPUT_DIR, library_name))
+
+    model_output_dir = \
+        '%s/%s/%s' % (BUILD_OUTPUT_DIR, library_name, MODEL_OUTPUT_DIR_NAME)
+    if os.path.exists(model_output_dir):
+        sh.rm("-rf", model_output_dir)
+    os.makedirs(model_output_dir)
+
+    embed_model_data = configs[YAMLKeyword.embed_model_data]
+
+    sh_commands.clear_model_codegen()
+    for model_name in configs[YAMLKeyword.models]:
+        MaceLogger.header(
+            StringFormatter.block("Convert %s model" % model_name))
+        model_config = configs[YAMLKeyword.models][model_name]
+        runtime = model_config[YAMLKeyword.runtime]
+
+        # Create model build directory
+        model_path_digest = md5sum(
+            model_config[YAMLKeyword.model_file_path])
+
+        model_output_base_dir = "%s/%s/%s/%s/%s" % (
+            BUILD_OUTPUT_DIR, library_name, BUILD_TMP_DIR_NAME,
+            model_name, model_path_digest)
+
+        if os.path.exists(model_output_base_dir):
+            sh.rm("-rf", model_output_base_dir)
+        os.makedirs(model_output_base_dir)
+
+        download_model_files(
+            model_config[YAMLKeyword.model_file_path],
+            model_output_base_dir,
+            model_config[YAMLKeyword.weight_file_path])
+
+        model_file_path, weight_file_path = get_model_files_path(
+            model_config[YAMLKeyword.model_file_path],
+            model_output_base_dir,
+            model_config[YAMLKeyword.weight_file_path])
+
+        # Compare the files on disk with the checksums from the config.
+        if sha256_checksum(model_file_path) != \
+                model_config[YAMLKeyword.model_sha256_checksum]:
+            MaceLogger.error(ModuleName.MODEL_CONVERTER,
+                             "model file sha256checksum not match")
+
+        if weight_file_path:
+            if sha256_checksum(weight_file_path) != \
+                    model_config[YAMLKeyword.weight_sha256_checksum]:
+                MaceLogger.error(ModuleName.MODEL_CONVERTER,
+                                 "weight file sha256checksum not match")
+
+        # The host ABI overrides the configured data type with fp32.
+        data_type = model_config[YAMLKeyword.data_type]
+        if ABIType.host.value in configs[YAMLKeyword.target_abis]:
+            data_type = CPUDataType.fp32.value
+        # TODO(liuqi): support multiple subgraphs
+        subgraphs = model_config[YAMLKeyword.subgraphs]
+
+        # Tensor names are joined with ',' and input shapes with ':'.
+        model_codegen_dir = "mace/codegen/models/%s" % model_name
+        sh_commands.gen_model_code(
+            model_codegen_dir,
+            model_config[YAMLKeyword.platform],
+            model_file_path,
+            weight_file_path,
+            model_config[YAMLKeyword.model_sha256_checksum],
+            model_config[YAMLKeyword.weight_sha256_checksum],
+            ",".join(subgraphs[0][YAMLKeyword.input_tensors]),
+            ",".join(subgraphs[0][YAMLKeyword.output_tensors]),
+            runtime,
+            model_name,
+            ":".join(subgraphs[0][YAMLKeyword.input_shapes]),
+            model_config[YAMLKeyword.nnlib_graph_mode],
+            embed_model_data,
+            model_config[YAMLKeyword.winograd],
+            model_config[YAMLKeyword.obfuscate],
+            configs[YAMLKeyword.build_type],
+            data_type)
+
+        # mv pb and data file to build/model_name/model
+        if not embed_model_data:
+            sh_commands.mv_model_file_to_output_dir(
+                model_build_type=configs[YAMLKeyword.build_type],
+                model_codegen_dir=model_codegen_dir,
+                model_name=model_name,
+                output_dir=model_output_dir
+            )
+
+        MaceLogger.header(
+            StringFormatter.block("Model %s converted" % model_name))
+
+
+def build_specific_lib(target_abi, target_soc, serial_num,
+                       configs, tuning, enable_openmp,
+                       address_sanitizer):
+    """Build mace_run and the library for one ABI / SoC combination.
+
+    Steps, in order:
+
+    1. Recreate the temporary binary directory and build mace_run with
+       OpenCL / tuning code generated from no model output directories.
+    2. For every model, recreate its output directory. For gpu and cpu+gpu
+       models on a non-host ABI with a target SoC (and without the address
+       sanitizer), run the model once on the device through
+       sh_commands.tuning_run and pull the OpenCL binary files.
+    3. If step 2 pulled anything, regenerate the OpenCL / tuning code from
+       the model output directories and rebuild mace_run.
+    4. Build the host libraries (host ABI only), the benchmark_model
+       binary, and merge the libraries.
+    """
+    mace_run_target = "//mace/tools/validation:mace_run"
+    library_name = configs[YAMLKeyword.library_name]
+    build_type = configs[YAMLKeyword.build_type]
+    embed_model_data = configs[YAMLKeyword.embed_model_data]
+    hexagon_mode = get_hexagon_mode(configs)
+    model_output_dirs = []
+
+    build_tmp_binary_dir = get_build_binary_dir(library_name, target_abi,
+                                                target_soc, serial_num)
+    if os.path.exists(build_tmp_binary_dir):
+        sh.rm("-rf", build_tmp_binary_dir)
+    os.makedirs(build_tmp_binary_dir)
+
+    # First build: generated code comes from an empty directory list.
+    gen_opencl_and_tuning_code([])
+    sh_commands.bazel_build(
+        mace_run_target,
+        abi=target_abi,
+        hexagon_mode=hexagon_mode,
+        enable_openmp=enable_openmp,
+        address_sanitizer=address_sanitizer
+    )
+    sh_commands.update_mace_run_lib(build_tmp_binary_dir)
+    # Set once any model has pulled OpenCL binaries from the device.
+    binary_changed = False
+
+    for model_name in configs[YAMLKeyword.models]:
+        model_config = configs[YAMLKeyword.models][model_name]
+        model_runtime = model_config[YAMLKeyword.runtime]
+        # Create model build directory
+        model_output_base_dir, model_output_dir, mace_model_dir = \
+            get_build_model_dirs(library_name, model_name, target_abi,
+                                 target_soc, serial_num,
+                                 model_config[YAMLKeyword.model_file_path])
+
+        model_output_dirs.append(model_output_dir)
+
+        if os.path.exists(model_output_dir):
+            sh.rm("-rf", model_output_dir)
+        os.makedirs(model_output_dir)
+
+        # build for specified soc
+        if not address_sanitizer and target_abi != ABIType.host \
+                and target_soc is not None and \
+                model_runtime in [RuntimeType.gpu, RuntimeType.cpu_gpu]:
+            sh_commands.clear_phone_data_dir(serial_num, PHONE_DATA_DIR)
+
+            subgraphs = model_config[YAMLKeyword.subgraphs]
+            # generate input data
+            input_file_list = model_config[YAMLKeyword.validation_inputs_data]
+            sh_commands.gen_random_input(
+                model_output_dir,
+                subgraphs[0][YAMLKeyword.input_tensors],
+                subgraphs[0][YAMLKeyword.input_shapes],
+                input_file_list)
+
+            # This run always uses the GPU device type, whatever the
+            # model's own runtime is (gpu or cpu+gpu).
+            device_type = parse_device_type(RuntimeType.gpu)
+            sh_commands.tuning_run(
+                abi=target_abi,
+                serialno=serial_num,
+                mace_run_dir=build_tmp_binary_dir,
+                vlog_level=0,
+                embed_model_data=embed_model_data,
+                model_output_dir=model_output_dir,
+                input_nodes=subgraphs[0][YAMLKeyword.input_tensors],
+                output_nodes=subgraphs[0][YAMLKeyword.output_tensors],
+                input_shapes=subgraphs[0][YAMLKeyword.input_shapes],
+                output_shapes=subgraphs[0][YAMLKeyword.output_shapes],
+                mace_model_dir=mace_model_dir,
+                model_tag=model_name,
+                device_type=device_type,
+                running_round=0,
+                restart_round=1,
+                limit_opencl_kernel_time=model_config[YAMLKeyword.limit_opencl_kernel_time],  # noqa
+                tuning=tuning,
+                out_of_range_check=False,
+                phone_data_dir=PHONE_DATA_DIR,
+                build_type=build_type
+            )
+
+            pull_opencl_binary_and_tuning_param(target_abi, serial_num,
+                                                [model_output_dir])
+            binary_changed = True
+
+    # Second build: only needed when new OpenCL binaries were pulled.
+    if binary_changed:
+        gen_opencl_and_tuning_code(model_output_dirs)
+        sh_commands.bazel_build(
+            mace_run_target,
+            abi=target_abi,
+            hexagon_mode=hexagon_mode,
+            enable_openmp=enable_openmp,
+            address_sanitizer=address_sanitizer
+        )
+        sh_commands.update_mace_run_lib(build_tmp_binary_dir)
+
+    if target_abi == ABIType.host:
+        sh_commands.build_host_libraries(build_type, target_abi)
+
+    # build benchmark_model binary
+    sh_commands.build_benchmark_model(target_abi,
+                                      build_tmp_binary_dir,
+                                      hexagon_mode)
+
+    # generate library
+    sh_commands.merge_libs(target_soc,
+                           target_abi,
+                           library_name,
+                           BUILD_OUTPUT_DIR,
+                           OUTPUT_LIBRARY_DIR_NAME,
+                           build_type,
+                           hexagon_mode)
+
+
+def generate_library(configs, tuning, enable_openmp, address_sanitizer):
+    """Generate the common sources, build every target and package it.
+
+    The version, encrypted OpenCL and engine factory sources are generated
+    first. Then build_specific_lib is called once per target ABI when no
+    target SoC is configured or the ABI is host, and otherwise once per
+    (ABI, SoC) pair while holding that device's lock. Finally the library
+    is packaged.
+    """
+    MaceLogger.header(StringFormatter.block("Building library"))
+    # generate source
+    MaceLogger.info('* generate common source files...')
+    sh_commands.gen_mace_version()
+    sh_commands.gen_encrypted_opencl_source()
+    sh_commands.gen_mace_engine_factory_source(
+        configs[YAMLKeyword.models].keys(),
+        configs[YAMLKeyword.build_type])
+    MaceLogger.info('generate common source files done')
+
+    # create build dirs
+    library_name = configs[YAMLKeyword.library_name]
+    if not os.path.exists(BUILD_OUTPUT_DIR):
+        os.makedirs(BUILD_OUTPUT_DIR)
+    tmp_build_dir = os.path.join(BUILD_OUTPUT_DIR, library_name,
+                                 BUILD_TMP_DIR_NAME)
+    if not os.path.exists(tmp_build_dir):
+        os.makedirs(tmp_build_dir)
+    # Any previously generated library output is removed, not reused.
+    library_out_dir = os.path.join(BUILD_OUTPUT_DIR, library_name,
+                                   OUTPUT_LIBRARY_DIR_NAME)
+    if os.path.exists(library_out_dir):
+        sh.rm('-rf', library_out_dir)
+
+    target_socs = configs[YAMLKeyword.target_socs]
+    for target_abi in configs[YAMLKeyword.target_abis]:
+        if not target_socs or target_abi == ABIType.host.value:
+            # No device involved: SoC and serial number are passed as None.
+            build_specific_lib(target_abi, None, None, configs,
+                               tuning, enable_openmp, address_sanitizer)
+        else:
+            for target_soc in target_socs:
+                serial_num = sh_commands.get_target_soc_serial_number(
+                    target_soc)
+                with sh_commands.device_lock(serial_num):
+                    build_specific_lib(target_abi, target_soc, serial_num,
+                                       configs, tuning, enable_openmp,
+                                       address_sanitizer)
+
+    # package library
+    sh_commands.packaging_lib(BUILD_OUTPUT_DIR,
+                              configs[YAMLKeyword.library_name])
+
+
+def print_library_summary(configs):
+    """Log a one-row table giving the path of the library package."""
+    name = configs[YAMLKeyword.library_name]
+    package_path = "%s/%s/libmace_%s.tar.gz" % (BUILD_OUTPUT_DIR, name, name)
+    rows = [["library package", package_path]]
+    MaceLogger.summary(
+        StringFormatter.table(["key", "value"], rows, "Library"))
+
+
+def build_library(flags):
+    """Run the whole build pipeline for the config named by ``flags.config``.
+
+    The configuration is loaded and printed, the models are converted, the
+    library is generated and a summary of the package is printed.
+    """
+    configs = format_model_config(flags.config)
+    print_configuration(flags, configs)
+    convert_model(configs)
+    generate_library(
+        configs,
+        flags.tuning,
+        flags.enable_openmp,
+        flags.address_sanitizer)
+    print_library_summary(configs)
+
+
+################################
+# run
+################################
+def report_run_statistics(stdout,
+                          abi,
+                          serialno,
+                          model_name,
+                          device_type,
+                          output_dir):
+    """Append one row of timing statistics to <output_dir>/report.csv.
+
+    The first line of ``stdout`` that has four whitespace-separated fields
+    and starts with "time" supplies the init, warmup and run_avg values;
+    they stay 0 when no such line exists. For non-host ABIs the device name
+    and SoC are read from the device properties. The CSV header is written
+    when the report file does not exist yet.
+    """
+    init, warmup, run_avg = 0, 0, 0
+    for raw_line in stdout.split('\n'):
+        fields = raw_line.strip().split()
+        if len(fields) != 4 or not fields[0].startswith("time"):
+            continue
+        init, warmup, run_avg = [str(float(item)) for item in fields[1:]]
+        break
+
+    device_name, target_soc = "", ""
+    if abi != "host":
+        props = sh_commands.adb_getprop_by_serialno(serialno)
+        device_name = props.get("ro.product.model", "")
+        target_soc = props.get("ro.board.platform", "")
+
+    report_filename = output_dir + "/report.csv"
+    if not os.path.exists(report_filename):
+        with open(report_filename, 'w') as f:
+            f.write("model_name,device_name,soc,abi,runtime,"
+                    "init,warmup,run_avg\n")
+
+    row = "{0},{1},{2},{3},{4},{5},{6},{7}\n".format(
+        model_name, device_name, target_soc, abi, device_type,
+        init, warmup, run_avg)
+    with open(report_filename, 'a') as f:
+        f.write(row)
+
+
+def run_specific_target(flags, configs, target_abi,
+                        target_soc, serial_num):
+    """Run every configured model on one target, then validate/report.
+
+    For each model in ``configs[YAMLKeyword.models]`` this clears the phone
+    data dir (non-host targets only), generates random input, and calls
+    ``sh_commands.tuning_run`` with ``tuning=False`` once per runtime:
+    cpu only on host, cpu and gpu when the model runtime is
+    ``RuntimeType.cpu_gpu``, otherwise the model's own runtime. Only the
+    first subgraph of a model is used.
+
+    Args:
+        flags: parsed ``run`` sub-command namespace. ``validate`` enables
+            ``sh_commands.validate_model``; ``report`` together with
+            ``round > 0`` enables ``report_run_statistics``.
+        configs: parsed deployment config.
+        target_abi: ABI name taken from the config.
+        target_soc: SoC to run on; callers pass None for host.
+        serial_num: adb serial number; callers pass None for host.
+    """
+    library_name = configs[YAMLKeyword.library_name]
+    build_type = configs[YAMLKeyword.build_type]
+    embed_model_data = configs[YAMLKeyword.embed_model_data]
+
+    # Other functions in this file compare ABI names against both
+    # ABIType.host and ABIType.host.value. Normalise to the plain name so
+    # the host checks match the config string whichever form ABIType has.
+    host_abi = getattr(ABIType.host, "value", ABIType.host)
+    is_host = target_abi == host_abi
+
+    # When the config lists no target_socs the directory helpers are
+    # called with None for both the SoC and the serial number.
+    if not configs[YAMLKeyword.target_socs]:
+        dir_soc, dir_serial = None, None
+    else:
+        dir_soc, dir_serial = target_soc, serial_num
+    build_tmp_binary_dir = get_build_binary_dir(library_name, target_abi,
+                                                dir_soc, dir_serial)
+
+    for model_name in configs[YAMLKeyword.models]:
+        model_config = configs[YAMLKeyword.models][model_name]
+        model_runtime = model_config[YAMLKeyword.runtime]
+        subgraph = model_config[YAMLKeyword.subgraphs][0]
+        input_tensors = subgraph[YAMLKeyword.input_tensors]
+        output_tensors = subgraph[YAMLKeyword.output_tensors]
+        input_shapes = subgraph[YAMLKeyword.input_shapes]
+        output_shapes = subgraph[YAMLKeyword.output_shapes]
+
+        model_output_base_dir, model_output_dir, mace_model_dir = \
+            get_build_model_dirs(library_name, model_name, target_abi,
+                                 dir_soc, dir_serial,
+                                 model_config[YAMLKeyword.model_file_path])
+        if not is_host:
+            sh_commands.clear_phone_data_dir(serial_num, PHONE_DATA_DIR)
+
+        # generate input data
+        input_file_list = model_config[YAMLKeyword.validation_inputs_data]
+        sh_commands.gen_random_input(
+            model_output_dir,
+            input_tensors,
+            input_shapes,
+            input_file_list)
+
+        if is_host:
+            runtime_list = [RuntimeType.cpu]
+        elif model_runtime == RuntimeType.cpu_gpu:
+            runtime_list = [RuntimeType.cpu, RuntimeType.gpu]
+        else:
+            runtime_list = [model_runtime]
+
+        for runtime in runtime_list:
+            device_type = parse_device_type(runtime)
+            run_output = sh_commands.tuning_run(
+                abi=target_abi,
+                serialno=serial_num,
+                mace_run_dir=build_tmp_binary_dir,
+                vlog_level=flags.vlog_level,
+                embed_model_data=embed_model_data,
+                model_output_dir=model_output_dir,
+                input_nodes=input_tensors,
+                output_nodes=output_tensors,
+                input_shapes=input_shapes,
+                output_shapes=output_shapes,
+                mace_model_dir=mace_model_dir,
+                model_tag=model_name,
+                device_type=device_type,
+                running_round=flags.round,
+                restart_round=flags.restart_round,
+                limit_opencl_kernel_time=model_config[YAMLKeyword.limit_opencl_kernel_time],  # noqa
+                tuning=False,
+                out_of_range_check=flags.check_gpu_out_of_memory,
+                phone_data_dir=PHONE_DATA_DIR,
+                build_type=build_type,
+                omp_num_threads=flags.omp_num_threads,
+                cpu_affinity_policy=flags.cpu_affinity_policy,
+                gpu_perf_hint=flags.gpu_perf_hint,
+                gpu_priority_hint=flags.gpu_priority_hint,
+                runtime_failure_ratio=flags.runtime_failure_ratio,
+                address_sanitizer=flags.address_sanitizer,
+            )
+            if flags.validate:
+                model_file_path, weight_file_path = get_model_files_path(
+                    model_config["model_file_path"],
+                    model_output_base_dir,
+                    model_config["weight_file_path"])
+
+                sh_commands.validate_model(
+                    abi=target_abi,
+                    serialno=serial_num,
+                    model_file_path=model_file_path,
+                    weight_file_path=weight_file_path,
+                    platform=model_config[YAMLKeyword.platform],
+                    device_type=device_type,
+                    input_nodes=input_tensors,
+                    output_nodes=output_tensors,
+                    input_shapes=input_shapes,
+                    output_shapes=output_shapes,
+                    model_output_dir=model_output_dir,
+                    phone_data_dir=PHONE_DATA_DIR,
+                    caffe_env=flags.caffe_env)
+            # Timings are only reported when the model actually ran.
+            if flags.report and flags.round > 0:
+                report_run_statistics(
+                    run_output, target_abi, serial_num,
+                    model_name, device_type, flags.report_dir)
+
+
+def run_mace(flags):
+    """Handle the ``run`` sub-command: run the models on every target.
+
+    The host ABI is run directly with no SoC and no serial number. Every
+    other ABI is run once per target SoC while holding that device's lock.
+    When the config lists no target_socs, the SoCs reported by
+    ``sh_commands.adb_get_all_socs`` are used instead.
+
+    Args:
+        flags: parsed ``run`` sub-command namespace; ``flags.config`` names
+            the yaml config file.
+    """
+    configs = format_model_config(flags.config)
+    target_abis = configs[YAMLKeyword.target_abis]
+
+    # benchmark_model compares against ABIType.host.value while this
+    # function used ABIType.host. Normalise to the plain name so the host
+    # checks match the config string whichever form ABIType has.
+    host_abi = getattr(ABIType.host, "value", ABIType.host)
+
+    target_socs = configs[YAMLKeyword.target_socs]
+    if not target_socs:
+        target_socs = sh_commands.adb_get_all_socs()
+    if host_abi not in target_abis and not target_socs:
+        MaceLogger.warning('There is no device plugin the computer.')
+
+    for target_abi in target_abis:
+        if target_abi == host_abi:
+            run_specific_target(flags, configs, target_abi, None, None)
+            continue
+        for target_soc in target_socs:
+            serial_num = sh_commands.get_target_soc_serial_number(
+                target_soc)
+            with sh_commands.device_lock(serial_num):
+                run_specific_target(flags, configs, target_abi,
+                                    target_soc, serial_num)
+
+
+################################
+#  benchmark model
+################################
+def bm_specific_target(flags, configs, target_abi, target_soc, serial_num):
+    """Benchmark every configured model on one target.
+
+    For each model in ``configs[YAMLKeyword.models]`` this clears the phone
+    data dir (non-host targets only), generates random input, and calls
+    ``sh_commands.benchmark_model`` once per runtime: cpu only on host, cpu
+    and gpu when the model runtime is ``RuntimeType.cpu_gpu``, otherwise
+    the model's own runtime. Only the first subgraph of a model is used.
+
+    Args:
+        flags: parsed ``benchmark`` sub-command namespace.
+        configs: parsed deployment config.
+        target_abi: ABI name taken from the config.
+        target_soc: SoC to benchmark on; callers pass None for host.
+        serial_num: adb serial number; callers pass None for host.
+    """
+    library_name = configs[YAMLKeyword.library_name]
+    build_type = configs[YAMLKeyword.build_type]
+    embed_model_data = configs[YAMLKeyword.embed_model_data]
+
+    # benchmark_model compares against ABIType.host.value while this
+    # function used ABIType.host. Normalise to the plain name so the host
+    # checks match the config string whichever form ABIType has.
+    host_abi = getattr(ABIType.host, "value", ABIType.host)
+    is_host = target_abi == host_abi
+
+    # When the config lists no target_socs the directory helpers are
+    # called with None for both the SoC and the serial number.
+    if not configs[YAMLKeyword.target_socs]:
+        dir_soc, dir_serial = None, None
+    else:
+        dir_soc, dir_serial = target_soc, serial_num
+    build_tmp_binary_dir = get_build_binary_dir(library_name, target_abi,
+                                                dir_soc, dir_serial)
+
+    for model_name in configs[YAMLKeyword.models]:
+        model_config = configs[YAMLKeyword.models][model_name]
+        model_runtime = model_config[YAMLKeyword.runtime]
+        subgraph = model_config[YAMLKeyword.subgraphs][0]
+        input_tensors = subgraph[YAMLKeyword.input_tensors]
+        input_shapes = subgraph[YAMLKeyword.input_shapes]
+
+        # Only model_output_dir and mace_model_dir are used below.
+        model_output_base_dir, model_output_dir, mace_model_dir = \
+            get_build_model_dirs(library_name, model_name, target_abi,
+                                 dir_soc, dir_serial,
+                                 model_config[YAMLKeyword.model_file_path])
+        if not is_host:
+            sh_commands.clear_phone_data_dir(serial_num, PHONE_DATA_DIR)
+
+        input_file_list = model_config[YAMLKeyword.validation_inputs_data]
+        sh_commands.gen_random_input(
+            model_output_dir,
+            input_tensors,
+            input_shapes,
+            input_file_list)
+
+        if is_host:
+            runtime_list = [RuntimeType.cpu]
+        elif model_runtime == RuntimeType.cpu_gpu:
+            runtime_list = [RuntimeType.cpu, RuntimeType.gpu]
+        else:
+            runtime_list = [model_runtime]
+
+        for runtime in runtime_list:
+            device_type = parse_device_type(runtime)
+            sh_commands.benchmark_model(
+                abi=target_abi,
+                serialno=serial_num,
+                benchmark_binary_dir=build_tmp_binary_dir,
+                vlog_level=0,
+                embed_model_data=embed_model_data,
+                model_output_dir=model_output_dir,
+                input_nodes=input_tensors,
+                output_nodes=subgraph[YAMLKeyword.output_tensors],
+                input_shapes=input_shapes,
+                output_shapes=subgraph[YAMLKeyword.output_shapes],
+                mace_model_dir=mace_model_dir,
+                model_tag=model_name,
+                device_type=device_type,
+                phone_data_dir=PHONE_DATA_DIR,
+                build_type=build_type,
+                omp_num_threads=flags.omp_num_threads,
+                cpu_affinity_policy=flags.cpu_affinity_policy,
+                gpu_perf_hint=flags.gpu_perf_hint,
+                gpu_priority_hint=flags.gpu_priority_hint)
+
+
+def benchmark_model(flags):
+    """Handle the ``benchmark`` sub-command: benchmark on every target.
+
+    The host ABI is benchmarked directly with no SoC and no serial number.
+    Every other ABI is benchmarked once per target SoC while holding that
+    device's lock. When the config lists no target_socs, the SoCs reported
+    by ``sh_commands.adb_get_all_socs`` are used instead.
+
+    Args:
+        flags: parsed ``benchmark`` sub-command namespace; ``flags.config``
+            names the yaml config file.
+    """
+    configs = format_model_config(flags.config)
+    target_abis = configs[YAMLKeyword.target_abis]
+
+    # run_mace compares against ABIType.host while this function used
+    # ABIType.host.value. Normalise to the plain name so the host checks
+    # match the config string whichever form ABIType has.
+    host_abi = getattr(ABIType.host, "value", ABIType.host)
+
+    target_socs = configs[YAMLKeyword.target_socs]
+    if not target_socs:
+        target_socs = sh_commands.adb_get_all_socs()
+    if host_abi not in target_abis and not target_socs:
+        MaceLogger.warning('There is no device plugin the computer.')
+
+    for target_abi in target_abis:
+        if target_abi == host_abi:
+            bm_specific_target(flags, configs, target_abi, None, None)
+            continue
+        for target_soc in target_socs:
+            serial_num = sh_commands.get_target_soc_serial_number(
+                target_soc)
+            with sh_commands.device_lock(serial_num):
+                bm_specific_target(flags, configs, target_abi,
+                                   target_soc, serial_num)


 ################################
@@ -350,515 +1020,121 @@ def str2bool(v):

 def str_to_caffe_env_type(v):
    if v.lower() == 'docker':
-        return common.CaffeEnvType.DOCKER
+        return CaffeEnvType.DOCKER
    elif v.lower() == 'local':
-        return common.CaffeEnvType.LOCAL
+        return CaffeEnvType.LOCAL
    else:
        raise argparse.ArgumentTypeError('[docker | local] expected.')


-def parse_model_configs():
-    print("============== Load and Parse configs ==============")
-    with open(FLAGS.config) as f:
-        configs = yaml.load(f)
-        target_abis = configs.get("target_abis", [])
-        if not isinstance(target_abis, list) or not target_abis:
-            print("CONFIG ERROR:")
-            print("target_abis list is needed!")
-            print("For example: 'target_abis: [armeabi-v7a, arm64-v8a]'")
-            exit(1)
-
-        embed_model_data = configs.get("embed_model_data", "")
-        if embed_model_data == "" or not isinstance(embed_model_data, int) or \
-                embed_model_data < 0 or embed_model_data > 1:
-            print("CONFIG ERROR:")
-            print("embed_model_data must be integer in range [0, 1]")
-            exit(1)
-        elif FLAGS.model_load_type == "pb":
-            configs["embed_model_data"] = 0
-            print("emebed_model_data is set 0")
-
-        model_names = configs.get("models", "")
-        if not model_names:
-            print("CONFIG ERROR:")
-            print("models attribute not found in config file")
-            exit(1)
-
-        for model_name in model_names:
-            model_config = configs["models"][model_name]
-            platform = model_config.get("platform", "")
-            if platform == "" or platform not in ["tensorflow", "caffe"]:
-                print("CONFIG ERROR:")
-                print("'platform' must be 'tensorflow' or 'caffe'")
-                exit(1)
-
-            for key in ["model_file_path", "model_sha256_checksum"]:
-                value = model_config.get(key, "")
-                if value == "":
-                    print("CONFIG ERROR:")
-                    print("'%s' is necessary" % key)
-                    exit(1)
-
-            for key in ["input_nodes", "input_shapes", "output_nodes",
-                        "output_shapes"]:
-                value = model_config.get(key, "")
-                if value == "":
-                    print("CONFIG ERROR:")
-                    print("'%s' is necessary" % key)
-                    exit(1)
-                if not isinstance(value, list):
-                    model_config[key] = [value]
-
-            for key in ["limit_opencl_kernel_time", "dsp_mode", "obfuscate",
-                        "fast_conv"]:
-                value = model_config.get(key, "")
-                if value == "":
-                    model_config[key] = 0
-                    print("'%s' for %s is set to default value: 0" %
-                          (key, model_name))
-
-            validation_inputs_data = model_config.get("validation_inputs_data",
-                                                      [])
-            model_config["validation_inputs_data"] = validation_inputs_data
-            if not isinstance(validation_inputs_data, list):
-                model_config["validation_inputs_data"] = [
-                        validation_inputs_data]
-
-            weight_file_path = model_config.get("weight_file_path", "")
-            model_config["weight_file_path"] = weight_file_path
-
-        print("Parse model configs successfully!\n")
-        return configs
-
-
 def parse_args():
    """Parses command line arguments."""
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--config",
+    all_type_parent_parser = argparse.ArgumentParser(add_help=False)
+    all_type_parent_parser.add_argument(
+        '--config',
        type=str,
-        default="./tool/config",
+        default="",
        required=True,
-        help="The global config file of models.")
-    parser.add_argument(
-        "--output_dir", type=str, default="build", help="The output dir.")
-    parser.add_argument(
-        "--round", type=int, default=1, help="The model running round.")
-    parser.add_argument(
-        "--run_seconds",
-        type=int,
-        default=10,
-        help="The model throughput test running seconds.")
-    parser.add_argument(
-        "--restart_round",
-        type=int,
-        default=1,
-        help="The model restart round.")
-    parser.add_argument(
-        "--tuning",
-        type=str2bool,
-        default=True,
-        help="Tune opencl params.")
-    parser.add_argument(
-        "--mode",
-        type=str,
-        default="all",
-        help="[build|run|validate|benchmark|merge|all|throughput_test].")
-    parser.add_argument(
-        "--target_socs",
-        type=str,
-        default="all",
-        help="SoCs to build, comma seperated list (getprop ro.board.platform)")
-    parser.add_argument(
-        "--out_of_range_check",
-        type=str2bool,
-        default=False,
-        help="Enable out of range check for opencl.")
-    parser.add_argument(
-        "--enable_openmp",
-        type=str2bool,
-        default=True,
-        help="Enable openmp.")
-    parser.add_argument(
+        help="model yaml configuration file path")
+    build_run_parent_parser = argparse.ArgumentParser(add_help=False)
+    build_run_parent_parser.add_argument(
+        '--address_sanitizer',
+        action="store_true",
+        help="Whether to use valgrind to check memory error")
+    run_bm_parent_parser = argparse.ArgumentParser(add_help=False)
+    run_bm_parent_parser.add_argument(
        "--omp_num_threads",
        type=int,
-        default=-1,
+        default=DefaultValues.omp_num_threads,
        help="num of openmp threads")
-    parser.add_argument(
+    run_bm_parent_parser.add_argument(
        "--cpu_affinity_policy",
        type=int,
-        default=1,
+        default=DefaultValues.cpu_affinity_policy,
        help="0:AFFINITY_NONE/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY")
-    parser.add_argument(
+    run_bm_parent_parser.add_argument(
        "--gpu_perf_hint",
        type=int,
-        default=3,
+        default=DefaultValues.gpu_perf_hint,
        help="0:DEFAULT/1:LOW/2:NORMAL/3:HIGH")
-    parser.add_argument(
+    run_bm_parent_parser.add_argument(
        "--gpu_priority_hint",
        type=int,
-        default=3,
+        default=DefaultValues.gpu_priority_hint,
        help="0:DEFAULT/1:LOW/2:NORMAL/3:HIGH")
-    parser.add_argument(
-        "--collect_report",
-        type=str2bool,
-        default=False,
-        help="Collect report.")
-    parser.add_argument(
-        "--vlog_level",
+
+    parser = argparse.ArgumentParser()
+    subparsers = parser.add_subparsers()
+    build = subparsers.add_parser(
+        'build',
+        parents=[all_type_parent_parser, build_run_parent_parser],
+        help='build model library and test tools')
+    build.set_defaults(func=build_library)
+    build.add_argument(
+        '--tuning',
+        action="store_true",
+        help="tuning gpu parameters for specified SOC")
+    build.add_argument(
+        "--enable_openmp",
+        action="store_false",
+        help="Enable openmp for multiple thread.")
+    run = subparsers.add_parser(
+        'run',
+        parents=[all_type_parent_parser, run_bm_parent_parser,
+                 build_run_parent_parser],
+        help='run model in command line')
+    run.set_defaults(func=run_mace)
+    run.add_argument(
+        "--round",
        type=int,
-        default=0,
-        help="VLOG level.")
-    parser.add_argument(
+        default=1,
+        help="The model running round.")
+    run.add_argument(
+        "--validate",
+        action="store_true",
+        help="validate result by comparing mace output and platform's output.")
+    run.add_argument(
        "--caffe_env",
        type=str_to_caffe_env_type,
        default='docker',
        help="[docker | local] caffe environment.")
-    parser.add_argument(
-        "--valgrind",
-        type=bool,
-        default=False,
-        help="Whether to use valgrind to check memory error.")
-    parser.add_argument(
-        "--valgrind_path",
-        type=str,
-        default="/data/local/tmp/valgrind",
-        help="Valgrind install path.")
-    parser.add_argument(
-        "--valgrind_args",
+    run.add_argument(
+        "--vlog_level",
+        type=int,
+        default=0,
+        help="VLOG level: [1~5].")
+    run.add_argument(
+        "--check_gpu_out_of_memory",
+        action="store_true",
+        help="Enable out of memory check for gpu.")
+    run.add_argument(
+        "--restart_round",
+        type=int,
+        default=1,
+        help="restart round for run.")
+    run.add_argument(
+        "--report",
+        action="store_true",
+        help="print run statistics report.")
+    run.add_argument(
+        "--report_dir",
        type=str,
        default="",
-        help="Valgrind command args.")
-    parser.add_argument(
-        "--validation_runtime",
-        type=str,
-        default="cpu",
-        help="validation runtime.")
-    parser.add_argument(
-        "--model_load_type",
-        type=str,
-        default="source",
-        help="[source|pb] Load models in generated `source` code" +
-                "or `pb` file.")
-    parser.add_argument(
-        "--gpu_data_type",
-        type=str,
-        default="half",
-        help="[half | float].")
-    parser.add_argument(
+        help="print run statistics report.")
+    run.add_argument(
        "--runtime_failure_ratio",
        type=float,
        default=0.0,
        help="[mock runtime failure ratio].")
+    benchmark = subparsers.add_parser(
+        'benchmark',
+        parents=[all_type_parent_parser, run_bm_parent_parser,
+                 build_run_parent_parser],
+        help='benchmark model for detail information')
+    benchmark.set_defaults(func=benchmark_model)
    return parser.parse_known_args()


-def process_models(project_name, configs, embed_model_data, vlog_level,
-                   target_abi, phone_data_dir, model_load_type,
-                   target_soc="", serialno=""):
-    hexagon_mode = get_hexagon_mode(configs)
-    model_output_dirs = []
-    mace_model_dirs_kv = {}
-
-    for model_name in configs["models"]:
-        print '===================', model_name, '==================='
-        model_config = configs["models"][model_name]
-        input_file_list = model_config["validation_inputs_data"]
-        model_runtime = model_config.get("runtime", "")
-        model_device_type = parse_device_type(model_runtime)
-        run_device_type = model_device_type
-        if not run_device_type:
-            run_device_type = parse_device_type(FLAGS.validation_runtime)
-        # Create model build directory
-        model_path_digest = md5sum(model_config["model_file_path"])
-        model_output_base_dir = "%s/%s/%s/%s/%s" % (
-            FLAGS.output_dir, project_name, "build",
-            model_name, model_path_digest)
-        if model_load_type == "pb":
-            mace_model_dir = model_output_base_dir
-            mace_model_dirs_kv[model_name] = mace_model_dir
-        else:
-            mace_model_dir = ""
-
-        if target_abi == "host":
-            model_output_dir = "%s/%s" % (model_output_base_dir, target_abi)
-        else:
-            device_name = sh_commands.adb_get_device_name_by_serialno(serialno)
-            model_output_dir = "%s/%s_%s/%s" % (
-                model_output_base_dir, device_name.replace(' ', ''),
-                target_soc, target_abi)
-            sh_commands.clear_phone_data_dir(serialno, phone_data_dir)
-
-        model_output_dirs.append(model_output_dir)
-
-        if FLAGS.mode == "build" or FLAGS.mode == "all":
-            if os.path.exists(model_output_dir):
-                sh.rm("-rf", model_output_dir)
-            os.makedirs(model_output_dir)
-
-        model_file_path, weight_file_path = get_model_files_path(
-                model_config["model_file_path"],
-                model_output_base_dir,
-                model_config["weight_file_path"])
-
-        if FLAGS.mode == "build" or FLAGS.mode == "run" or \
-                FLAGS.mode == "validate" or \
-                FLAGS.mode == "benchmark" or FLAGS.mode == "all":
-            sh_commands.gen_random_input(model_output_dir,
-                                         model_config["input_nodes"],
-                                         model_config["input_shapes"],
-                                         input_file_list)
-
-        if FLAGS.mode == "build" or FLAGS.mode == "all":
-            build_mace_run_prod(hexagon_mode,
-                                model_runtime,
-                                target_abi,
-                                serialno,
-                                vlog_level,
-                                embed_model_data,
-                                model_load_type,
-                                model_output_dir,
-                                model_config["input_nodes"],
-                                model_config["output_nodes"],
-                                model_config["input_shapes"],
-                                model_config["output_shapes"],
-                                mace_model_dir,
-                                model_name,
-                                model_device_type,
-                                FLAGS.round,
-                                FLAGS.restart_round,
-                                FLAGS.tuning,
-                                model_config["limit_opencl_kernel_time"],
-                                phone_data_dir,
-                                FLAGS.enable_openmp)
-            sh_commands.build_benchmark_model(target_abi,
-                                              embed_model_data,
-                                              model_output_dir,
-                                              model_name,
-                                              hexagon_mode)
-
-        if FLAGS.mode == "run" or FLAGS.mode == "validate" or \
-           FLAGS.mode == "all":
-            if FLAGS.mode == "run":
-                runtime_failure_ratio = FLAGS.runtime_failure_ratio
-            else:
-                runtime_failure_ratio = 0.0
-
-            tuning_run(target_abi,
-                       serialno,
-                       vlog_level,
-                       embed_model_data,
-                       model_output_dir,
-                       model_config["input_nodes"],
-                       model_config["output_nodes"],
-                       model_config["input_shapes"],
-                       model_config["output_shapes"],
-                       mace_model_dir,
-                       model_name,
-                       run_device_type,
-                       FLAGS.round,
-                       FLAGS.restart_round,
-                       FLAGS.out_of_range_check,
-                       phone_data_dir,
-                       omp_num_threads=FLAGS.omp_num_threads,
-                       cpu_affinity_policy=FLAGS.cpu_affinity_policy,
-                       gpu_perf_hint=FLAGS.gpu_perf_hint,
-                       gpu_priority_hint=FLAGS.gpu_priority_hint,
-                       runtime_failure_ratio=runtime_failure_ratio)
-
-        if FLAGS.mode == "benchmark":
-            gen_opencl_and_tuning_code(
-                    target_abi, serialno, [model_output_dir], False)
-            sh_commands.benchmark_model(target_abi,
-                                        serialno,
-                                        vlog_level,
-                                        embed_model_data,
-                                        model_output_dir,
-                                        mace_model_dir,
-                                        model_config["input_nodes"],
-                                        model_config["output_nodes"],
-                                        model_config["input_shapes"],
-                                        model_config["output_shapes"],
-                                        model_name,
-                                        run_device_type,
-                                        phone_data_dir,
-                                        FLAGS.omp_num_threads,
-                                        FLAGS.cpu_affinity_policy,
-                                        FLAGS.gpu_perf_hint,
-                                        FLAGS.gpu_priority_hint)
-
-        if FLAGS.mode == "validate" or FLAGS.mode == "all":
-            sh_commands.validate_model(target_abi,
-                                       serialno,
-                                       model_file_path,
-                                       weight_file_path,
-                                       model_config["platform"],
-                                       run_device_type,
-                                       model_config["input_nodes"],
-                                       model_config["output_nodes"],
-                                       model_config["input_shapes"],
-                                       model_config["output_shapes"],
-                                       model_output_dir,
-                                       phone_data_dir,
-                                       FLAGS.caffe_env)
-
-    if FLAGS.mode == "build" or FLAGS.mode == "merge" or \
-            FLAGS.mode == "all":
-        merge_libs_and_tuning_results(
-            target_soc,
-            target_abi,
-            serialno,
-            project_name,
-            FLAGS.output_dir,
-            model_output_dirs,
-            mace_model_dirs_kv,
-            model_load_type,
-            hexagon_mode,
-            embed_model_data)
-
-    if FLAGS.mode == "throughput_test":
-        merged_lib_file = FLAGS.output_dir + \
-                "/%s/%s/libmace_%s.%s.a" % \
-                (project_name, target_abi, project_name, target_soc)
-        first_model = configs["models"].values()[0]
-        throughput_test_output_dir = "%s/%s/%s/%s" % (
-                FLAGS.output_dir, project_name, "build",
-                "throughput_test")
-        if os.path.exists(throughput_test_output_dir):
-            sh.rm("-rf", throughput_test_output_dir)
-        os.makedirs(throughput_test_output_dir)
-        input_file_list = model_config["validation_inputs_data"]
-        sh_commands.gen_random_input(throughput_test_output_dir,
-                                     first_model["input_nodes"],
-                                     first_model["input_shapes"],
-                                     input_file_list)
-        model_tag_dict = {}
-        for model_name in configs["models"]:
-            runtime = configs["models"][model_name]["runtime"]
-            model_tag_dict[runtime] = model_name
-        sh_commands.build_run_throughput_test(target_abi,
-                                              serialno,
-                                              vlog_level,
-                                              FLAGS.run_seconds,
-                                              merged_lib_file,
-                                              throughput_test_output_dir,
-                                              embed_model_data,
-                                              model_config["input_nodes"],
-                                              model_config["output_nodes"],
-                                              model_config["input_shapes"],
-                                              model_config["output_shapes"],
-                                              model_tag_dict.get("cpu", ""),
-                                              model_tag_dict.get("gpu", ""),
-                                              model_tag_dict.get("dsp", ""),
-                                              phone_data_dir)
-
-
-def main(unused_args):
-    common.init_logging()
-    configs = parse_model_configs()
-
-    if FLAGS.mode == "validate":
-        FLAGS.round = 1
-        FLAGS.restart_round = 1
-
-    project_name = os.path.splitext(os.path.basename(FLAGS.config))[0]
-    if FLAGS.mode == "build" or FLAGS.mode == "all":
-        # Remove previous output dirs
-        if not os.path.exists(FLAGS.output_dir):
-            os.makedirs(FLAGS.output_dir)
-        elif os.path.exists(os.path.join(FLAGS.output_dir, "libmace")):
-            sh.rm("-rf", os.path.join(FLAGS.output_dir, project_name))
-            os.makedirs(os.path.join(FLAGS.output_dir, project_name))
-
-        # generate source
-        sh_commands.gen_mace_version()
-        sh_commands.gen_encrypted_opencl_source()
-        sh_commands.gen_mace_engine_factory_source(configs['models'].keys(),
-                                                   FLAGS.model_load_type)
-
-    embed_model_data = configs["embed_model_data"]
-    target_socs = get_target_socs(configs)
-
-    vlog_level = FLAGS.vlog_level
-    phone_data_dir = "/data/local/tmp/mace_run/"
-
-    if FLAGS.mode == "build" or FLAGS.mode == "all":
-        print '* Model Convert'
-        sh_commands.clear_model_codegen()
-        for model_name in configs["models"]:
-            print '===================', model_name, '==================='
-            model_config = configs["models"][model_name]
-            runtime = model_config.get("runtime", "")
-
-            # Create model build directory
-            model_path_digest = md5sum(model_config["model_file_path"])
-
-            model_output_base_dir = "%s/%s/%s/%s/%s" % (
-                FLAGS.output_dir, project_name, "build",
-                model_name, model_path_digest)
-
-            if os.path.exists(model_output_base_dir):
-                sh.rm("-rf", model_output_base_dir)
-            os.makedirs(model_output_base_dir)
-
-            download_model_files(
-                model_config["model_file_path"],
-                model_output_base_dir,
-                model_config["weight_file_path"])
-
-            model_file_path, weight_file_path = get_model_files_path(
-                model_config["model_file_path"],
-                model_output_base_dir,
-                model_config["weight_file_path"])
-
-            sh_commands.gen_model_code(
-                "mace/codegen/models/%s" % model_name,
-                model_config["platform"],
-                model_file_path,
-                weight_file_path,
-                model_config["model_sha256_checksum"],
-                ",".join(model_config["input_nodes"]),
-                ",".join(model_config["output_nodes"]),
-                runtime,
-                model_name,
-                ":".join(model_config["input_shapes"]),
-                model_config["dsp_mode"],
-                embed_model_data,
-                model_config["fast_conv"],
-                model_config["obfuscate"],
-                model_output_base_dir,
-                FLAGS.model_load_type,
-                FLAGS.gpu_data_type)
-
-    for target_abi in configs["target_abis"]:
-        for target_soc in target_socs:
-            if target_abi != 'host':
-                serialnos = sh_commands.get_target_socs_serialnos([target_soc])
-                for serialno in serialnos:
-                    props = sh_commands.adb_getprop_by_serialno(serialno)
-                    print(
-                        "===================================================="
-                    )
-                    print("Trying to lock device %s" % serialno)
-                    with sh_commands.device_lock(serialno):
-                        print("Run on device: %s, %s, %s" % (
-                            serialno, props["ro.board.platform"],
-                              props["ro.product.model"]))
-                        process_models(project_name, configs, embed_model_data,
-                                       vlog_level, target_abi, phone_data_dir,
-                                       FLAGS.model_load_type, target_soc,
-                                       serialno)
-            else:
-                print("====================================================")
-                print("Run on host")
-                process_models(project_name, configs, embed_model_data,
-                               vlog_level, target_abi, phone_data_dir,
-                               FLAGS.model_load_type)
-
-    if FLAGS.mode == "build" or FLAGS.mode == "all":
-        sh_commands.packaging_lib(FLAGS.output_dir, project_name)
-
-
 if __name__ == "__main__":
-    FLAGS, unparsed = parse_args()
-    main(unused_args=[sys.argv[0]] + unparsed)
+    flags, unparsed = parse_args()
+    flags.func(flags)
--- a/tools/sh_commands.py
+++ b/tools/sh_commands.py
@@ -23,6 +23,7 @@ import subprocess
 import sys
 import time
 import urllib
+from enum import Enum

 import common

@@ -48,6 +49,12 @@ def strip_invalid_utf8(str):
    return sh.iconv(str, "-c", "-t", "UTF-8")


+def split_stdout(stdout_str):
+    """Split command output into a list of stripped, non-empty lines.
+
+    The text is first passed through strip_invalid_utf8. Every line that is
+    empty after stripping is dropped, not only a trailing one.
+    """
+    cleaned = strip_invalid_utf8(stdout_str)
+    stripped_lines = (line.strip() for line in cleaned.split('\n'))
+    return [line for line in stripped_lines if line]
+
+
 def make_output_processor(buff):
    def process_output(line):
        print(line.rstrip())
@@ -72,6 +79,11 @@ def is_device_locked(serialno):
        return True


+class BuildType(object):
+    """String constants naming the supported model build types."""
+    # Functions in this file compare their build_type / model_build_type
+    # argument against these values.
+    proto = 'proto'
+    code = 'code'
+
+
 ################################
 # clear data
 ################################
@@ -90,16 +102,10 @@ def clear_model_codegen(model_codegen_dir="mace/codegen/models"):
 ################################
 # adb commands
 ################################
-def adb_split_stdout(stdout_str):
-    stdout_str = strip_invalid_utf8(stdout_str)
-    # Filter out last empty line
-    return [l.strip() for l in stdout_str.split('\n') if len(l.strip()) > 0]
-
-
 def adb_devices():
    serialnos = []
    p = re.compile(r'(\w+)\s+device')
-    for line in adb_split_stdout(sh.adb("devices")):
+    for line in split_stdout(sh.adb("devices")):
        m = p.match(line)
        if m:
            serialnos.append(m.group(1))
@@ -128,9 +134,26 @@ def get_target_socs_serialnos(target_socs=None):
    return serialnos


+def get_soc_serial_number_map():
+    """Map each attached device's "ro.board.platform" value to its serial.
+
+    Devices are taken from adb_devices(). When several devices report the
+    same platform, the one listed last wins.
+    """
+    return {
+        adb_getprop_by_serialno(serial)["ro.board.platform"]: serial
+        for serial in adb_devices()
+    }
+
+
+def get_target_soc_serial_number(target_soc):
+    """Return the serial number of an attached device for target_soc.
+
+    The lookup goes through get_soc_serial_number_map(); None is returned
+    when no attached device reports that platform.
+    """
+    return get_soc_serial_number_map().get(target_soc)
+
+
 def adb_getprop_by_serialno(serialno):
    outputs = sh.adb("-s", serialno, "shell", "getprop")
-    raw_props = adb_split_stdout(outputs)
+    raw_props = split_stdout(outputs)
    props = {}
    p = re.compile(r'\[(.+)\]: \[(.+)\]')
    for raw_prop in raw_props:
@@ -173,14 +196,16 @@ def adb_pull(src_path, dst_path, serialno):
        print("Error msg: %s" % e.stderr)


-def adb_run(serialno,
+def adb_run(abi,
+            serialno,
            host_bin_path,
            bin_name,
            args="",
            opencl_profiling=1,
            vlog_level=0,
            device_bin_path="/data/local/tmp/mace",
-            out_of_range_check=1):
+            out_of_range_check=1,
+            address_sanitizer=False):
    host_bin_full_path = "%s/%s" % (host_bin_path, bin_name)
    device_bin_full_path = "%s/%s" % (device_bin_path, bin_name)
    props = adb_getprop_by_serialno(serialno)
@@ -195,17 +220,24 @@ def adb_run(serialno,
        sh.adb("-s", serialno, "shell", "rm -rf %s" % device_bin_path)
        sh.adb("-s", serialno, "shell", "mkdir -p %s" % device_bin_path)
        adb_push(host_bin_full_path, device_bin_full_path, serialno)
+        # Environment prefix for the device command; stays empty unless the
+        # AddressSanitizer runtime has to be preloaded.
+        ld_preload = ""
+        if address_sanitizer:
+            adb_push(find_asan_rt_library(abi), device_bin_path, serialno)
+            # No trailing comma here: it would turn ld_preload into a
+            # 1-tuple instead of the string used in the other branch.
+            ld_preload = "LD_PRELOAD=%s/%s" % (device_bin_path,
+                                               asan_rt_library_names(abi))
        print("Run %s" % device_bin_full_path)
+
        stdout_buff = []
        process_output = make_output_processor(stdout_buff)
        p = sh.adb(
            "-s",
            serialno,
            "shell",
-            "MACE_OUT_OF_RANGE_CHECK=%d MACE_OPENCL_PROFILING=%d "
-            "MACE_CPP_MIN_VLOG_LEVEL=%d %s %s" %
-            (out_of_range_check, opencl_profiling, vlog_level,
-             device_bin_full_path, args),
+            ld_preload,
+            "MACE_OUT_OF_RANGE_CHECK=%d" % out_of_range_check,
+            "MACE_OPENCL_PROFILING=%d" % opencl_profiling,
+            "MACE_CPP_MIN_VLOG_LEVEL=%d" % vlog_level,
+            "%s %s" % (device_bin_full_path, args),
            _out=process_output,
            _bg=True,
            _err_to_out=True)
@@ -213,128 +245,77 @@ def adb_run(serialno,
        return "".join(stdout_buff)


-def adb_run_valgrind(serialno,
-                     host_bin_path,
-                     bin_name,
-                     valgrind_path="/data/local/tmp/valgrind",
-                     valgrind_args="",
-                     args="",
-                     opencl_profiling=1,
-                     vlog_level=0,
-                     device_bin_path="/data/local/tmp/mace",
-                     out_of_range_check=1):
-    valgrind_lib = valgrind_path + "/lib/valgrind"
-    valgrind_bin = valgrind_path + "/bin/valgrind"
-    host_bin_full_path = "%s/%s" % (host_bin_path, bin_name)
-    device_bin_full_path = "%s/%s" % (device_bin_path, bin_name)
-    props = adb_getprop_by_serialno(serialno)
-    print(
-        "====================================================================="
-    )
-    print("Trying to lock device %s" % serialno)
-    with device_lock(serialno):
-        print("Run on device: %s, %s, %s" %
-              (serialno, props["ro.board.platform"],
-               props["ro.product.model"]))
-        result = sh.adb("-s", serialno, "shell", "ls %s" % valgrind_path)
-        if result.startswith("ls:"):
-            print("Please install valgrind to %s manually." % valgrind_path)
-            return result
-        sh.adb("-s", serialno, "shell", "rm -rf %s" % device_bin_path)
-        sh.adb("-s", serialno, "shell", "mkdir -p %s" % device_bin_path)
-        adb_push(host_bin_full_path, device_bin_full_path, serialno)
-        print("Run %s" % device_bin_full_path)
-        stdout_buff = []
-        process_output = make_output_processor(stdout_buff)
-        p = sh.adb(
-            "-s",
-            serialno,
-            "shell",
-            "MACE_OUT_OF_RANGE_CHECK=%d MACE_OPENCL_PROFILING=%d "
-            "MACE_CPP_MIN_VLOG_LEVEL=%d VALGRIND_LIB=%s %s %s %s %s " %
-            (out_of_range_check, opencl_profiling, vlog_level,
-             valgrind_lib, valgrind_bin, valgrind_args,
-             device_bin_full_path, args),
-            _out=process_output,
-            _bg=True,
-            _err_to_out=True)
-        p.wait()
-        return "".join(stdout_buff)
+################################
+# Toolchain
+################################
+def asan_rt_library_names(abi):
+    """Return the AddressSanitizer runtime library file name for abi.
+
+    Only "armeabi-v7a" and "arm64-v8a" are known; any other abi raises
+    KeyError.
+    """
+    runtime_by_abi = dict((
+        ("armeabi-v7a", "libclang_rt.asan-arm-android.so"),
+        ("arm64-v8a", "libclang_rt.asan-aarch64-android.so"),
+    ))
+    return runtime_by_abi[abi]
+
+
+def find_asan_rt_library(abi, asan_rt_path=''):
+    """Return the host path of the AddressSanitizer runtime library for abi.
+
+    When asan_rt_path is given, the result is that directory joined with the
+    library file name. Otherwise the tree under $ANDROID_NDK_HOME is searched
+    with `find` and the first match is returned.
+
+    Raises KeyError if abi is not a known ABI or, when asan_rt_path is empty,
+    if ANDROID_NDK_HOME is not set in the environment.
+    """
+    if not asan_rt_path:
+        find_path = os.environ['ANDROID_NDK_HOME']
+        candidates = split_stdout(sh.find(find_path, "-name",
+                                          asan_rt_library_names(abi)))
+        if not candidates:
+            # NOTE(review): assumes MaceLogger.error aborts the run;
+            # otherwise candidates[0] below raises IndexError -- confirm.
+            common.MaceLogger.error(
+                "Toolchain",
+                "Can't find AddressSanitizer runtime library in %s" %
+                find_path)
+        elif len(candidates) > 1:
+            common.MaceLogger.info(
+                "More than one AddressSanitizer runtime library, use the 1st")
+        return candidates[0]
+    return "%s/%s" % (asan_rt_path, asan_rt_library_names(abi))


 ################################
 # bazel commands
 ################################
 def bazel_build(target,
-                strip="always",
                abi="armeabi-v7a",
-                production_mode=False,
                hexagon_mode=False,
-                disable_no_tuning_warning=False,
-                debug=False,
                enable_openmp=True,
-                enable_neon=True):
+                enable_neon=True,
+                address_sanitizer=False):
    print("* Build %s with ABI %s" % (target, abi))
    stdout_buff = []
    process_output = make_output_processor(stdout_buff)
    if abi == "host":
        bazel_args = (
            "build",
-            "-c",
-            "opt",
-            "--strip",
-            strip,
-            "--verbose_failures",
-            target,
-            "--copt=-std=c++11",
-            "--copt=-D_GLIBCXX_USE_C99_MATH_TR1",
-            "--copt=-O3",
            "--define",
            "openmp=%s" % str(enable_openmp).lower(),
-            "--define",
-            "production=%s" % str(production_mode).lower(),
+            target,
        )
-        p = sh.bazel(
-            *bazel_args,
-            _out=process_output,
-            _bg=True,
-            _err_to_out=True)
-        p.wait()
    else:
        bazel_args = (
            "build",
-            "-c",
-            "opt",
-            "--strip",
-            strip,
-            "--verbose_failures",
            target,
-            "--crosstool_top=//external:android/crosstool",
-            "--host_crosstool_top=@bazel_tools//tools/cpp:toolchain",
+            "--config",
+            "android",
            "--cpu=%s" % abi,
-            "--copt=-std=c++11",
-            "--copt=-D_GLIBCXX_USE_C99_MATH_TR1",
-            "--copt=-DMACE_OBFUSCATE_LITERALS",
-            "--copt=-O3",
            "--define",
            "neon=%s" % str(enable_neon).lower(),
            "--define",
            "openmp=%s" % str(enable_openmp).lower(),
            "--define",
-            "production=%s" % str(production_mode).lower(),
-            "--define",
            "hexagon=%s" % str(hexagon_mode).lower())
-        if disable_no_tuning_warning:
-            bazel_args += ("--copt=-DMACE_DISABLE_NO_TUNING_WARNING",)
-        if debug:
-            bazel_args += ("--copt=-g",)
-        p = sh.bazel(
-            _out=process_output,
-            _bg=True,
-            _err_to_out=True,
-            *bazel_args)
-        p.wait()
-    print("Building done!\n")
+    if address_sanitizer:
+        bazel_args += ("--config", "asan")
+    else:
+        bazel_args += ("--config", "optimization")
+    p = sh.bazel(
+        _out=process_output,
+        _bg=True,
+        _err_to_out=True,
+        *bazel_args)
+    p.wait()
+    print("Build done!\n")
    return "".join(stdout_buff)


@@ -461,6 +442,7 @@ def gen_model_code(model_codegen_dir,
                   model_file_path,
                   weight_file_path,
                   model_sha256_checksum,
+                   weight_sha256_checksum,
                   input_nodes,
                   output_nodes,
                   runtime,
@@ -470,10 +452,8 @@ def gen_model_code(model_codegen_dir,
                   embed_model_data,
                   fast_conv,
                   obfuscate,
-                   model_output_dir,
-                   model_load_type,
-                   gpu_data_type):
-    print("* Genearte model code")
+                   model_build_type,
+                   data_type):
    bazel_build_common("//mace/python/tools:converter")

    if os.path.exists(model_codegen_dir):
@@ -488,6 +468,7 @@ def gen_model_code(model_codegen_dir,
                  "--model_file=%s" % model_file_path,
                  "--weight_file=%s" % weight_file_path,
                  "--model_checksum=%s" % model_sha256_checksum,
+                  "--weight_checksum=%s" % weight_sha256_checksum,
                  "--input_node=%s" % input_nodes,
                  "--output_node=%s" % output_nodes,
                  "--runtime=%s" % runtime,
@@ -498,15 +479,13 @@ def gen_model_code(model_codegen_dir,
                  "--embed_model_data=%s" % embed_model_data,
                  "--winograd=%s" % fast_conv,
                  "--obfuscate=%s" % obfuscate,
-                  "--codegen_output=%s/model.cc" % model_codegen_dir,
-                  "--pb_output=%s/%s.pb" % (model_output_dir, model_tag),
-                  "--model_load_type=%s" % model_load_type,
-                  "--gpu_data_type=%s" % gpu_data_type,
+                  "--output_dir=%s" % model_codegen_dir,
+                  "--model_build_type=%s" % model_build_type,
+                  "--data_type=%s" % data_type,
                  _out=process_output,
                  _bg=True,
                  _err_to_out=True)
    p.wait()
-    print("Model code gen done!\n")


 def gen_random_input(model_output_dir,
@@ -551,22 +530,25 @@ def gen_random_input(model_output_dir,
                    sh.cp("-f", input_file_list[i], dst_input_file)


-def update_mace_run_lib(model_output_dir,
-                        model_load_type,
-                        model_tag,
-                        embed_model_data):
+def update_mace_run_lib(model_output_dir):
    mace_run_filepath = model_output_dir + "/mace_run"
    if os.path.exists(mace_run_filepath):
        sh.rm("-rf", mace_run_filepath)
    sh.cp("-f", "bazel-bin/mace/tools/validation/mace_run", model_output_dir)

-    if embed_model_data == 0:
-        sh.cp("-f", "mace/codegen/models/%s/%s.data" % (model_tag, model_tag),
-              model_output_dir)

-    if model_load_type == "source":
-        sh.cp("-f", "mace/codegen/models/%s/%s.h" % (model_tag, model_tag),
-              model_output_dir)
+def mv_model_file_to_output_dir(
+        model_build_type,
+        model_codegen_dir,
+        model_name,
+        output_dir):
+    """Move a model's generated files from model_codegen_dir to output_dir.
+
+    "<model_name>.data" is always moved; "<model_name>.pb" is moved first
+    when model_build_type is BuildType.proto.
+    """
+    model_files = ['%s/%s.data' % (model_codegen_dir, model_name)]
+    if model_build_type == BuildType.proto:
+        # Keep the original order: the .pb file goes before the .data file.
+        model_files.insert(0, '%s/%s.pb' % (model_codegen_dir, model_name))
+    for model_file in model_files:
+        sh.mv("-f", model_file, output_dir)


 def create_internal_storage_dir(serialno, phone_data_dir):
@@ -577,6 +559,7 @@ def create_internal_storage_dir(serialno, phone_data_dir):

 def tuning_run(abi,
               serialno,
+               mace_run_dir,
               vlog_level,
               embed_model_data,
               model_output_dir,
@@ -593,33 +576,31 @@ def tuning_run(abi,
               tuning,
               out_of_range_check,
               phone_data_dir,
+               build_type,
               omp_num_threads=-1,
               cpu_affinity_policy=1,
               gpu_perf_hint=3,
               gpu_priority_hint=3,
-               runtime_failure_ratio=0.0,
-               valgrind=False,
-               valgrind_path="/data/local/tmp/valgrind",
-               valgrind_args="",
               input_file_name="model_input",
-               output_file_name="model_out"):
+               output_file_name="model_out",
+               runtime_failure_ratio=0.0,
+               address_sanitizer=False):
    print("* Run '%s' with round=%s, restart_round=%s, tuning=%s, "
          "out_of_range_check=%s, omp_num_threads=%s, cpu_affinity_policy=%s, "
          "gpu_perf_hint=%s, gpu_priority_hint=%s" %
          (model_tag, running_round, restart_round, str(tuning),
           str(out_of_range_check), omp_num_threads, cpu_affinity_policy,
           gpu_perf_hint, gpu_priority_hint))
+    mace_model_path = ""
+    if build_type == BuildType.proto:
+        mace_model_path = "%s/%s.pb" % (mace_model_dir, model_tag)
    if abi == "host":
-        if mace_model_dir:
-            mace_model_path = "%s/%s.pb" % (mace_model_dir, model_tag)
-        else:
-            mace_model_path = ""
        p = subprocess.Popen(
            [
                "env",
                "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level,
                "MACE_RUNTIME_FAILURE_RATIO=%f" % runtime_failure_ratio,
-                "%s/mace_run" % model_output_dir,
+                "%s/mace_run" % mace_run_dir,
                "--model_name=%s" % model_tag,
                "--input_node=%s" % ",".join(input_nodes),
                "--output_node=%s" % ",".join(output_nodes),
@@ -627,7 +608,7 @@ def tuning_run(abi,
                "--output_shape=%s" % ":".join(output_shapes),
                "--input_file=%s/%s" % (model_output_dir, input_file_name),
                "--output_file=%s/%s" % (model_output_dir, output_file_name),
-                "--model_data_file=%s/%s.data" % (model_output_dir, model_tag),
+                "--model_data_file=%s/%s.data" % (mace_model_dir, model_tag),
                "--device=%s" % device_type,
                "--round=%s" % running_round,
                "--restart_round=%s" % restart_round,
@@ -654,21 +635,25 @@ def tuning_run(abi,
                                                        input_name)
            adb_push("%s/%s" % (model_output_dir, formatted_name),
                     phone_data_dir, serialno)
-        adb_push("%s/mace_run" % model_output_dir, phone_data_dir,
-                 serialno)
+        if address_sanitizer:
+            adb_push(find_asan_rt_library(abi), phone_data_dir, serialno)
+
        if not embed_model_data:
-            adb_push("%s/%s.data" % (model_output_dir, model_tag),
+            adb_push("%s/%s.data" % (mace_model_dir, model_tag),
                     phone_data_dir, serialno)
+
        adb_push("third_party/nnlib/libhexagon_controller.so",
                 phone_data_dir, serialno)

-        if mace_model_dir:
-            mace_model_path = "%s/%s.pb" % (phone_data_dir, model_tag)
-            adb_push("%s/%s.pb" % (mace_model_dir, model_tag),
-                     mace_model_path,
+        mace_model_phone_path = ""
+        if build_type == BuildType.proto:
+            mace_model_phone_path = "%s/%s.pb" % (phone_data_dir, model_tag)
+            adb_push(mace_model_path,
+                     mace_model_phone_path,
                     serialno)
-        else:
-            mace_model_path = ""
+
+        adb_push("%s/mace_run" % mace_run_dir, phone_data_dir,
+                 serialno)

        stdout_buff = []
        process_output = make_output_processor(stdout_buff)
@@ -682,11 +667,10 @@ def tuning_run(abi,
            "MACE_LIMIT_OPENCL_KERNEL_TIME=%s" % limit_opencl_kernel_time,
            "MACE_RUNTIME_FAILURE_RATIO=%f" % runtime_failure_ratio,
        ]
-        if valgrind:
+        if address_sanitizer:
            adb_cmd.extend([
-                "VALGRIND_LIB=%s" % valgrind_path + "/lib/valgrind",
-                valgrind_path + "/bin/valgrind",
-                valgrind_args
+                "LD_PRELOAD=%s/%s" % (phone_data_dir,
+                                      asan_rt_library_names(abi))
            ])
        adb_cmd.extend([
            "%s/mace_run" % phone_data_dir,
@@ -705,7 +689,7 @@ def tuning_run(abi,
            "--cpu_affinity_policy=%s" % cpu_affinity_policy,
            "--gpu_perf_hint=%s" % gpu_perf_hint,
            "--gpu_priority_hint=%s" % gpu_priority_hint,
-            "--model_file=%s" % mace_model_path,
+            "--model_file=%s" % mace_model_phone_path,
        ])
        adb_cmd = ' '.join(adb_cmd)
        p = sh.adb(
@@ -756,7 +740,6 @@ def validate_model(abi,
    elif platform == "caffe":
        image_name = "mace-caffe:latest"
        container_name = "mace_caffe_validator"
-        res_file = "validation.result"

        if caffe_env == common.CaffeEnvType.LOCAL:
            import imp
@@ -843,60 +826,51 @@ def validate_model(abi,
    print("Validation done!\n")


-def build_production_code(model_load_type, abi):
+def build_host_libraries(model_build_type, abi):
+    bazel_build("@com_google_protobuf//:protobuf_lite", abi=abi)
+    bazel_build("//mace/proto:mace_cc", abi=abi)
    bazel_build("//mace/codegen:generated_opencl", abi=abi)
    bazel_build("//mace/codegen:generated_tuning_params", abi=abi)
-    if abi == 'host':
-        if model_load_type == "source":
-            bazel_build(
-                "//mace/codegen:generated_models",
-                abi=abi)
-        else:
-            bazel_build("//mace/core:core", abi=abi)
-            bazel_build("//mace/ops:ops", abi=abi)
+    bazel_build("//mace/codegen:generated_version", abi=abi)
+    bazel_build("//mace/utils:utils", abi=abi)
+    bazel_build("//mace/core:core", abi=abi)
+    bazel_build("//mace/kernels:kernels", abi=abi)
+    bazel_build("//mace/ops:ops", abi=abi)
+    if model_build_type == BuildType.code:
+        bazel_build(
+            "//mace/codegen:generated_models",
+            abi=abi)


 def merge_libs(target_soc,
               abi,
               project_name,
-               libmace_output_dir,
-               model_output_dirs,
-               mace_model_dirs_kv,
-               model_load_type,
-               hexagon_mode,
-               embed_model_data):
+               build_output_dir,
+               library_output_dir,
+               model_build_type,
+               hexagon_mode):
    print("* Merge mace lib")
-    project_output_dir = "%s/%s" % (libmace_output_dir, project_name)
+    project_output_dir = "%s/%s" % (build_output_dir, project_name)
    model_header_dir = "%s/include/mace/public" % project_output_dir
-    model_data_dir = "%s/data" % project_output_dir
    hexagon_lib_file = "third_party/nnlib/libhexagon_controller.so"
-    model_bin_dir = "%s/%s/" % (project_output_dir, abi)
-
-    if not os.path.exists(model_bin_dir):
-        sh.mkdir("-p", model_bin_dir)
-    if not os.path.exists(model_header_dir):
-        sh.mkdir("-p", model_header_dir)
+    model_bin_dir = "%s/%s/%s/" % (project_output_dir, library_output_dir, abi)
+
+    if os.path.exists(model_bin_dir):
+        sh.rm("-rf", model_bin_dir)
+    sh.mkdir("-p", model_bin_dir)
+    if os.path.exists(model_header_dir):
+        sh.rm("-rf", model_header_dir)
+    sh.mkdir("-p", model_header_dir)
+    # copy header files
    sh.cp("-f", glob.glob("mace/public/*.h"), model_header_dir)
-    if not os.path.exists(model_data_dir):
-        sh.mkdir("-p", model_data_dir)
    if hexagon_mode:
        sh.cp("-f", hexagon_lib_file, model_bin_dir)

-    if model_load_type == "source":
+    if model_build_type == BuildType.code:
        sh.cp("-f", glob.glob("mace/codegen/engine/*.h"), model_header_dir)
+        sh.cp("-f", glob.glob("mace/codegen/models/*/*.h"), model_header_dir)

-    for model_output_dir in model_output_dirs:
-        if not embed_model_data:
-            sh.cp("-f", glob.glob("%s/*.data" % model_output_dir),
-                  model_data_dir)
-        if model_load_type == "source":
-            sh.cp("-f", glob.glob("%s/*.h" % model_output_dir),
-                  model_header_dir)
-
-    for model_name in mace_model_dirs_kv:
-        sh.cp("-f", "%s/%s.pb" % (mace_model_dirs_kv[model_name], model_name),
-              model_data_dir)
-
+    # make static library
    mri_stream = ""
    if abi == "host":
        mri_stream += "create %s/libmace_%s.a\n" % \
@@ -907,21 +881,39 @@ def merge_libs(target_soc,
        mri_stream += (
            "addlib "
            "bazel-bin/mace/codegen/libgenerated_tuning_params.pic.a\n")
-        if model_load_type == "source":
+        mri_stream += (
+            "addlib "
+            "bazel-bin/mace/codegen/libgenerated_version.pic.a\n")
+        mri_stream += (
+            "addlib "
+            "bazel-bin/mace/core/libcore.pic.a\n")
+        mri_stream += (
+            "addlib "
+            "bazel-bin/mace/kernels/libkernels.pic.a\n")
+        mri_stream += (
+            "addlib "
+            "bazel-bin/mace/utils/libutils.pic.a\n")
+        mri_stream += (
+            "addlib "
+            "bazel-bin/mace/proto/libmace_cc.pic.a\n")
+        mri_stream += (
+            "addlib "
+            "bazel-bin/external/com_google_protobuf/libprotobuf_lite.pic.a\n")
+        mri_stream += (
+            "addlib "
+            "bazel-bin/mace/ops/libops.pic.lo\n")
+        if model_build_type == BuildType.code:
            mri_stream += (
                "addlib "
                "bazel-bin/mace/codegen/libgenerated_models.pic.a\n")
-        else:
-            mri_stream += (
-                "addlib "
-                "bazel-bin/mace/core/libcore.pic.a\n")
-            mri_stream += (
-                "addlib "
-                "bazel-bin/mace/ops/libops.pic.lo\n")
    else:
-        mri_stream += "create %s/libmace_%s.%s.a\n" % \
-                      (model_bin_dir, project_name, target_soc)
-        if model_load_type == "source":
+        if not target_soc:
+            mri_stream += "create %s/libmace_%s.a\n" % \
+                          (model_bin_dir, project_name)
+        else:
+            mri_stream += "create %s/libmace_%s.%s.a\n" % \
+                          (model_bin_dir, project_name, target_soc)
+        if model_build_type == BuildType.code:
            mri_stream += (
                "addlib "
                "bazel-bin/mace/codegen/libgenerated_models.a\n")
@@ -943,9 +935,6 @@ def merge_libs(target_soc,
        mri_stream += (
            "addlib "
            "bazel-bin/mace/utils/libutils.a\n")
-        mri_stream += (
-            "addlib "
-            "bazel-bin/mace/utils/libutils_prod.a\n")
        mri_stream += (
            "addlib "
            "bazel-bin/mace/proto/libmace_cc.a\n")
@@ -985,7 +974,7 @@ def packaging_lib(libmace_output_dir, project_name):
            "%s" % tar_package_path,
            glob.glob("%s/*" % project_dir),
            "--exclude",
-            "%s/build" % project_dir,
+            "%s/_tmp" % project_dir,
            _out=process_output,
            _bg=True,
            _err_to_out=True)
@@ -994,21 +983,15 @@ def packaging_lib(libmace_output_dir, project_name):


 def build_benchmark_model(abi,
-                          embed_model_data,
                          model_output_dir,
-                          model_tag,
                          hexagon_mode):
    benchmark_binary_file = "%s/benchmark_model" % model_output_dir
    if os.path.exists(benchmark_binary_file):
        sh.rm("-rf", benchmark_binary_file)
-    if not embed_model_data:
-        sh.cp("-f", "mace/codegen/models/%s/%s.data" % (model_tag, model_tag),
-              model_output_dir)

    benchmark_target = "//mace/benchmark:benchmark_model"
    bazel_build(benchmark_target,
                abi=abi,
-                production_mode=True,
                hexagon_mode=hexagon_mode)

    target_bin = "/".join(bazel_target_to_bin(benchmark_target))
@@ -1017,6 +1000,7 @@ def build_benchmark_model(abi,

 def benchmark_model(abi,
                    serialno,
+                    benchmark_binary_dir,
                    vlog_level,
                    embed_model_data,
                    model_output_dir,
@@ -1028,6 +1012,7 @@ def benchmark_model(abi,
                    model_tag,
                    device_type,
                    phone_data_dir,
+                    build_type,
                    omp_num_threads=-1,
                    cpu_affinity_policy=1,
                    gpu_perf_hint=3,
@@ -1037,23 +1022,22 @@ def benchmark_model(abi,

    stdout_buff = []
    process_output = make_output_processor(stdout_buff)
+    mace_model_path = ""
+    if build_type == BuildType.proto:
+        mace_model_path = "%s/%s.pb" % (mace_model_dir, model_tag)
    if abi == "host":
-        if mace_model_dir:
-            mace_model_path = "%s/%s.pb" % (mace_model_dir, model_tag)
-        else:
-            mace_model_path = ""
        p = subprocess.Popen(
            [
                "env",
                "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level,
-                "%s/benchmark_model" % model_output_dir,
+                "%s/benchmark_model" % benchmark_binary_dir,
                "--model_name=%s" % model_tag,
                "--input_node=%s" % ",".join(input_nodes),
                "--output_node=%s" % ",".join(output_nodes),
                "--input_shape=%s" % ":".join(input_shapes),
                "--output_shape=%s" % ":".join(output_shapes),
                "--input_file=%s/%s" % (model_output_dir, input_file_name),
-                "--model_data_file=%s/%s.data" % (model_output_dir, model_tag),
+                "--model_data_file=%s/%s.data" % (mace_model_dir, model_tag),
                "--device=%s" % device_type,
                "--omp_num_threads=%s" % omp_num_threads,
                "--cpu_affinity_policy=%s" % cpu_affinity_policy,
@@ -1072,18 +1056,17 @@ def benchmark_model(abi,
                                                        input_name)
            adb_push("%s/%s" % (model_output_dir, formatted_name),
                     phone_data_dir, serialno)
-        adb_push("%s/benchmark_model" % model_output_dir, phone_data_dir,
-                 serialno)
        if not embed_model_data:
-            adb_push("%s/%s.data" % (model_output_dir, model_tag),
+            adb_push("%s/%s.data" % (mace_model_dir, model_tag),
                     phone_data_dir, serialno)
-        if mace_model_dir:
-            mace_model_path = "%s/%s.pb" % (phone_data_dir, model_tag)
-            adb_push("%s/%s.pb" % (mace_model_dir, model_tag),
-                     mace_model_path,
+        mace_model_phone_path = ""
+        if build_type == BuildType.proto:
+            mace_model_phone_path = "%s/%s.pb" % (phone_data_dir, model_tag)
+            adb_push(mace_model_path,
+                     mace_model_phone_path,
                     serialno)
-        else:
-            mace_model_path = ""
+        adb_push("%s/benchmark_model" % benchmark_binary_dir, phone_data_dir,
+                 serialno)

        p = sh.adb(
            "-s",
@@ -1108,7 +1091,7 @@ def benchmark_model(abi,
            "--cpu_affinity_policy=%s" % cpu_affinity_policy,
            "--gpu_perf_hint=%s" % gpu_perf_hint,
            "--gpu_priority_hint=%s" % gpu_priority_hint,
-            "--model_file=%s" % mace_model_path,
+            "--model_file=%s" % mace_model_phone_path,
            _out=process_output,
            _bg=True,
            _err_to_out=True)

--- a/tools/validate.py
+++ b/tools/validate.py
@@ -36,6 +36,8 @@ import common
 #        --input_shape 1,64,64,3 \
 #        --output_shape 1,64,64,2

+VALIDATION_MODULE = 'VALIDATION'
+

 def load_data(file):
    if os.path.isfile(file):
@@ -51,18 +53,21 @@ def compare_output(platform, device_type, output_name, mace_out_value,
        mace_out_value = mace_out_value.reshape(-1)
        assert len(out_value) == len(mace_out_value)
        similarity = (1 - spatial.distance.cosine(out_value, mace_out_value))
-        print output_name, 'MACE VS', platform.upper(
-        ), 'similarity: ', similarity
+        common.MaceLogger.summary(
+            output_name + ' MACE VS ' + platform.upper()
+            + ' similarity: ' + str(similarity))
        if (device_type == "CPU" and similarity > 0.999) or \
            (device_type == "GPU" and similarity > 0.995) or \
                (device_type == "HEXAGON" and similarity > 0.930):
-            print '===================Similarity Test Passed=================='
+            common.MaceLogger.summary(
+                common.StringFormatter.block("Similarity Test Passed"))
        else:
-            print '===================Similarity Test Failed=================='
-            sys.exit(-1)
+            common.MaceLogger.error(
+                "", common.StringFormatter.block("Similarity Test Failed"))
    else:
-        print '=======================Skip empty node==================='
-        sys.exit(-1)
+        common.MaceLogger.error(
+            "", common.StringFormatter.block(
+                "Similarity Test failed because of empty output"))


 def normalize_tf_tensor_name(name):
@@ -76,8 +81,9 @@ def validate_tf_model(platform, device_type, model_file, input_file,
                      mace_out_file, input_names, input_shapes, output_names):
    import tensorflow as tf
    if not os.path.isfile(model_file):
-        print("Input graph file '" + model_file + "' does not exist!")
-        sys.exit(-1)
+        common.MaceLogger.error(
+            VALIDATION_MODULE,
+            "Input graph file '" + model_file + "' does not exist!")

    tf.reset_default_graph()
    input_graph_def = tf.GraphDef()
@@ -118,11 +124,13 @@ def validate_caffe_model(platform, device_type, model_file, input_file,
    os.environ['GLOG_minloglevel'] = '1'  # suppress Caffe verbose prints
    import caffe
    if not os.path.isfile(model_file):
-        print("Input graph file '" + model_file + "' does not exist!")
-        sys.exit(-1)
+        common.MaceLogger.error(
+            VALIDATION_MODULE,
+            "Input graph file '" + model_file + "' does not exist!")
    if not os.path.isfile(weight_file):
-        print("Input weight file '" + weight_file + "' does not exist!")
-        sys.exit(-1)
+        common.MaceLogger.error(
+            VALIDATION_MODULE,
+            "Input weight file '" + weight_file + "' does not exist!")

    caffe.set_mode_cpu()