Commit ced4a49d authored by liuqi

Refactor mace_tools and yaml format for better usage.

Parent fe0cdf27
......@@ -68,4 +68,5 @@ platform_compitable_tests:
stage: platform_compitable_tests
script:
- mkdir -p mace/codegen/version && bash mace/tools/git/gen_version_source.sh mace/codegen/version/version.cc
- mkdir -p mace/codegen/tuning && python mace/python/tools/binary_codegen.py --output_path=mace/codegen/tuning/tuning_params.cc
- bazel build mace/core:core
......@@ -46,14 +46,14 @@ Configurations
- The SHA256 checksum of the model file
* - weight_sha256_checksum
- The SHA256 checksum of the weight file, used by Caffe models
* - input_nodes
- The input node names, one or more strings
* - output_nodes
- The output node names, one or more strings
* - input_tensors
- The input tensor names (TensorFlow) or the top names of the input layers (Caffe); one or more strings
* - output_tensors
- The output tensor names (TensorFlow) or the top names of the output layers (Caffe); one or more strings
* - input_shapes
- The shapes of the input nodes, in NHWC order
- The shapes of the input tensors, in NHWC order
* - output_shapes
- The shapes of the output nodes, in NHWC order
- The shapes of the output tensors, in NHWC order
* - runtime
- The running device, one of CPU, GPU or DSP
* - limit_opencl_kernel_time
......
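For reference, a minimal sketch of loading and sanity-checking these fields with PyYAML (an assumed dependency; the ``models/config.yaml`` path matches the examples later in this document):

.. code:: python

    import yaml  # PyYAML, assumed available

    with open("models/config.yaml") as f:
        config = yaml.safe_load(f)

    for tag, model in config["models"].items():
        for subgraph in model["subgraphs"]:
            shapes = subgraph["input_shapes"]
            if isinstance(shapes, str):  # single-input models use a scalar
                shapes = [shapes]
            for shape in shapes:
                # shapes are written in NHWC order, e.g. "1,64,64,3"
                assert len(shape.split(",")) == 4, shape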
......@@ -126,6 +126,7 @@ Tool <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/grap
strip_unused_nodes(type=float, shape="1,64,64,3")
remove_nodes(op=Identity, op=CheckNumerics)
fold_constants(ignore_errors=true)
flatten_atrous_conv
fold_batch_norms
fold_old_batch_norms
strip_unused_nodes
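For TensorFlow models, the same transform list can be applied from Python through TF 1.x's ``graph_transforms`` module; a sketch, with the model path and node names as placeholders borrowed from the examples in this document:

.. code:: python

    import tensorflow as tf
    from tensorflow.tools.graph_transforms import TransformGraph

    graph_def = tf.GraphDef()
    with tf.gfile.GFile("path/to/model64.pb", "rb") as f:
        graph_def.ParseFromString(f.read())

    # the transforms listed above
    transforms = [
        'strip_unused_nodes(type=float, shape="1,64,64,3")',
        'remove_nodes(op=Identity, op=CheckNumerics)',
        'fold_constants(ignore_errors=true)',
        'flatten_atrous_conv',
        'fold_batch_norms',
        'fold_old_batch_norms',
        'strip_unused_nodes',
    ]
    optimized = TransformGraph(graph_def, ["input_node"], ["output_node"],
                               transforms)
    with tf.gfile.GFile("path/to/model64_opt.pb", "wb") as f:
        f.write(optimized.SerializeToString())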
......@@ -171,38 +172,110 @@ Caffe currently supports only the latest version; upgrade models from older versions with Caffe's own tools.
3.2 Run the ``tools/mace_tools.py`` script
**Commands**
**build**
.. note::
Build the model static libraries and test tools.
* *--config* (type=str, default="", required): path to the model configuration yaml file.
* *--tuning* (optional): whether to tune GPU parameters for the specific SoC.
* *--enable_openmp* (optional): whether to enable OpenMP.
**run**
.. note::
Run the model from the command line.
* *--config* (type=str, default="", required): path to the model configuration yaml file.
* *--round* (type=int, default=1, optional): number of times to run the model.
* *--validate* (optional): whether to validate that the results match those of the original framework.
* *--caffe_env* (type=local/docker, default=docker, optional): Caffe environment used during validation; local uses a local installation, docker uses a Docker container.
* *--restart_round* (type=int, default=1, optional): number of times to restart the model.
* *--check_gpu_out_of_memory* (optional): whether to check for GPU out-of-bounds memory access.
* *--vlog_level* (type=int[0-5], default=0, optional): verbose log level.
.. warning::
run depends on the build command; run can only be executed after build has completed.
**benchmark**
* *--config* (type=str, default="", required): path to the model configuration yaml file.
.. warning::
benchmark depends on the build command.
**Common arguments**
.. list-table::
:widths: auto
:header-rows: 1
:align: left
* - argument(key)
- argument(value)
- default
- required
- commands
- explanation
* - --omp_num_threads
- int
- -1
- N
- run/benchmark
- number of threads
* - --cpu_affinity_policy
- int
- 1
- N
- run/benchmark
- 0:AFFINITY_NONE/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY
* - --gpu_perf_hint
- int
- 3
- N
- run/benchmark
- 0:DEFAULT/1:LOW/2:NORMAL/3:HIGH
* - --gpu_priority_hint
- int
- 3
- N
- run/benchmark
- 0:DEFAULT/1:LOW/2:NORMAL/3:HIGH
.. code:: sh
# print help message
# python tools/mace_tools.py --help
# --config      path to the model configuration file
# --output_dir  output directory for build results, default `./build`
# --round       number of times `examples/mace_run` runs the model, default `1`
# --tuning      tune OpenCL parameters; usually only needed by developers, default `true`
# --mode        run mode, one of build/run/validate/merge/all/benchmark, default `all`
python tools/mace_tools.py -h
python tools/mace_tools.py build -h
python tools/mace_tools.py run -h
python tools/mace_tools.py benchmark -h
# Only build the models and generate the static library
python tools/mace_tools.py --config=models/config.yaml --mode=build
python tools/mace_tools.py build --config=models/config.yaml
# Measure the model's run time
python tools/mace_tools.py --config=models/config.yaml --mode=run --round=1000
python tools/mace_tools.py run --config=models/config.yaml --round=100
# Compare the built model's results on MACE against running it directly with TensorFlow or Caffe; similarity is expressed as a cosine distance
# With OpenCL devices a similarity of at least `0.995` passes by default; on DSP devices the similarity must reach `0.930`.
python tools/mace_tools.py --config=models/config.yaml --mode=run --round=1000
# Merge multiple already-built models into a single static library
# e.g. with 8 models built but only 2 needed, edit the global configuration file and merge without rebuilding
python tools/mace_tools.py --config=models/config.yaml --mode=merge
# Run all of the above (useful for speed tests; round=20 is recommended)
python tools/mace_tools.py --config=models/config.yaml --mode=all --round=1000
python tools/mace_tools.py run --config=models/config.yaml --validate
# Model benchmark: view each op's run time
python tools/mace_tools.py --config=models/config.yaml --mode=benchmark
python tools/mace_tools.py benchmark --config=models/config.yaml
# Check the model's memory usage at runtime (with multiple models configured, you may need to comment out all but one model's configuration)
python tools/mace_tools.py --config=models/config.yaml --mode=run --round=10000 &
python tools/mace_tools.py run --config=models/config.yaml --round=10000 &
adb shell dumpsys meminfo | grep mace_run
sleep 10
kill %1
......@@ -211,21 +284,34 @@ Caffe currently supports only the latest version; upgrade models from older versions with Caffe's own tools.
Through the preceding steps we obtain the library files containing the application models. In application code, only the following groups of files need to be included (``./build/`` is the default build output directory):
Header files (mace.h and each model's header): \*
``./build/${project_name}/${target_abi}/include/mace/public/*.h``
**Header files**
* ``./build/${library_name}/include/mace/public/*.h``
**Static libraries**
* ``./build/${library_name}/library/${target_abi}/*.a``
**Dynamic library**
* ``./build/${library_name}/library/${target_abi}/libhexagon_controller.so``
.. note::
Only needed when the built models include DSP mode.
**Model files**
* ``./build/${library_name}/model/${MODEL_TAG}.pb``
* ``./build/${library_name}/model/${MODEL_TAG}.data``
.. note::
Static libraries (including the mace engine, opencl and model-related libraries): \*
``./build/${project_name}/${target_abi}/*.a``
The pb file is generated only when the model's build_type is set to proto.
Dynamic library (only needed when the built models include DSP mode): \*
``./build/${project_name}/${target_abi}/libhexagon_controller.so``
Model data file (generated only when EMBED\_MODEL\_DATA=0): \*
``./build/${project_name}/data/${MODEL_TAG}.data``
**Library tarball**
* ``./build/${library_name}/libmace_${library_name}.tar.gz``
Intermediate build files: \* ``./build/${project_name}/build/``
.. note::
Library tarball: \* ``./build/${project_name}/${project_name}.tar.gz``
This archive contains all of the files above and is ready for distribution.
5. Usage
......
# The configuration file name is used as the generated library's name: libmace-${filename}.a
# Name of the library
library_name: library_name
# library_name is used as the generated library's name: libmace-${library_name}.a
target_abis: [armeabi-v7a, arm64-v8a]
# SoC of the target device; obtain it with `adb shell getprop | grep ro.board.platform | cut -d [ -f3 | cut -d ] -f1`
target_socs: [msm8998]
embed_model_data: 1
build_type: code # model build type: code converts the model into source code, proto converts it into a protobuf file
models: # one configuration file may contain multiple models; the generated library will contain them all
first_net: # the model tag, used when invoking the model
first_net: # the model tag, used when invoking the model; must be unique
platform: tensorflow
model_file_path: path/to/model64.pb # also support http:// and https://
model_sha256_checksum: 7f7462333406e7dea87222737590ebb7d94490194d2f21a7d72bafa87e64e9f9
input_nodes: input_node
output_nodes: output_node
input_shapes: 1,64,64,3
output_shapes: 1,64,64,2
subgraphs:
- input_tensors: input_node
input_shapes: 1,64,64,3
output_tensors: output_node
output_shapes: 1,64,64,2
runtime: gpu
data_type: fp16_fp32
limit_opencl_kernel_time: 0
dsp_mode: 0
nnlib_graph_mode: 0
obfuscate: 1
fast_conv: 0
winograd: 0
input_files:
- path/to/input_files # support http://
second_net:
......@@ -25,22 +30,23 @@ models: # one configuration file may contain multiple models; the generated library will contain them all
weight_file_path: path/to/weight.caffemodel
model_sha256_checksum: 05d92625809dc9edd6484882335c48c043397aed450a168d75eb8b538e86881a
weight_sha256_checksum: 05d92625809dc9edd6484882335c48c043397aed450a168d75eb8b538e86881a
input_nodes:
- input_node0
- input_node1
output_nodes:
- output_node0
- output_node1
input_shapes:
- 1,256,256,3
- 1,128,128,3
output_shapes:
- 1,256,256,2
- 1,1,1,2
subgraphs:
- input_tensors:
- input_node0
- input_node1
input_shapes:
- 1,256,256,3
- 1,128,128,3
output_tensors:
- output_node0
- output_node1
output_shapes:
- 1,256,256,2
- 1,1,1,2
runtime: cpu
limit_opencl_kernel_time: 1
dsp_mode: 0
nnlib_graph_mode: 0
obfuscate: 1
fast_conv: 0
winograd: 0
input_files:
- path/to/input_files # support http://
......@@ -2,8 +2,6 @@
# Examples
load(
"//mace:mace.bzl",
"if_production_mode",
"if_not_production_mode",
"if_hexagon_enabled",
"if_openmp_enabled",
"if_android",
......
......@@ -12,8 +12,6 @@ load(
"if_android",
"if_hexagon_enabled",
"if_not_hexagon_enabled",
"if_production_mode",
"if_not_production_mode",
"if_openmp_enabled",
"if_neon_enabled",
)
......@@ -63,17 +61,13 @@ cc_library(
]),
deps = [
"//mace/codegen:generated_version",
"//mace/codegen:generated_tuning_params",
"//mace/proto:mace_cc",
"//mace/utils",
] + if_android([
":opencl_headers",
"//mace/codegen:generated_opencl",
"@half//:half",
]) + if_production_mode([
"//mace/codegen:generated_tuning_params",
"//mace/utils:utils_prod",
]) + if_not_production_mode([
"//mace/utils:utils_dev",
]) + if_hexagon_enabled([
"//third_party/nnlib:libhexagon",
]),
......
......@@ -284,16 +284,16 @@ const unsigned char *LoadModelData(const std::string &model_data_file,
const size_t &data_size) {
int fd = open(model_data_file.c_str(), O_RDONLY);
MACE_CHECK(fd >= 0, "Failed to open model data file ",
model_data_file, ", error code: ", errno);
model_data_file, ", error code: ", strerror(errno));
const unsigned char *model_data = static_cast<const unsigned char *>(
mmap(nullptr, data_size, PROT_READ, MAP_PRIVATE, fd, 0));
MACE_CHECK(model_data != MAP_FAILED, "Failed to map model data file ",
model_data_file, ", error code: ", errno);
model_data_file, ", error code: ", strerror(errno));
int ret = close(fd);
MACE_CHECK(ret == 0, "Failed to close model data file ",
model_data_file, ", error code: ", errno);
model_data_file, ", error code: ", strerror(errno));
return model_data;
}
......@@ -302,7 +302,8 @@ void UnloadModelData(const unsigned char *model_data,
const size_t &data_size) {
int ret = munmap(const_cast<unsigned char *>(model_data),
data_size);
MACE_CHECK(ret == 0, "Failed to unmap model data file, error code: ", errno);
MACE_CHECK(ret == 0, "Failed to unmap model data file, error code: ",
strerror(errno));
}
MaceStatus CreateMaceEngineFromProto(
......
......@@ -215,6 +215,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
(*kernel_error)->UnMap();
}
if (runtime->is_profiling_enabled()) {
event.wait();
CallStats tmp_stats;
runtime->GetCallStats(event, &tmp_stats);
call_stats.start_micros =
......@@ -223,8 +224,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
}
}
if (future != nullptr) {
future->wait_fn = [runtime, event, call_stats](CallStats *stats) {
event.wait();
future->wait_fn = [runtime, call_stats](CallStats *stats) {
if (stats != nullptr) {
stats->start_micros = call_stats.start_micros;
stats->end_micros = stats->start_micros + call_stats.end_micros;
......
......@@ -209,8 +209,9 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) {
std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = cache_size / kBaseGPUMemCacheSize;
uint64_t cache_size =
OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[2] =
std::min<uint32_t>(std::min<uint32_t>(gws[2], base), kwg_size / lws[1]);
......
......@@ -115,6 +115,7 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
kernel_error_->UnMap();
}
if (runtime->is_profiling_enabled()) {
event.wait();
CallStats tmp_stats;
runtime->GetCallStats(event, &tmp_stats);
call_stats.start_micros =
......@@ -123,8 +124,7 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
}
}
if (future != nullptr) {
future->wait_fn = [runtime, event, call_stats](CallStats *stats) {
event.wait();
future->wait_fn = [runtime, call_stats](CallStats *stats) {
if (stats != nullptr) {
stats->start_micros = call_stats.start_micros;
stats->end_micros = stats->start_micros + call_stats.end_micros;
......
......@@ -24,18 +24,6 @@ def if_android_arm64(a):
"//conditions:default": [],
})
def if_production_mode(a):
return select({
"//mace:production_mode": a,
"//conditions:default": [],
})
def if_not_production_mode(a):
return select({
"//mace:production_mode": [],
"//conditions:default": a,
})
def if_neon_enabled(a):
return select({
"//mace:neon_enabled": a,
......
......@@ -3,7 +3,6 @@ py_library(
srcs = [
"convert_util.py",
"graph_util.py",
"tensor_util.py",
"tf_dsp_converter_lib.py",
"converter_tool/base_converter.py",
"converter_tool/shape_inference.py",
......@@ -20,9 +19,9 @@ py_library(
)
py_library(
name = "source_converter_lib",
name = "model_saver_lib",
srcs = [
"source_converter_lib.py",
"model_saver.py",
],
srcs_version = "PY2AND3",
deps = [
......@@ -45,7 +44,7 @@ py_binary(
srcs_version = "PY2AND3",
deps = [
":converter_lib",
":source_converter_lib",
":model_saver_lib",
"@six_archive//:six",
],
)
......@@ -21,8 +21,7 @@ import copy
from mace.proto import mace_pb2
from mace.python.tools import tf_dsp_converter_lib
from mace.python.tools import memory_optimizer
from mace.python.tools import source_converter_lib
from mace.python.tools import tensor_util
from mace.python.tools import model_saver
from mace.python.tools.converter_tool import base_converter as cvt
from mace.python.tools.converter_tool import tensorflow_converter
from mace.python.tools.converter_tool import caffe_converter
......@@ -42,6 +41,20 @@ device_type_map = {'cpu': cvt.DeviceType.CPU.value,
'dsp': cvt.DeviceType.HEXAGON.value}
def parse_data_type(data_type, device_type):
if device_type == cvt.DeviceType.GPU.value:
if data_type == 'fp32_fp32':
return mace_pb2.DT_FLOAT
else:
return mace_pb2.DT_HALF
elif device_type == cvt.DeviceType.CPU.value:
return mace_pb2.DT_FLOAT
elif device_type == cvt.DeviceType.HEXAGON.value:
return mace_pb2.DT_UINT8
else:
print("Invalid device type: " + device_type)
def file_checksum(fname):
hash_func = hashlib.sha256()
with open(fname, "rb") as f:
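The body of file_checksum is truncated by this hunk; for reference, a standalone sketch of the SHA256 computation that fills the model_sha256_checksum / weight_sha256_checksum yaml fields (the chunk size is an arbitrary choice):

# Standalone sketch, not the file's own implementation: SHA256 of a model/weight file.
import hashlib

def sha256_of_file(fname):
    hash_func = hashlib.sha256()
    with open(fname, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            hash_func.update(chunk)
    return hash_func.hexdigest()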
......@@ -82,7 +95,7 @@ def main(unused_args):
if FLAGS.platform not in ['tensorflow', 'caffe']:
print ("platform %s is not supported." % FLAGS.platform)
sys.exit(-1)
if FLAGS.runtime not in ['cpu', 'gpu', 'dsp', '']:
if FLAGS.runtime not in ['cpu', 'gpu', 'dsp', 'cpu+gpu']:
print ("runtime %s is not supported." % FLAGS.runtime)
sys.exit(-1)
......@@ -114,7 +127,6 @@ def main(unused_args):
output_node.name = output_node_names[i]
option.add_output_node(output_node)
print("Convert model to mace model.")
if FLAGS.platform == 'tensorflow':
converter = tensorflow_converter.TensorflowConverter(
option, FLAGS.model_file)
......@@ -122,24 +134,18 @@ def main(unused_args):
converter = caffe_converter.CaffeConverter(option,
FLAGS.model_file,
FLAGS.weight_file)
else:
print("Mace do not support platorm %s yet." & FLAGS.platform)
exit(1)
output_graph_def = converter.run()
if FLAGS.gpu_data_type == 'half':
gpu_data_type = mace_pb2.DT_HALF
else:
gpu_data_type = mace_pb2.DT_FLOAT
device_data_type_map = {
cvt.DeviceType.CPU.value: mace_pb2.DT_FLOAT,
cvt.DeviceType.GPU.value: gpu_data_type,
cvt.DeviceType.HEXAGON.value: mace_pb2.DT_UINT8
}
print("Transform model to one that can better run on device")
if not FLAGS.runtime:
if FLAGS.runtime == 'cpu+gpu':
cpu_graph_def = copy.deepcopy(output_graph_def)
option.device = cvt.DeviceType.CPU.value
option.data_type = device_data_type_map[cvt.DeviceType.CPU.value]
option.data_type = parse_data_type(
FLAGS.data_type, cvt.DeviceType.CPU.value)
option.disable_transpose_filters()
mace_cpu_transformer = transformer.Transformer(
option, cpu_graph_def)
......@@ -149,7 +155,8 @@ def main(unused_args):
print "CPU memory optimization done."
option.device = cvt.DeviceType.GPU.value
option.data_type = device_data_type_map[cvt.DeviceType.GPU.value]
option.data_type = parse_data_type(
FLAGS.data_type, cvt.DeviceType.GPU.value)
option.enable_transpose_filters()
mace_gpu_transformer = transformer.Transformer(
option, output_graph_def)
......@@ -165,7 +172,8 @@ def main(unused_args):
print "Merge done"
else:
option.device = device_type_map[FLAGS.runtime]
option.data_type = device_data_type_map[option.device]
option.data_type = parse_data_type(
FLAGS.data_type, option.device)
mace_transformer = transformer.Transformer(
option, output_graph_def)
output_graph_def = mace_transformer.run()
......@@ -180,36 +188,13 @@ def main(unused_args):
print "Memory optimization done."
if FLAGS.obfuscate:
tensor_util.obfuscate_name(output_graph_def)
else:
tensor_util.rename_tensor(output_graph_def)
tensor_infos, model_data = tensor_util.get_tensor_info_and_model_data(
output_graph_def, FLAGS.runtime, FLAGS.gpu_data_type)
source_converter_lib.convert_to_source(
output_graph_def, model_checksum, weight_checksum, FLAGS.template,
FLAGS.obfuscate, FLAGS.model_tag, FLAGS.codegen_output,
FLAGS.runtime, FLAGS.embed_model_data, FLAGS.winograd,
FLAGS.model_load_type, tensor_infos, model_data)
if not FLAGS.embed_model_data:
output_dir = os.path.dirname(FLAGS.codegen_output) + '/'
with open(output_dir + FLAGS.model_tag + '.data', "wb") as f:
f.write(bytearray(model_data))
if FLAGS.model_load_type == 'pb':
tensor_util.del_tensor_data(
output_graph_def, FLAGS.runtime, FLAGS.gpu_data_type)
tensor_util.update_tensor_data_type(
output_graph_def, FLAGS.runtime, FLAGS.gpu_data_type)
with open(FLAGS.pb_output, "wb") as f:
f.write(output_graph_def.SerializeToString())
# with open(FLAGS.pb_output + '_txt', "wb") as f:
# # output_graph_def.ClearField('tensors')
# f.write(str(output_graph_def))
print("Model conversion is completed.")
model_saver.save_model(
output_graph_def, model_checksum, weight_checksum,
FLAGS.template_dir, FLAGS.obfuscate, FLAGS.model_tag,
FLAGS.output_dir, FLAGS.runtime,
FLAGS.embed_model_data,
FLAGS.winograd, FLAGS.data_type,
FLAGS.model_build_type)
def str2bool(v):
......@@ -244,15 +229,10 @@ def parse_args():
default="",
help="Weight file sha256 checksum")
parser.add_argument(
"--codegen_output",
"--output_dir",
type=str,
default="",
help="File to save the output graph to.")
parser.add_argument(
"--pb_output",
type=str,
default="",
help="File to save the mace model to.")
parser.add_argument(
"--runtime", type=str, default="", help="Runtime: cpu/gpu/dsp")
parser.add_argument(
......@@ -263,7 +243,7 @@ def parse_args():
parser.add_argument(
"--output_node", type=str, default="softmax", help="e.g., softmax")
parser.add_argument(
"--template", type=str, default="", help="template path")
"--template_dir", type=str, default="", help="template path")
parser.add_argument(
"--obfuscate",
type=str2bool,
......@@ -295,13 +275,16 @@ def parse_args():
default=True,
help="embed model data.")
parser.add_argument(
"--model_load_type",
"--model_build_type",
type=str,
default="source",
help="[source|pb] Load models in generated `source` code" +
"or `pb` file.")
default="code",
help="[proto|code] build models to code" +
"or `Protobuf` file.")
parser.add_argument(
"--gpu_data_type", type=str, default="half", help="half/float")
"--data_type",
type=str,
default="fp16_fp32",
help="fp16_fp32/fp32_fp32")
return parser.parse_known_args()
......
......@@ -395,12 +395,6 @@ class TensorflowConverter(base_converter.ConverterInterface):
align_corners_arg.i = tf_op.get_attr(tf_align_corners)
def convert_space_batch(self, tf_op):
print """You might want to try 'flatten_atrous_conv' in
transform graph to turn atrous conv2d into regular conv2d.
This may give you performance benefit on GPU.
(see https://github.com/tensorflow/tensorflow/blob/master/
tensorflow/tools/graph_transforms/README.md#flatten_atrous_conv)
"""
op = self.convert_general_op(tf_op)
del op.input[1:]
......
......@@ -52,7 +52,7 @@ class Transformer(base_converter.ConverterInterface):
"""
def __init__(self, option, model):
# DO NOT reorder the following transformers
# DO NOT change the order of the following transformers
self._registered_transformers_order = [
TransformerRule.REMOVE_USELESS_RESHAPE_OP,
TransformerRule.REMOVE_IDENTITY_OP,
......@@ -940,8 +940,9 @@ class Transformer(base_converter.ConverterInterface):
op_def.type = MaceKeyword.mace_image_to_buffer
op_def.input.extend([output_node.name])
op_def.output.extend([output_name])
output_shape = op_def.output_shape.add()
output_shape.dims.extend(output_node.shape)
if output_node.shape:
output_shape = op_def.output_shape.add()
output_shape.dims.extend(output_node.shape)
arg = op_def.arg.add()
arg.name = MaceKeyword.mace_buffer_type
......
......@@ -73,8 +73,6 @@ def encrypt_opencl_codegen(cl_kernel_dir, output_path):
with open(output_path, "w") as w_file:
w_file.write(cpp_cl_encrypted_kernel)
print("Generate encrypted opencl source done!")
def parse_args():
"""Parses command line arguments."""
......
......@@ -25,7 +25,7 @@
namespace mace {
{% if model_type == 'source' %}
{% if model_type == 'code' %}
{% for tag in model_tags %}
namespace {{tag}} {
......
......@@ -25,7 +25,6 @@ def gen_mace_engine_factory(model_tags, template_dir, model_type, output_dir):
j2_env = Environment(
loader=FileSystemLoader(template_dir), trim_blocks=True)
# generate mace_run BUILD file
print model_tags
template_name = 'mace_engine_factory.h.jinja2'
source = j2_env.get_template(template_name).render(
model_tags=model_tags,
......
......@@ -12,13 +12,26 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import hashlib
import datetime
import os
import uuid
import numpy as np
import hashlib
from enum import Enum
from mace.proto import mace_pb2
from jinja2 import Environment, FileSystemLoader
GENERATED_NAME = set()
GPUDataTypeStrs = [
"fp16_fp32",
"fp32_fp32",
]
GPUDataType = \
Enum('GPUDataType', [(ele, ele) for ele in GPUDataTypeStrs], type=str)
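Because type=str mixes str into the Enum, members compare equal to the plain strings parsed from flags and yaml, which the data_type checks below rely on. A quick standalone check:

# Standalone check: a str-mixin Enum member equals its plain-string value.
from enum import Enum

strs = ["fp16_fp32", "fp32_fp32"]
DataTypeDemo = Enum('DataTypeDemo', [(ele, ele) for ele in strs], type=str)
assert DataTypeDemo.fp16_fp32 == "fp16_fp32"
assert "fp32_fp32" == DataTypeDemo.fp32_fp32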
def generate_obfuscated_name(namespace, name):
md5 = hashlib.md5()
......@@ -104,70 +117,199 @@ def rename_tensor(net_def):
op.output[i] = tensor_map[op.output[i]]
def stringfy(value):
return ', '.join('"{0}"'.format(w) for w in value)
class TensorInfo:
def __init__(self, id, t, runtime, gpu_data_type):
def __init__(self, id, tensor):
self.id = id
self.data_type = mace_pb2.DataType.Name(t.data_type)
if t.data_type == mace_pb2.DT_FLOAT:
if runtime == 'gpu' and gpu_data_type == 'half':
self.data_type = mace_pb2.DT_HALF
self.data = bytearray(
np.array(t.float_data).astype(np.float16).tobytes())
else:
self.data_type = mace_pb2.DT_FLOAT
self.data = bytearray(
np.array(t.float_data).astype(np.float32).tobytes())
elif t.data_type == mace_pb2.DT_INT32:
self.data_type = tensor.data_type
if tensor.data_type == mace_pb2.DT_HALF:
self.data_type = mace_pb2.DT_HALF
self.data = bytearray(
np.array(t.int32_data).astype(np.int32).tobytes())
elif t.data_type == mace_pb2.DT_UINT8:
np.array(tensor.float_data).astype(np.float16).tobytes())
elif tensor.data_type == mace_pb2.DT_FLOAT:
self.data_type = mace_pb2.DT_FLOAT
self.data = bytearray(
np.array(t.int32_data).astype(np.uint8).tolist())
np.array(tensor.float_data).astype(np.float32).tobytes())
elif tensor.data_type == mace_pb2.DT_INT32:
self.data = bytearray(
np.array(tensor.int32_data).astype(np.int32).tobytes())
elif tensor.data_type == mace_pb2.DT_UINT8:
self.data = bytearray(
np.array(tensor.int32_data).astype(np.uint8).tolist())
else:
raise Exception('Tensor data type %s not supported' % t.data_type)
raise Exception('Tensor data type %s not supported' %
tensor.data_type)
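On the GPU fp16_fp32 path, float tensors are serialized as half precision, halving the weight payload; the effect is easy to verify with numpy in isolation:

# Standalone check: float16 serialization is half the size of float32.
import numpy as np

values = np.random.rand(1024)
full = bytearray(values.astype(np.float32).tobytes())
half = bytearray(values.astype(np.float16).tobytes())
assert len(full) == 2 * len(half)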
def get_tensor_info_and_model_data(net_def, runtime, gpu_data_type):
model_data = []
def update_tensor_infos(net_def, runtime, data_type):
offset = 0
counter = 0
tensor_infos = []
for t in net_def.tensors:
tensor_info = TensorInfo(counter, t, runtime, gpu_data_type)
for tensor in net_def.tensors:
# update data_type
if tensor.data_type == mace_pb2.DT_FLOAT and runtime == 'gpu' \
and data_type == GPUDataType.fp16_fp32:
tensor.data_type = mace_pb2.DT_HALF
# Add offset and data_size
tensor_info = TensorInfo(counter, tensor)
tensor_infos.append(tensor_info)
# align
if tensor_info.data_type != 'DT_UINT8' and offset % 4 != 0:
padding = 4 - offset % 4
model_data.extend(bytearray([0] * padding))
offset += padding
if t.data_type == mace_pb2.DT_FLOAT:
t.data_size = len(t.float_data)
elif t.data_type == mace_pb2.DT_INT32:
t.data_size = len(t.int32_data)
elif t.data_type == mace_pb2.DT_UINT8:
t.data_size = len(t.int32_data)
t.offset = offset
if tensor.data_type == mace_pb2.DT_FLOAT \
or tensor.data_type == mace_pb2.DT_HALF:
tensor.data_size = len(tensor.float_data)
elif tensor.data_type == mace_pb2.DT_INT32:
tensor.data_size = len(tensor.int32_data)
elif tensor.data_type == mace_pb2.DT_UINT8:
tensor.data_size = len(tensor.int32_data)
tensor.offset = offset
offset += len(tensor_info.data)
counter += 1
def extract_model_data(net_def):
model_data = []
offset = 0
counter = 0
for tensor in net_def.tensors:
tensor_info = TensorInfo(counter, tensor)
# align
if tensor_info.data_type != mace_pb2.DT_UINT8 and offset % 4 != 0:
padding = 4 - offset % 4
model_data.extend(bytearray([0] * padding))
offset += padding
model_data.extend(tensor_info.data)
offset += len(tensor_info.data)
counter += 1
return model_data
return tensor_infos, model_data
def save_model_data(net_def, model_tag, output_dir):
model_data = extract_model_data(net_def)
# generate tensor data
with open(output_dir + model_tag + '.data', "wb") as f:
f.write(bytearray(model_data))
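The alignment applied in extract_model_data keeps every non-uint8 tensor at a 4-byte offset, so float/int32 reads from the mmap'd data file stay naturally aligned; the rule in isolation:

# Standalone sketch of the 4-byte alignment rule used above.
def align_offset(offset, is_uint8):
    if not is_uint8 and offset % 4 != 0:
        offset += 4 - offset % 4
    return offset

assert align_offset(6, is_uint8=False) == 8
assert align_offset(8, is_uint8=False) == 8
assert align_offset(6, is_uint8=True) == 6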
def del_tensor_data(net_def, runtime, gpu_data_type):
for t in net_def.tensors:
if t.data_type == mace_pb2.DT_FLOAT:
del t.float_data[:]
elif t.data_type == mace_pb2.DT_INT32:
del t.int32_data[:]
elif t.data_type == mace_pb2.DT_UINT8:
del t.int32_data[:]
def save_model_to_proto(net_def, model_tag, output_dir):
for tensor in net_def.tensors:
if tensor.data_type == mace_pb2.DT_FLOAT \
or tensor.data_type == mace_pb2.DT_HALF:
del tensor.float_data[:]
elif tensor.data_type == mace_pb2.DT_INT32:
del tensor.int32_data[:]
elif tensor.data_type == mace_pb2.DT_UINT8:
del tensor.int32_data[:]
proto_file_path = output_dir + model_tag + '.pb'
with open(proto_file_path, "wb") as f:
f.write(net_def.SerializeToString())
with open(proto_file_path + '_txt', "wb") as f:
f.write(str(net_def))
def update_tensor_data_type(net_def, runtime, gpu_data_type):
for t in net_def.tensors:
if t.data_type == mace_pb2.DT_FLOAT and runtime == 'gpu' \
and gpu_data_type == 'half':
t.data_type = mace_pb2.DT_HALF
def save_model_to_code(net_def, model_tag, runtime,
template_dir, output_dir, embed_model_data,
model_checksum, weight_checksum,
obfuscate, winograd_conv):
# Create the jinja2 environment.
j2_env = Environment(
loader=FileSystemLoader(template_dir), trim_blocks=True)
j2_env.filters['stringfy'] = stringfy
# generate tensor source files
template_name = 'tensor_source.jinja2'
counter = 0
for tensor in net_def.tensors:
tensor_info = TensorInfo(counter, tensor)
# convert tensor
source = j2_env.get_template(template_name).render(
tensor_info=tensor_info,
tensor=tensor,
tag=model_tag,
)
with open(output_dir + 'tensor' + str(counter) + '.cc', "wb") as f:
f.write(source)
counter += 1
# generate tensor data
model_data = extract_model_data(net_def)
template_name = 'tensor_data.jinja2'
source = j2_env.get_template(template_name).render(
tag=model_tag,
embed_model_data=embed_model_data,
model_data_size=len(model_data),
model_data=model_data)
with open(output_dir + 'tensor_data' + '.cc', "wb") as f:
f.write(source)
# generate op source files
template_name = 'operator.jinja2'
counter = 0
op_size = len(net_def.op)
for start in range(0, op_size, 10):
source = j2_env.get_template(template_name).render(
start=start,
end=min(start + 10, op_size),
net=net_def,
tag=model_tag,
runtime=runtime,
)
with open(output_dir + 'op' + str(counter) + '.cc', "wb") as f:
f.write(source)
counter += 1
# generate model source files
build_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
template_name = 'model.jinja2'
checksum = model_checksum
if weight_checksum is not None:
checksum = "{},{}".format(model_checksum, weight_checksum)
source = j2_env.get_template(template_name).render(
net=net_def,
tag=model_tag,
runtime=runtime,
obfuscate=obfuscate,
embed_model_data=embed_model_data,
winograd_conv=winograd_conv,
checksum=checksum,
build_time=build_time)
with open(output_dir + 'model.cc', "wb") as f:
f.write(source)
# generate model header file
template_name = 'model_header.jinja2'
source = j2_env.get_template(template_name).render(tag=model_tag, )
with open(output_dir + model_tag + '.h', "wb") as f:
f.write(source)
def save_model(net_def, model_checksum, weight_checksum, template_dir,
obfuscate, model_tag, output_dir, runtime, embed_model_data,
winograd_conv, data_type, model_build_type):
if obfuscate:
obfuscate_name(net_def)
else:
rename_tensor(net_def)
output_dir = output_dir + '/'
# update tensor type
update_tensor_infos(net_def, runtime, data_type)
if model_build_type == 'proto' or not embed_model_data:
save_model_data(net_def, model_tag, output_dir)
if model_build_type == 'proto':
save_model_to_proto(net_def, model_tag, output_dir)
else:
save_model_to_code(net_def, model_tag, runtime,
template_dir, output_dir, embed_model_data,
model_checksum, weight_checksum,
obfuscate, winograd_conv)
......@@ -94,10 +94,11 @@ void CreateOperator{{i}}(mace::OperatorDef *op) {
{% endfor %}
{% if net.op[i].output_shape|length > 0 %}
op->mutable_output_shape()->Reserve({{ net.op[i].output_shape|length }});
mace::OutputShape * output_shape = nullptr;
{% for shape in net.op[i].output_shape %}
{% if shape.dims|length > 0 %}
{% if shape.dims|length > 0 %}
output_shape = op->add_output_shape();
output_shape->mutable_dims()->Reserve({{ shape.dims|length }});
......@@ -105,8 +106,9 @@ void CreateOperator{{i}}(mace::OperatorDef *op) {
output_shape->add_dims({{ dim }});
{% endfor %}
{% endif %}
{% endif %}
{% endfor %}
{% endif %}
std::vector<int> output_types_int({ {{ net.op[i].output_type | join(', ') }} });
std::vector<mace::DataType> output_types({{ net.op[i].output_type | length }});
......
# Copyright 2018 Xiaomi, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import datetime
import os
from mace.proto import mace_pb2
from jinja2 import Environment, FileSystemLoader
def stringfy(value):
return ', '.join('"{0}"'.format(w) for w in value)
def convert_to_source(net_def, model_checksum, weight_checksum, template_dir,
obfuscate, model_tag, output, runtime, embed_model_data,
winograd_conv, model_load_type, tensor_infos,
model_data):
# Capture our current directory
print template_dir
# Create the jinja2 environment.
j2_env = Environment(
loader=FileSystemLoader(template_dir), trim_blocks=True)
j2_env.filters['stringfy'] = stringfy
output_dir = os.path.dirname(output) + '/'
# generate tensor source files
template_name = 'tensor_source.jinja2'
for i in range(len(net_def.tensors)):
if model_load_type == 'source':
source = j2_env.get_template(template_name).render(
tensor_info=tensor_infos[i],
tensor=net_def.tensors[i],
tag=model_tag,
)
with open(output_dir + 'tensor' + str(i) + '.cc', "wb") as f:
f.write(source)
if model_load_type == 'source':
# generate tensor data
template_name = 'tensor_data.jinja2'
source = j2_env.get_template(template_name).render(
tag=model_tag,
embed_model_data=embed_model_data,
model_data_size=len(model_data),
model_data=model_data)
with open(output_dir + 'tensor_data' + '.cc', "wb") as f:
f.write(source)
# generate op source files
template_name = 'operator.jinja2'
counter = 0
op_size = len(net_def.op)
for start in range(0, op_size, 10):
source = j2_env.get_template(template_name).render(
start=start,
end=min(start + 10, op_size),
net=net_def,
tag=model_tag,
runtime=runtime,
)
with open(output_dir + 'op' + str(counter) + '.cc', "wb") as f:
f.write(source)
counter += 1
# generate model source files
build_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
template_name = 'model.jinja2'
checksum = model_checksum
if weight_checksum is not None:
checksum = "{},{}".format(model_checksum, weight_checksum)
source = j2_env.get_template(template_name).render(
net=net_def,
tag=model_tag,
runtime=runtime,
obfuscate=obfuscate,
embed_model_data=embed_model_data,
winograd_conv=winograd_conv,
checksum=checksum,
build_time=build_time)
with open(output, "wb") as f:
f.write(source)
# generate model header file
template_name = 'model_header.jinja2'
source = j2_env.get_template(template_name).render(tag=model_tag, )
with open(output_dir + model_tag + '.h', "wb") as f:
f.write(source)
......@@ -31,28 +31,6 @@ cc_library(
copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"],
deps = [
"//mace/public",
],
)
cc_library(
name = "utils_dev",
srcs = [
"tuner_development.cc",
],
copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"],
deps = [
":utils",
],
)
cc_library(
name = "utils_prod",
srcs = [
"tuner_production.cc",
],
copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"],
deps = [
":utils",
"//mace/codegen:generated_tuning_params",
],
)
......@@ -70,7 +48,7 @@ cc_test(
]),
linkstatic = 1,
deps = [
":utils_dev",
":utils",
"@gtest//:gtest",
"@gtest//:gtest_main",
],
......
......@@ -29,10 +29,6 @@
namespace mace {
extern bool GetTuningParams(
const char *path,
std::unordered_map<std::string, std::vector<unsigned int>> *param_table);
template <typename param_type>
class Tuner {
public:
......@@ -74,9 +70,6 @@ class Tuner {
: "");
return func(param_table_[obfucated_param_key], nullptr, nullptr);
} else {
#ifndef MACE_DISABLE_NO_TUNING_WARNING
LOG(WARNING) << "Fallback to default parameter: " << param_key;
#endif
return func(default_param, nullptr, nullptr);
}
}
......@@ -124,9 +117,16 @@ class Tuner {
}
inline void ReadRunParamters() {
bool success = GetTuningParams(path_, &param_table_);
if (!success) {
LOG(WARNING) << "Get run parameter failed.";
extern const std::map<std::string, std::vector<unsigned int>>
kTuningParamsData;
if (!kTuningParamsData.empty()) {
for (auto it = kTuningParamsData.begin(); it != kTuningParamsData.end();
++it) {
param_table_.emplace(it->first, std::vector<unsigned int>(
it->second.begin(), it->second.end()));
}
} else {
LOG(INFO) << "There are no tuned parameters.";
}
}
......
# Partially borrowed from tensorflow tools/bazel.rc
# By default, we don't distinguish between target and host platforms.
# When cross compiling, use --config=cross_compile to distinguish them.
build --distinct_host_configuration=false
build:cross_compile --distinct_host_configuration=true
build --verbose_failures
build --copt=-std=c++11
build --copt=-D_GLIBCXX_USE_C99_MATH_TR1
build --copt=-DMACE_OBFUSCATE_LITERALS
# Usage example: bazel build --config android
build:android --crosstool_top=//external:android/crosstool
build:android --host_crosstool_top=@bazel_tools//tools/cpp:toolchain
build:android --config=cross_compile
# Usage example: bazel build --config optimization
build:optimization -c opt
build:optimization --copt=-O3
build:optimization --strip=always
# Address sanitizer
build:asan --strip=never
build:asan --copt -fsanitize=address
build:asan --copt -D_FORTIFY_SOURCE
build:asan --copt -DADDRESS_SANITIZER
build:asan --copt -O0
build:asan --copt -g
build:asan --copt -fno-omit-frame-pointer
build:asan --linkopt -fsanitize=address
# Thread sanitizer
build:tsan --strip=never
build:tsan --copt -fsanitize=thread
build:tsan --copt -DTHREAD_SANITIZER
build:tsan --copt -DDYNAMIC_ANNOTATIONS_ENABLED=1
build:tsan --copt -DDYNAMIC_ANNOTATIONS_EXTERNAL_IMPL=1
build:tsan --copt -O0
build:tsan --copt -fno-omit-frame-pointer
build:tsan --linkopt -fsanitize=thread
# Memory sanitizer
build:msan --strip=never
build:msan --copt -fsanitize=memory
build:msan --copt -DADDRESS_SANITIZER
build:msan --copt -O0
build:msan --copt -fno-omit-frame-pointer
build:msan --linkopt -fsanitize=memory
# Undefined Behavior Sanitizer
build:ubsan --strip=never
build:ubsan --copt -fsanitize=undefined
build:ubsan --copt -O0
build:ubsan --copt -fno-omit-frame-pointer
build:ubsan --linkopt -fsanitize=undefined
build:ubsan --linkopt -lubsan
......@@ -95,21 +95,6 @@ def parse_args():
type=str2bool,
default=False,
help="Whether to run the target")
parser.add_argument(
"--valgrind",
type=bool,
default=False,
help="Whether to use valgrind to check memory error.")
parser.add_argument(
"--valgrind_path",
type=str,
default="/data/local/tmp/valgrind",
help="Valgrind install path.")
parser.add_argument(
"--valgrind_args",
type=str,
default="",
help="Valgrind command args.")
parser.add_argument("--args", type=str, default="", help="Command args")
parser.add_argument(
"--stdout_processor",
......@@ -121,6 +106,10 @@ def parse_args():
type=str2bool,
default=True,
help="Whether to use neon optimization")
parser.add_argument(
'--address_sanitizer',
action="store_true",
help="Whether to enable AddressSanitizer")
return parser.parse_known_args()
......@@ -145,16 +134,17 @@ def main(unused_args):
sh_commands.gen_encrypted_opencl_source()
sh_commands.gen_compiled_opencl_source()
sh_commands.gen_mace_version()
sh_commands.gen_tuning_param_code([])
strip = "always"
debug = False
if FLAGS.valgrind:
if FLAGS.address_sanitizer:
strip = "never"
debug = True
for target_abi in target_abis:
sh_commands.bazel_build(target, strip=strip, abi=target_abi,
disable_no_tuning_warning=True, debug=debug,
enable_neon=FLAGS.enable_neon)
sh_commands.bazel_build(target, abi=target_abi,
enable_neon=FLAGS.enable_neon,
address_sanitizer=FLAGS.address_sanitizer)
if FLAGS.run_target:
for serialno in target_devices:
if target_abi not in set(
......@@ -162,28 +152,17 @@ def main(unused_args):
print("Skip device %s which does not support ABI %s" %
(serialno, target_abi))
continue
if FLAGS.valgrind:
stdouts = sh_commands.adb_run_valgrind(
serialno,
host_bin_path,
bin_name,
valgrind_path=FLAGS.valgrind_path,
valgrind_args=FLAGS.valgrind_args,
args=FLAGS.args,
opencl_profiling=1,
vlog_level=0,
device_bin_path="/data/local/tmp/mace",
out_of_range_check=1)
else:
stdouts = sh_commands.adb_run(
serialno,
host_bin_path,
bin_name,
args=FLAGS.args,
opencl_profiling=1,
vlog_level=0,
device_bin_path="/data/local/tmp/mace",
out_of_range_check=1)
stdouts = sh_commands.adb_run(
target_abi,
serialno,
host_bin_path,
bin_name,
args=FLAGS.args,
opencl_profiling=1,
vlog_level=0,
device_bin_path="/data/local/tmp/mace",
out_of_range_check=1,
address_sanitizer=FLAGS.address_sanitizer)
device_properties = sh_commands.adb_getprop_by_serialno(
serialno)
globals()[FLAGS.stdout_processor](stdouts, device_properties,
......
......@@ -13,23 +13,99 @@
# limitations under the License.
import enum
import logging
import re
################################
# log
################################
def init_logging():
logger = logging.getLogger('MACE')
logger.setLevel(logging.INFO)
class CMDColors:
PURPLE = '\033[95m'
BLUE = '\033[94m'
GREEN = '\033[92m'
YELLOW = '\033[93m'
RED = '\033[91m'
ENDC = '\033[0m'
BOLD = '\033[1m'
UNDERLINE = '\033[4m'
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
formatter = logging.Formatter(
'%(asctime)s [%(name)s] [%(levelname)s]: %(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)
class MaceLogger:
@staticmethod
def header(message):
print CMDColors.PURPLE + message + CMDColors.ENDC
@staticmethod
def summary(message):
print CMDColors.GREEN + message + CMDColors.ENDC
@staticmethod
def info(message):
print message
@staticmethod
def warning(message):
print CMDColors.YELLOW + 'WARNING: ' + message + CMDColors.ENDC
@staticmethod
def error(module, message):
print CMDColors.RED + 'ERROR: [' + module + '] '\
+ message + CMDColors.ENDC
exit(1)
def mace_check(condition, module, message):
if not condition:
MaceLogger.error(module, message)
################################
# String Formatter
################################
class StringFormatter:
@staticmethod
def table(header, data, title, align="R"):
data_size = len(data)
column_size = len(header)
column_length = [len(str(ele)) + 1 for ele in header]
for row_idx in range(data_size):
data_tuple = data[row_idx]
ele_size = len(data_tuple)
assert(ele_size == column_size)
for i in range(ele_size):
column_length[i] = max(column_length[i],
len(str(data_tuple[i])) + 1)
table_column_length = sum(column_length) + column_size + 1
dash_line = '-' * table_column_length + '\n'
header_line = '=' * table_column_length + '\n'
output = ""
output += dash_line
output += str(title).center(table_column_length) + '\n'
output += dash_line
output += '|' + '|'.join([str(header[i]).center(column_length[i])
for i in range(column_size)]) + '|\n'
output += header_line
for data_tuple in data:
ele_size = len(data_tuple)
row_list = []
for i in range(ele_size):
if align == "R":
row_list.append(str(data_tuple[i]).rjust(column_length[i]))
elif align == "L":
row_list.append(str(data_tuple[i]).ljust(column_length[i]))
elif align == "C":
row_list.append(str(data_tuple[i])
.center(column_length[i]))
output += '|' + '|'.join(row_list) + "|\n" + dash_line
return output
@staticmethod
def block(message):
line_length = 10 + len(str(message)) + 10
star_line = '*' * line_length + '\n'
return star_line + str(message).center(line_length) + '\n' + star_line
################################
......
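Assuming common.py is importable, the table formatter can be exercised directly; the header, data and title below are made up:

# Standalone usage sketch for StringFormatter.table.
from common import StringFormatter

header = ["op", "time(ms)"]
data = [("Conv2D", 12.3), ("Softmax", 0.4)]
print(StringFormatter.table(header, data, title="Run Stats", align="R"))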
# example.yaml
# Each yaml file describes an exported library (could be named [target_abi]/libmace-${filename}.a),
# which can contain more than one model
# target_soc can be obtained with `adb shell getprop | grep ro.board.platform | cut -d [ -f3 | cut -d ] -f1`
target_abis: [armeabi-v7a, arm64-v8a]
target_socs: [MSM8953]
embed_model_data: 1
models:
preview_net:
platform: tensorflow
model_file_path: path/to/model64.pb # also support http:// and https://
model_sha256_checksum: 05d92625809dc9edd6484882335c48c043397aed450a168d75eb8b538e86881a
input_nodes: input_node
output_nodes: output_node
input_shapes: 1,64,64,3
output_shapes: 1,64,64,2
runtime: gpu
limit_opencl_kernel_time: 0
dsp_mode: 0
obfuscate: 1
fast_conv: 0
validation_inputs_data:
- path/to/input_files
capture_net:
platform: caffe
model_file_path: path/to/model.prototxt
weight_file_path: path/to/weight.caffemodel
model_sha256_checksum: 05d92625809dc9edd6484882335c48c043397aed450a168d75eb8b538e86881a
weight_sha256_checksum: 05d92625809dc9edd6484882335c48c043397aed450a168d75eb8b538e86881a
input_nodes:
- input_node0
- input_node1
output_nodes:
- output_node0
- output_node1
input_shapes:
- 1,256,256,3
- 1,128,128,3
output_shapes:
- 1,256,256,2
- 1,1,1,2
runtime: cpu
limit_opencl_kernel_time: 1
dsp_mode: 0
obfuscate: 1
fast_conv: 0
......@@ -36,6 +36,8 @@ import common
# --input_shape 1,64,64,3 \
# --output_shape 1,64,64,2
VALIDATION_MODULE = 'VALIDATION'
def load_data(file):
if os.path.isfile(file):
......@@ -51,18 +53,21 @@ def compare_output(platform, device_type, output_name, mace_out_value,
mace_out_value = mace_out_value.reshape(-1)
assert len(out_value) == len(mace_out_value)
similarity = (1 - spatial.distance.cosine(out_value, mace_out_value))
print output_name, 'MACE VS', platform.upper(
), 'similarity: ', similarity
common.MaceLogger.summary(
output_name + ' MACE VS ' + platform.upper()
+ ' similarity: ' + str(similarity))
if (device_type == "CPU" and similarity > 0.999) or \
(device_type == "GPU" and similarity > 0.995) or \
(device_type == "HEXAGON" and similarity > 0.930):
print '===================Similarity Test Passed=================='
common.MaceLogger.summary(
common.StringFormatter.block("Similarity Test Passed"))
else:
print '===================Similarity Test Failed=================='
sys.exit(-1)
common.MaceLogger.error(
"", common.StringFormatter.block("Similarity Test Failed"))
else:
print '=======================Skip empty node==================='
sys.exit(-1)
common.MaceLogger.error(
"", common.StringFormatter.block(
"Similarity Test failed because of empty output"))
def normalize_tf_tensor_name(name):
......@@ -76,8 +81,9 @@ def validate_tf_model(platform, device_type, model_file, input_file,
mace_out_file, input_names, input_shapes, output_names):
import tensorflow as tf
if not os.path.isfile(model_file):
print("Input graph file '" + model_file + "' does not exist!")
sys.exit(-1)
common.MaceLogger.error(
VALIDATION_MODULE,
"Input graph file '" + model_file + "' does not exist!")
tf.reset_default_graph()
input_graph_def = tf.GraphDef()
......@@ -118,11 +124,13 @@ def validate_caffe_model(platform, device_type, model_file, input_file,
os.environ['GLOG_minloglevel'] = '1' # suprress Caffe verbose prints
import caffe
if not os.path.isfile(model_file):
print("Input graph file '" + model_file + "' does not exist!")
sys.exit(-1)
common.MaceLogger.error(
VALIDATION_MODULE,
"Input graph file '" + model_file + "' does not exist!")
if not os.path.isfile(weight_file):
print("Input weight file '" + weight_file + "' does not exist!")
sys.exit(-1)
common.MaceLogger.error(
VALIDATION_MODULE,
"Input weight file '" + weight_file + "' does not exist!")
caffe.set_mode_cpu()
......
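For reference, the pass criterion in compare_output reduces to a per-device cosine-similarity threshold; a stripped-down standalone version:

# Standalone sketch of the similarity check used in compare_output.
import numpy as np
from scipy import spatial

def passes(framework_out, mace_out, device_type):
    similarity = 1 - spatial.distance.cosine(framework_out.reshape(-1),
                                             mace_out.reshape(-1))
    threshold = {"CPU": 0.999, "GPU": 0.995, "HEXAGON": 0.930}[device_type]
    return similarity > threshold

out = np.random.rand(1, 64, 64, 2)
print(passes(out, out + 1e-5, "GPU"))  # a tiny perturbation should still pass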