From 51b14100dc00002c7eb5386a0589f36d87929c66 Mon Sep 17 00:00:00 2001 From: liuqi Date: Fri, 30 Nov 2018 14:13:41 +0800 Subject: [PATCH] feature: support arm linux device 1. Abstact android and arm linux to one format 2. Support cross compilation for ARM linux 3. Related issue #36 --- .gitlab-ci.yml | 50 +- docs/installation/env_requirement.rst | 2 +- docs/user_guide/advanced_usage.rst | 65 +- docs/user_guide/basic_usage.rst | 8 +- mace/BUILD | 18 + mace/examples/cli/BUILD | 9 +- mace/mace.bzl | 13 +- mace/ops/depthwise_conv2d_test.cc | 2 +- mace/ops/local_response_norm.cc | 1 + mace/ops/resize_bicubic.cc | 1 + mace/ops/scalar_math.cc | 1 + mace/ops/softmax.cc | 3 +- mace/ops/strided_slice.cc | 1 + mace/ops/transpose.cc | 3 +- .../converter_tool/tensorflow_converter.py | 2 + mace/python/tools/memory_optimizer.py | 350 ++++++ mace/python/tools/model_saver.py | 5 +- mace/utils/quantize.h | 3 +- tools/bazel.rc | 43 +- tools/bazel_adb_run.py | 40 +- tools/build-standalone-lib.sh | 24 + tools/common.py | 339 ++++++ tools/converter.py | 784 +++---------- tools/device.py | 1004 +++++++++++++++++ tools/image/image_to_tensor.py | 5 +- tools/image/tensor_to_image.py | 5 +- tools/sh_commands.py | 262 +---- 27 files changed, 2080 insertions(+), 963 deletions(-) create mode 100644 mace/python/tools/memory_optimizer.py create mode 100644 tools/device.py diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 7ab11760..5e3a22c5 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -47,8 +47,13 @@ ops_test: stage: ops_test script: - if [ -z "$TARGET_SOCS" ]; then TARGET_SOCS=random; fi - - python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS - - python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS --enable_neon=false + - > 
+ if ping -c 1 v9.git.n.xiaomi.com 1>/dev/null 2>&1; then + GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git + DEVICE_CONF_FILE=generic-mobile-devices/devices.yml + fi + - python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64,armhf --target_socs=$TARGET_SOCS + - python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64,armhf --target_socs=$TARGET_SOCS --enable_neon=false api_test: stage: api_test @@ -68,14 +73,19 @@ extra_tests: stage: extra_tests script: - if [ -z "$TARGET_SOCS" ]; then TARGET_SOCS=random; fi - - python tools/bazel_adb_run.py --target="//mace/utils:tuner_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS + - > + if ping -c 1 v9.git.n.xiaomi.com 1>/dev/null 2>&1; then + GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git + DEVICE_CONF_FILE=generic-mobile-devices/devices.yml + fi + - python tools/bazel_adb_run.py --target="//mace/utils:tuner_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64 --target_socs=$TARGET_SOCS platform_compatible_tests: stage: platform_compatible_tests script: - bazel build mace/core:core --define openmp=true - - bazel build --config arm_linux --define openmp=true --define opencl=true --define neon=true //mace/libmace:libmace.so - - bazel build --config aarch64_linux --define openmp=true --define opencl=true --define neon=true //mace/libmace:libmace.so + - bazel 
build --config arm_linux_gnueabihf --define openmp=true --define opencl=true --define neon=true //mace/libmace:libmace.so + - bazel build --config aarch64_linux_gnu --define openmp=true --define opencl=true --define neon=true //mace/libmace:libmace.so build_libraries: stage: build_libraries @@ -87,6 +97,11 @@ ndk_versions_compatible_tests: script: - DEFAULT_NDK_PATH=$ANDROID_NDK_HOME - prefix_path=${DEFAULT_NDK_PATH%android-ndk-*} + - > + if ping -c 1 v9.git.n.xiaomi.com 1>/dev/null 2>&1; then + GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git + DEVICE_CONF_FILE=generic-mobile-devices/devices.yml + fi - > for ndk in android-ndk-r12b android-ndk-r15c android-ndk-r16 android-ndk-r17b; do @@ -96,8 +111,8 @@ ndk_versions_compatible_tests: export PATH=$ANDROID_NDK_HOME:$PATH; echo "ndk path: $ANDROID_NDK_HOME"; if [ -z "$TARGET_SOCS" ]; then TARGET_SOCS=random; fi - python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS --args="--gtest_filter=ActivationOpTest*"; - python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS --enable_neon=false --args="--gtest_filter=ActivationOpTest*"; + python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64 --target_socs=$TARGET_SOCS --args="--gtest_filter=ActivationOpTest*"; + python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64 --target_socs=$TARGET_SOCS --enable_neon=false 
--args="--gtest_filter=ActivationOpTest*"; fi done - export ANDROID_NDK_HOME=$DEFAULT_NDK_PATH @@ -111,16 +126,27 @@ python_tools_tests: - GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@github.com:XiaoMi/mace-models.git - CONF_FILE=mace-models/mobilenet-v2/mobilenet-v2.yml - > - python tools/converter.py convert --config=${CONF_FILE} --target_abis=armeabi-v7a --model_graph_format=file --model_data_format=file || exit 1; - python tools/converter.py run --config=${CONF_FILE} --round=1 --target_abis=armeabi-v7a --validate --model_graph_format=file --model_data_format=file || exit 1; - python tools/converter.py run --config=${CONF_FILE} --example --target_abis=armeabi-v7a --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1; + if ping -c 1 v9.git.n.xiaomi.com 1>/dev/null 2>&1; then + GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git + DEVICE_CONF_FILE=generic-mobile-devices/devices.yml + fi + - > + python tools/converter.py convert --config=${CONF_FILE} --target_abis=armeabi-v7a,arm64 --model_graph_format=file --model_data_format=file || exit 1; + python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a,arm64 --validate --model_graph_format=file --model_data_format=file || exit 1; + python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --target_abis=armeabi-v7a,arm64 --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1; model_tests: stage: model_tests script: - pwd - rm -rf mace-models + - rm -rf generic-mobile-devices - GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@github.com:XiaoMi/mace-models.git + - > + if ping -c 1 v9.git.n.xiaomi.com 1>/dev/null 2>&1; then + GIT_SSH_COMMAND="ssh -o 
UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git + DEVICE_CONF_FILE=generic-mobile-devices/devices.yml + fi - > for CONF_FILE in mace-models/mobilenet-v1/mobilenet-v1.yml mace-models/mobilenet-v1/mobilenet-v1-quantize-retrain.yml; do @@ -131,8 +157,8 @@ model_tests: - CONF_FILE=mace-models/mobilenet-v2/mobilenet-v2-host.yml - > python tools/converter.py convert --config=${CONF_FILE} --model_graph_format=file --model_data_format=file || exit 1; - python tools/converter.py run --config=${CONF_FILE} --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1; - python tools/converter.py run --config=${CONF_FILE} --example --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1; + python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1; + python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1; - rm -rf mace-models build_android_demo: diff --git a/docs/installation/env_requirement.rst b/docs/installation/env_requirement.rst index 17b564ea..12af0e61 100644 --- a/docs/installation/env_requirement.rst +++ b/docs/installation/env_requirement.rst @@ -35,7 +35,7 @@ Required dependencies - Required by model validation * - six - pip install -I six==1.11.0 - - Required for Python 2 and 3 compatibility (TODO) + - Required for Python 2 and 3 compatibility Optional dependencies --------------------- diff --git a/docs/user_guide/advanced_usage.rst b/docs/user_guide/advanced_usage.rst index 1c32b799..44d8b788 100644 --- a/docs/user_guide/advanced_usage.rst +++ b/docs/user_guide/advanced_usage.rst @@ -109,13 +109,75 @@ in one deployment file. 
sha256sum /path/to/your/file + Advanced usage -------------- -There are two common advanced use cases: +There are three common advanced use cases: + - run your model on the embedded device - converting model to C++ code. - tuning GPU kernels for a specific SoC. +Run you model on the embedded device +------------------ + +MACE use ssh to connect embedded device, in this case we recommend you to push ``$HOME/.ssh/id_rsa.pub`` +to your device ``$HOME/.ssh/authorized_keys`` + +.. code:: bash + + cat ~/.ssh/id_rsa.pub | ssh -q {user}@{ip} "cat >> ~/.ssh/authorized_keys" + +This part will show you how to write your own device yaml config file. + +**Device yaml config file** + +The way to run your model on the embedded device is nearly the same as run on android, except you need give a device yaml config file. + +MACE get this yaml config via ``--device_yml`` argument, default config value is ``devices.yml`` +, when the yaml config file is not found. we treat as there is no available arm linux device, give a message +and continue on other device such as plugged android phone. + +* **Example** + + Here is an device yaml config demo. + + .. literalinclude:: devices/demo_device_nanopi.yml + :language: yaml + +* **Configuration** + +.. list-table:: + :header-rows: 1 + + * - Options + - Usage + * - target_abis + - Device supported abis, you can get it via ``dpkg --print-architecture`` and + ``dpkg --print-foreign-architectures`` command, if more than one abi is supported, + separate them by commas. + * - target_socs + - device soc, you can get it from device manual, we haven't found a way to get it in shell. + * - models + - device models full name, you can get via get ``lshw`` command (third party package, install it via your package manager). + see it's product value. + * - address + - Since we use ssh to connect device, ip address is required. + * - username + - login username, required. 
+ * - password + - login password, optional when you can login into device without password + + +.. note:: + + Some command tools: + + .. code:: bash + + # specify device yaml config file via --device_yml argument or put the file under working directory + python tools/converter.py run --config=/path/to/mace-models/mobilenet-v2/mobilenet-v2.yml --device_yml=/path/to/devices.yml + Convert model(s) to C++ code -------------------------------- @@ -403,6 +465,7 @@ Reduce Library Size - It is recommended to use ``version script`` and ``strip`` feature when linking mace static library. The effect is remarkable. * Remove the unused ops. + Remove the registration of the ops unused for your models in the ``mace/ops/ops_register.cc``, which will reduce the library size significantly. the final binary just link the registered ops' code. diff --git a/docs/user_guide/basic_usage.rst b/docs/user_guide/basic_usage.rst index 15a4d516..63b8968b 100644 --- a/docs/user_guide/basic_usage.rst +++ b/docs/user_guide/basic_usage.rst @@ -68,7 +68,8 @@ Here we use the mobilenet-v2 model as an example. .. note:: - If you want to run on device/phone, please plug in at least one device/phone. + If you want to run on phone, please plug in at least one phone. + Or if you want to run on embedded device, please give a :doc:`advanced_usage`. .. code:: sh @@ -245,7 +246,10 @@ to run and validate your model. 
# Test model run time python tools/converter.py run --config=/path/to/your/model_deployment_file.yml --round=100 - # Validate the correctness by comparing the results against the + # If you want to run model on specified arm linux device, you should put device config file in the working directory or run with flag `--device_yml` + python tools/converter.py run --config=/path/to/mace-models/mobilenet-v2/mobilenet-v2.yml --device_yml=/path/to/devices.yml --example + + # Validate the correctness by comparing the results against the # original model and framework, measured with cosine distance for similarity. python tools/converter.py run --config=/path/to/your/model_deployment_file.yml --validate diff --git a/mace/BUILD b/mace/BUILD index cf2e1e2d..4b7da51f 100644 --- a/mace/BUILD +++ b/mace/BUILD @@ -24,6 +24,24 @@ config_setting( visibility = ["//visibility:public"], ) +config_setting( + name = "arm_linux_aarch64", + values = { + "crosstool_top": "//tools/aarch64_compiler:toolchain", + "cpu": "aarch64", + }, + visibility = ["//visibility:public"], +) + +config_setting( + name = "arm_linux_armhf", + values = { + "crosstool_top": "//tools/arm_compiler:toolchain", + "cpu": "armeabi-v7a", + }, + visibility = ["//visibility:public"], +) + config_setting( name = "neon_enabled", define_values = { diff --git a/mace/examples/cli/BUILD b/mace/examples/cli/BUILD index be0de253..b2c2291c 100644 --- a/mace/examples/cli/BUILD +++ b/mace/examples/cli/BUILD @@ -1,10 +1,10 @@ # Examples load( "//mace:mace.bzl", - "if_openmp_enabled", "if_android", "if_hexagon_enabled", "if_opencl_enabled", + "if_openmp_enabled", ) cc_binary( @@ -18,8 +18,9 @@ cc_binary( ]), linkopts = [ "-lm", + "-ldl", ] + if_openmp_enabled([ - "-fopenmp" + "-fopenmp", ]) + if_android([ "-ldl", "-pie", @@ -47,6 +48,7 @@ cc_binary( ]), linkopts = [ "-lm", + "-ldl", ] + if_android([ "-ldl", "-pie", @@ -55,8 +57,7 @@ cc_binary( linkstatic = 0, deps = [ "//external:gflags_nothreads", - 
"//mace/codegen:generated_mace_engine_factory", "//mace/codegen:generated_libmace", + "//mace/codegen:generated_mace_engine_factory", ], ) - diff --git a/mace/mace.bzl b/mace/mace.bzl index a7a6bc89..0215a086 100644 --- a/mace/mace.bzl +++ b/mace/mace.bzl @@ -24,6 +24,18 @@ def if_android_arm64(a): "//conditions:default": [], }) +def if_arm_linux_aarch64(a): + return select({ + "//mace:arm_linux_aarch64": a, + "//conditions:default": [], + }) + +def if_arm_linux_armhf(a): + return select({ + "//mace:arm_linux_armhf": a, + "//conditions:default": [] + }) + def if_neon_enabled(a): return select({ "//mace:neon_enabled": a, @@ -81,4 +93,3 @@ def encrypt_opencl_kernel_genrule(): outs = ["opencl/encrypt_opencl_kernel.cc"], cmd = "cat $(SRCS) > $@;" ) - diff --git a/mace/ops/depthwise_conv2d_test.cc b/mace/ops/depthwise_conv2d_test.cc index d757bf09..d9965658 100644 --- a/mace/ops/depthwise_conv2d_test.cc +++ b/mace/ops/depthwise_conv2d_test.cc @@ -233,7 +233,7 @@ void TestNxNS12(const index_t height, const index_t width) { auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, Padding type) { // generate random input - static unsigned int seed = time(NULL); + // static unsigned int seed = time(NULL); index_t batch = 1; index_t channel = 32; index_t multiplier = 1; diff --git a/mace/ops/local_response_norm.cc b/mace/ops/local_response_norm.cc index fb0cda7c..ff5dd32b 100644 --- a/mace/ops/local_response_norm.cc +++ b/mace/ops/local_response_norm.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include +#include #include "mace/core/operator.h" diff --git a/mace/ops/resize_bicubic.cc b/mace/ops/resize_bicubic.cc index 3ccff3e6..6646afd0 100644 --- a/mace/ops/resize_bicubic.cc +++ b/mace/ops/resize_bicubic.cc @@ -15,6 +15,7 @@ #include "mace/ops/resize_bicubic.h" #include +#include #include #include diff --git a/mace/ops/scalar_math.cc b/mace/ops/scalar_math.cc index 297dcb33..a0d52192 100644 --- a/mace/ops/scalar_math.cc +++ b/mace/ops/scalar_math.cc @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include #include "mace/core/operator.h" diff --git a/mace/ops/softmax.cc b/mace/ops/softmax.cc index 2518b407..c4bef3d9 100644 --- a/mace/ops/softmax.cc +++ b/mace/ops/softmax.cc @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include #include #include @@ -106,7 +107,7 @@ class SoftmaxOp : public Operation { float sum = 0; for (index_t c = 0; c < class_count; ++c) { - float exp_value = ::exp(input_ptr[c] - max_val); + float exp_value = std::exp(input_ptr[c] - max_val); sum += exp_value; output_ptr[c] = exp_value; } diff --git a/mace/ops/strided_slice.cc b/mace/ops/strided_slice.cc index b3b53ec8..89860b79 100644 --- a/mace/ops/strided_slice.cc +++ b/mace/ops/strided_slice.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include +#include #include #include "mace/core/operator.h" diff --git a/mace/ops/transpose.cc b/mace/ops/transpose.cc index 7c25ea4f..7588b8c3 100644 --- a/mace/ops/transpose.cc +++ b/mace/ops/transpose.cc @@ -16,8 +16,9 @@ #include #endif -#include #include +#include +#include #include "mace/core/operator.h" #include "mace/ops/transpose.h" diff --git a/mace/python/tools/converter_tool/tensorflow_converter.py b/mace/python/tools/converter_tool/tensorflow_converter.py index 68e5ccb5..4b48ab9d 100644 --- a/mace/python/tools/converter_tool/tensorflow_converter.py +++ b/mace/python/tools/converter_tool/tensorflow_converter.py @@ -112,6 +112,8 @@ TFSupportedOps = [ TFOpType = Enum('TFOpType', [(op, op) for op in TFSupportedOps], type=str) +TFSupportedOps = [six.b(op) for op in TFSupportedOps] + class TensorflowConverter(base_converter.ConverterInterface): """A class for convert tensorflow frozen model to mace model. diff --git a/mace/python/tools/memory_optimizer.py b/mace/python/tools/memory_optimizer.py new file mode 100644 index 00000000..5b644779 --- /dev/null +++ b/mace/python/tools/memory_optimizer.py @@ -0,0 +1,350 @@ +# Copyright 2018 Xiaomi, Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import sys +import operator + +import six +from six.moves import reduce + +from mace.proto import mace_pb2 + +from mace.python.tools.converter_tool import base_converter as cvt +from mace.python.tools.converter_tool.base_converter import DeviceType +from mace.python.tools.converter_tool.base_converter import ConverterUtil +from mace.python.tools.converter_tool.base_converter import MaceKeyword +from mace.python.tools.convert_util import calculate_image_shape +from mace.python.tools.convert_util import OpenCLBufferType + + +def MemoryTypeToStr(mem_type): + if mem_type == mace_pb2.CPU_BUFFER: + return 'CPU_BUFFER' + elif mem_type == mace_pb2.GPU_BUFFER: + return 'GPU_BUFFER' + elif mem_type == mace_pb2.GPU_IMAGE: + return 'GPU_IMAGE' + else: + return 'UNKNOWN' + + +class MemoryBlock(object): + def __init__(self, mem_type, block): + self._mem_type = mem_type + self._block = block + + @property + def mem_type(self): + return self._mem_type + + @property + def block(self): + return self._block + + +class MemoryOptimizer(object): + def __init__(self, net_def): + self.net_def = net_def + self.idle_mem = set() + self.op_mem = {} # op_name->mem_id + self.mem_block = {} # mem_id->[size] or mem_id->[x, y] + self.total_mem_count = 0 + self.input_ref_counter = {} + self.mem_ref_counter = {} + ocl_mem_type_arg = ConverterUtil.get_arg( + net_def, MaceKeyword.mace_opencl_mem_type) + self.cl_mem_type = ocl_mem_type_arg.i if ocl_mem_type_arg is not None \ + else None + + consumers = {} + for op in net_def.op: + if not self.op_need_optimize_memory(op): + continue + for ipt in op.input: + if ipt not in consumers: + consumers[ipt] = [] + consumers[ipt].append(op) + # only ref op's output tensor + for op in net_def.op: + if not self.op_need_optimize_memory(op): + continue + for output in op.output: + tensor_name = output + if tensor_name in consumers: + self.input_ref_counter[tensor_name] = \ + len(consumers[tensor_name]) + else: + self.input_ref_counter[tensor_name] = 0 + + def 
op_need_optimize_memory(self, op): + return True + + def get_op_mem_block(self, op_type, output_shape, output_type): + data_type_size = 4 + if output_type == mace_pb2.DT_UINT8: + data_type_size = 1 + return MemoryBlock(mace_pb2.CPU_BUFFER, + [reduce(operator.mul, output_shape, 1) * + data_type_size]) + + def mem_size(self, memory_block): + return memory_block.block[0] + + def sub_mem_block(self, mem_block1, mem_block2): + return self.mem_size(mem_block1) - self.mem_size(mem_block2) + + def resize_mem_block(self, old_mem_block, op_mem_block): + return MemoryBlock( + old_mem_block.mem_type, + [max(old_mem_block.block[0], op_mem_block.block[0])]) + + def add_net_mem_blocks(self): + for mem in self.mem_block: + arena = self.net_def.mem_arena + block = arena.mem_block.add() + block.mem_id = mem + block.device_type = DeviceType.CPU.value + block.mem_type = self.mem_block[mem].mem_type + block.x = self.mem_block[mem].block[0] + block.y = 1 + + def get_total_origin_mem_size(self): + origin_mem_size = 0 + for op in self.net_def.op: + if not self.op_need_optimize_memory(op): + continue + origin_mem_size += reduce(operator.mul, + op.output_shape[0].dims, + 1) + return origin_mem_size + + def get_total_optimized_mem_size(self): + optimized_mem_size = 0 + for mem in self.mem_block: + print(mem, MemoryTypeToStr(self.mem_block[mem].mem_type), + self.mem_block[mem].block) + optimized_mem_size += self.mem_size(self.mem_block[mem]) + return optimized_mem_size + + @staticmethod + def is_memory_reuse_op(op): + return op.type == 'Reshape' or op.type == 'Identity' \ + or op.type == 'Squeeze' or op.type == 'ExpandDims' + + def optimize(self): + for op in self.net_def.op: + if not self.op_need_optimize_memory(op): + continue + if not op.output_shape: + six.print_("WARNING: There is no output shape information to " + "do memory optimization. 
%s (%s)" % + (op.name, op.type), file=sys.stderr) + return + if len(op.output_shape) != len(op.output): + six.print_('WARNING: the number of output shape is ' + 'not equal to the number of output.', + file=sys.stderr) + return + for i in range(len(op.output)): + if self.is_memory_reuse_op(op): + # make these ops reuse memory of input tensor + mem_id = self.op_mem.get(op.input[0], -1) + else: + output_type = mace_pb2.DT_FLOAT + for arg in op.arg: + if arg.name == 'T': + output_type = arg.i + if len(op.output_type) > i: + output_type = op.output_type[i] + op_mem_block = self.get_op_mem_block( + op.type, + op.output_shape[i].dims, + output_type) + mem_id = -1 + if len(self.idle_mem) > 0: + best_mem_add_size = six.MAXSIZE + best_mem_waste_size = six.MAXSIZE + for mid in self.idle_mem: + old_mem_block = self.mem_block[mid] + if old_mem_block.mem_type != op_mem_block.mem_type: + continue + new_mem_block = self.resize_mem_block( + old_mem_block, op_mem_block) + add_mem_size = self.sub_mem_block(new_mem_block, + old_mem_block) + waste_mem_size = self.sub_mem_block(new_mem_block, + op_mem_block) + + # minimize add_mem_size; if best_mem_add_size is 0, + # then minimize waste_mem_size + if (best_mem_add_size > 0 and + add_mem_size < best_mem_add_size) \ + or (best_mem_add_size == 0 and + waste_mem_size < best_mem_waste_size): + best_mem_id = mid + best_mem_add_size = add_mem_size + best_mem_waste_size = waste_mem_size + best_mem_block = new_mem_block + + # if add mem size < op mem size, then reuse it + if best_mem_add_size <= self.mem_size(op_mem_block): + self.mem_block[best_mem_id] = best_mem_block + mem_id = best_mem_id + self.idle_mem.remove(mem_id) + + if mem_id == -1: + mem_id = self.total_mem_count + self.total_mem_count += 1 + self.mem_block[mem_id] = op_mem_block + + if mem_id != -1: + op.mem_id.extend([mem_id]) + self.op_mem[op.output[i]] = mem_id + if mem_id not in self.mem_ref_counter: + self.mem_ref_counter[mem_id] = 1 + else: + self.mem_ref_counter[mem_id] += 1 
+ + # de-ref input tensor mem + for idx in six.moves.range(len(op.input)): + ipt = op.input[idx] + if ipt in self.input_ref_counter: + self.input_ref_counter[ipt] -= 1 + if self.input_ref_counter[ipt] == 0 \ + and ipt in self.op_mem: + mem_id = self.op_mem[ipt] + self.mem_ref_counter[mem_id] -= 1 + if self.mem_ref_counter[mem_id] == 0: + self.idle_mem.add(self.op_mem[ipt]) + elif self.input_ref_counter[ipt] < 0: + raise Exception('ref count is less than 0') + + self.add_net_mem_blocks() + + print("total op: %d" % len(self.net_def.op)) + print("origin mem: %d, optimized mem: %d" % ( + self.get_total_origin_mem_size(), + self.get_total_optimized_mem_size())) + + +class GPUMemoryOptimizer(MemoryOptimizer): + def op_need_optimize_memory(self, op): + if op.type == MaceKeyword.mace_buffer_transform: + for arg in op.arg: + if arg.name == 'mode' and arg.i == 0: + return False + return op.type != MaceKeyword.mace_buffer_inverse_transform + + def get_op_image_mem_block(self, op_type, output_shape): + if op_type == 'WinogradTransform' or op_type == 'MatMul': + buffer_shape = list(output_shape) + [1] + mem_block = MemoryBlock( + mace_pb2.GPU_IMAGE, + calculate_image_shape(OpenCLBufferType.IN_OUT_HEIGHT, + buffer_shape)) + elif op_type in ['Shape', + 'InferConv2dShape', + 'StridedSlice', + 'Stack', + 'ScalarMath']: + if len(output_shape) == 1: + mem_block = MemoryBlock(mace_pb2.CPU_BUFFER, + [output_shape[0], 1]) + elif len(output_shape) == 0: + mem_block = MemoryBlock(mace_pb2.CPU_BUFFER, + [1, 1]) + else: + raise Exception('%s output shape dim size is not 0 or 1.' % + op_type) + else: + if len(output_shape) == 2: # only support fc/softmax + buffer_shape = [output_shape[0], output_shape[1]] + elif len(output_shape) == 4: + buffer_shape = output_shape + else: + raise Exception('%s output shape dim size is not 2 or 4.' 
% + op_type) + mem_block = MemoryBlock( + mace_pb2.GPU_IMAGE, + calculate_image_shape(OpenCLBufferType.IN_OUT_CHANNEL, + buffer_shape)) + return mem_block + + def get_op_buffer_mem_block(self, output_shape): + return MemoryBlock(mace_pb2.GPU_BUFFER, + [reduce(operator.mul, output_shape, 1), 1]) + + def get_op_mem_block(self, op_type, output_shape, output_type): + if self.cl_mem_type == mace_pb2.GPU_IMAGE: + return self.get_op_image_mem_block(op_type, output_shape) + else: + return self.get_op_buffer_mem_block(output_shape) + + def mem_size(self, memory_block): + if memory_block.mem_type == mace_pb2.GPU_IMAGE: + return memory_block.block[0] * memory_block.block[1] * 4 + else: + return memory_block.block[0] + + def resize_mem_block(self, old_mem_block, op_mem_block): + resize_mem_block = MemoryBlock( + old_mem_block.mem_type, + [ + max(old_mem_block.block[0], op_mem_block.block[0]), + max(old_mem_block.block[1], op_mem_block.block[1]) + ]) + + return resize_mem_block + + def add_net_mem_blocks(self): + max_image_size_x = 0 + max_image_size_y = 0 + for mem in self.mem_block: + arena = self.net_def.mem_arena + block = arena.mem_block.add() + block.mem_id = mem + block.device_type = DeviceType.GPU.value + block.mem_type = self.mem_block[mem].mem_type + block.x = self.mem_block[mem].block[0] + block.y = self.mem_block[mem].block[1] + if self.mem_block[mem].mem_type == mace_pb2.GPU_IMAGE: + max_image_size_x = max(max_image_size_x, block.x) + max_image_size_y = max(max_image_size_y, block.y) + + if self.cl_mem_type == mace_pb2.GPU_IMAGE: + # Update OpenCL max image size + net_ocl_max_img_size_arg = None + for arg in self.net_def.arg: + if arg.name == cvt.MaceKeyword.mace_opencl_max_image_size: + net_ocl_max_img_size_arg = arg + max_image_size_x = max(arg.ints[0], max_image_size_x) + max_image_size_y = max(arg.ints[1], max_image_size_y) + break + if net_ocl_max_img_size_arg is None: + net_ocl_max_img_size_arg = self.net_def.arg.add() + net_ocl_max_img_size_arg.name = \ + 
cvt.MaceKeyword.mace_opencl_max_image_size + + net_ocl_max_img_size_arg.ints[:] = [max_image_size_x, + max_image_size_y] + + +def optimize_gpu_memory(net_def): + mem_optimizer = GPUMemoryOptimizer(net_def) + mem_optimizer.optimize() + + +def optimize_cpu_memory(net_def): + mem_optimizer = MemoryOptimizer(net_def) + mem_optimizer.optimize() diff --git a/mace/python/tools/model_saver.py b/mace/python/tools/model_saver.py index 95c79657..217b25b6 100644 --- a/mace/python/tools/model_saver.py +++ b/mace/python/tools/model_saver.py @@ -14,6 +14,7 @@ import datetime import os +import six import uuid import numpy as np import hashlib @@ -34,8 +35,8 @@ class ModelFormat(object): def generate_obfuscated_name(namespace, name): md5 = hashlib.md5() - md5.update(namespace) - md5.update(name) + md5.update(six.b(namespace)) + md5.update(six.b(name)) md5_digest = md5.hexdigest() name = md5_digest[:8] diff --git a/mace/utils/quantize.h b/mace/utils/quantize.h index 0755e708..baf07708 100644 --- a/mace/utils/quantize.h +++ b/mace/utils/quantize.h @@ -15,8 +15,9 @@ #ifndef MACE_UTILS_QUANTIZE_H_ #define MACE_UTILS_QUANTIZE_H_ -#include #include +#include +#include namespace mace { diff --git a/tools/bazel.rc b/tools/bazel.rc index 1863738e..15273b31 100644 --- a/tools/bazel.rc +++ b/tools/bazel.rc @@ -21,30 +21,29 @@ build:android --config=cross_compile build:android --crosstool_top=//external:android/crosstool build:android --host_crosstool_top=@bazel_tools//tools/cpp:toolchain -# Usage example: bazel build --config arm_linux -build:arm_linux --config=cross_compile -build:arm_linux --crosstool_top=//tools/arm_compiler:toolchain -build:arm_linux --host_crosstool_top=@bazel_tools//tools/cpp:toolchain -build:arm_linux --cpu=armeabi-v7a -build:arm_linux --copt -mfloat-abi=hard -build:arm_linux --copt -mfpu=neon -build:arm_linux --copt -Wno-ignored-attributes -build:arm_linux --copt -Wno-unused-function -build:arm_linux --copt -Wno-sequence-point -build:arm_linux --copt 
-Wno-implicit-fallthrough -build:arm_linux --copt -Wno-psabi +# Usage example: bazel build --config arm_linux_gnueabihf +build:arm_linux_gnueabihf --config=cross_compile +build:arm_linux_gnueabihf --crosstool_top=//tools/arm_compiler:toolchain +build:arm_linux_gnueabihf --host_crosstool_top=@bazel_tools//tools/cpp:toolchain +build:arm_linux_gnueabihf --cpu=armeabi-v7a +build:arm_linux_gnueabihf --copt -mfloat-abi=hard +build:arm_linux_gnueabihf --copt -mfpu=neon +build:arm_linux_gnueabihf --copt -Wno-ignored-attributes +build:arm_linux_gnueabihf --copt -Wno-unused-function +build:arm_linux_gnueabihf --copt -Wno-sequence-point +build:arm_linux_gnueabihf --copt -Wno-implicit-fallthrough -# Usage example: bazel build --config aarch64_linux -build:aarch64_linux --config=cross_compile -build:aarch64_linux --crosstool_top=//tools/aarch64_compiler:toolchain -build:aarch64_linux --host_crosstool_top=@bazel_tools//tools/cpp:toolchain -build:aarch64_linux --cpu=aarch64 -build:aarch64_linux --copt -Wno-ignored-attributes -build:aarch64_linux --copt -Wno-unused-function -build:aarch64_linux --copt -Wno-sequence-point -build:aarch64_linux --copt -Wno-implicit-fallthrough +# Usage example: bazel build --config aarch64_linux_gnu +build:aarch64_linux_gnu --config=cross_compile +build:aarch64_linux_gnu --crosstool_top=//tools/aarch64_compiler:toolchain +build:aarch64_linux_gnu --host_crosstool_top=@bazel_tools//tools/cpp:toolchain +build:aarch64_linux_gnu --cpu=aarch64 +build:aarch64_linux_gnu --copt -Wno-ignored-attributes +build:aarch64_linux_gnu --copt -Wno-unused-function +build:aarch64_linux_gnu --copt -Wno-sequence-point +build:aarch64_linux_gnu --copt -Wno-implicit-fallthrough -# Usage example: bazel build --config optimization +# Usage example: bazel build --config optimization build:optimization -c opt build:optimization --copt=-O3 build:optimization --linkopt=-Wl,--strip-all diff --git a/tools/bazel_adb_run.py b/tools/bazel_adb_run.py index 12cdd20f..6906015c 100644 --- 
a/tools/bazel_adb_run.py +++ b/tools/bazel_adb_run.py @@ -26,9 +26,9 @@ import sys import sh_commands +from common import * -def stdout_processor(stdout, device_properties, abi): - pass +from device import DeviceWrapper, DeviceManager def unittest_stdout_processor(stdout, device_properties, abi): @@ -39,7 +39,7 @@ def unittest_stdout_processor(stdout, device_properties, abi): raise Exception("Command failed") -def ops_benchmark_stdout_processor(stdout, device_properties, abi): +def ops_benchmark_stdout_processor(stdout, dev, abi): stdout_lines = stdout.split("\n") metrics = {} for line in stdout_lines: @@ -52,8 +52,8 @@ def ops_benchmark_stdout_processor(stdout, device_properties, abi): metrics["%s.input_mb_per_sec" % parts[0]] = parts[3] metrics["%s.gmacc_per_sec" % parts[0]] = parts[4] - platform = device_properties["ro.board.platform"].replace(" ", "-") - model = device_properties["ro.product.model"].replace(" ", "-") + platform = dev[YAMLKeyword.target_socs] + model = dev[YAMLKeyword.models] tags = { "ro.board.platform": platform, "ro.product.model": model, @@ -87,7 +87,7 @@ def parse_args(): type=str, default="all", help="SoCs (ro.board.platform from getprop) to build, " - "comma seperated list or all/random") + "comma seperated list or all/random") parser.add_argument( "--target", type=str, default="//...", help="Bazel target to build") parser.add_argument( @@ -115,14 +115,22 @@ def parse_args(): type=str2bool, default=False, help="Whether to use simpleperf stat") + parser.add_argument( + '--device_yml', + type=str, + default='', + help='embedded linux device config yml file' + ) return parser.parse_known_args() def main(unused_args): target_socs = None + target_devices = DeviceManager.list_devices(FLAGS.device_yml) if FLAGS.target_socs != "all" and FLAGS.target_socs != "random": target_socs = set(FLAGS.target_socs.split(',')) - target_devices = sh_commands.get_target_socs_serialnos(target_socs) + target_devices = [dev for dev in target_devices + if 
dev[YAMLKeyword.target_socs] in target_socs] if FLAGS.target_socs == "random": unlocked_devices = \ [d for d in target_devices if not sh_commands.is_device_locked(d)] @@ -136,31 +144,29 @@ def main(unused_args): target_abis = FLAGS.target_abis.split(',') for target_abi in target_abis: + toolchain = infer_toolchain(target_abi) sh_commands.bazel_build(target, abi=target_abi, + toolchain=toolchain, enable_neon=FLAGS.enable_neon, address_sanitizer=FLAGS.address_sanitizer) if FLAGS.run_target: - for serialno in target_devices: - if target_abi not in set( - sh_commands.adb_supported_abis(serialno)): + for dev in target_devices: + if target_abi not in dev[YAMLKeyword.target_abis]: print("Skip device %s which does not support ABI %s" % - (serialno, target_abi)) + (dev, target_abi)) continue - stdouts = sh_commands.adb_run( + device_wrapper = DeviceWrapper(dev) + stdouts = device_wrapper.run( target_abi, - serialno, host_bin_path, bin_name, args=FLAGS.args, opencl_profiling=True, vlog_level=0, - device_bin_path="/data/local/tmp/mace", out_of_range_check=True, address_sanitizer=FLAGS.address_sanitizer, simpleperf=FLAGS.simpleperf) - device_properties = sh_commands.adb_getprop_by_serialno( - serialno) - globals()[FLAGS.stdout_processor](stdouts, device_properties, + globals()[FLAGS.stdout_processor](stdouts, dev, target_abi) diff --git a/tools/build-standalone-lib.sh b/tools/build-standalone-lib.sh index 34866596..24cba4cf 100755 --- a/tools/build-standalone-lib.sh +++ b/tools/build-standalone-lib.sh @@ -22,6 +22,14 @@ mkdir -p $LIB_DIR/arm64-v8a/cpu_gpu rm -rf $LIB_DIR/linux-x86-64 mkdir -p $LIB_DIR/linux-x86-64 +rm -rf $LIB_DIR/arm_linux_gnueabihf +mkdir -p $LIB_DIR/arm_linux_gnueabihf/cpu_gpu + +rm -rf $LIB_DIR/aarch64_linux_gnu +mkdir -p $LIB_DIR/aarch64_linux_gnu/cpu_gpu + + + # build shared libraries echo "build shared lib for armeabi-v7a + cpu_gpu_dsp" bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define 
openmp=true --define opencl=true --define hexagon=true --define quantize=true --cpu=armeabi-v7a @@ -36,6 +44,14 @@ echo "build shared lib for arm64-v8a + cpu_gpu" bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=true --define opencl=true --define quantize=true --cpu=arm64-v8a cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/arm64-v8a/cpu_gpu/ +echo "build shared lib for arm_linux_gnueabihf + cpu_gpu" +bazel build --config arm_linux_gnueabihf --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=true --define opencl=true +cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/arm_linux_gnueabihf/cpu_gpu/ + +echo "build shared lib for aarch64_linux_gnu + cpu_gpu" +bazel build --config aarch64_linux_gnu --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=true --define opencl=true +cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/aarch64_linux_gnu/cpu_gpu/ + if [[ "$OSTYPE" != "darwin"* ]];then echo "build shared lib for linux-x86-64" bazel build mace/libmace:libmace_dynamic --config optimization --define quantize=true --define openmp=true @@ -56,6 +72,14 @@ echo "build static lib for arm64-v8a + cpu_gpu" bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=true --define opencl=true --define quantize=true --cpu=arm64-v8a cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/arm64-v8a/cpu_gpu/ +echo "build static lib for arm_linux_gnueabihf + cpu_gpu" +bazel build --config arm_linux_gnueabihf --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=true --define opencl=true +cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/arm_linux_gnueabihf/cpu_gpu/ + +echo "build static lib for aarch64_linux_gnu + cpu_gpu" +bazel build --config aarch64_linux_gnu --config optimization mace/libmace:libmace_static --config symbol_hidden 
def md5sum(s):
    """Return the hex MD5 digest of string *s* (used for cache-key paths)."""
    md5 = hashlib.md5()
    md5.update(s.encode('utf-8'))
    return md5.hexdigest()


def get_build_binary_dir(library_name, target_abi):
    """Return the temp binary dir for *library_name* built for *target_abi*."""
    return "%s/%s/%s/%s" % (
        BUILD_OUTPUT_DIR, library_name, BUILD_TMP_DIR_NAME, target_abi)


def get_model_lib_output_path(library_name, abi):
    """Return the path of the generated model static library (.a)."""
    lib_output_path = os.path.join(BUILD_OUTPUT_DIR, library_name,
                                   MODEL_OUTPUT_DIR_NAME, abi,
                                   "%s.a" % library_name)
    return lib_output_path


def check_model_converted(library_name, model_name,
                          model_graph_format, model_data_format,
                          abi):
    """Abort (via mace_check) unless the converted model artifacts exist.

    Checks the .pb (file graph format) or the model static lib (code
    format), plus the .data file when the data format is 'file'.
    """
    model_output_dir = \
        '%s/%s/%s' % (BUILD_OUTPUT_DIR, library_name, MODEL_OUTPUT_DIR_NAME)
    if model_graph_format == ModelFormat.file:
        mace_check(os.path.exists("%s/%s.pb" % (model_output_dir, model_name)),
                   ModuleName.RUN,
                   "You should convert model first.")
    else:
        model_lib_path = get_model_lib_output_path(library_name, abi)
        mace_check(os.path.exists(model_lib_path),
                   ModuleName.RUN,
                   "You should convert model first.")
    if model_data_format == ModelFormat.file:
        mace_check(os.path.exists("%s/%s.data" %
                                  (model_output_dir, model_name)),
                   ModuleName.RUN,
                   "You should convert model first.")


def parse_device_type(runtime):
    """Map a RuntimeType string to a DeviceType constant.

    Returns "" for unrecognized runtimes (including 'cpu+gpu', which is
    expanded by callers before reaching here).
    """
    device_type = ""
    if runtime == RuntimeType.dsp:
        device_type = DeviceType.HEXAGON
    elif runtime == RuntimeType.gpu:
        device_type = DeviceType.GPU
    elif runtime == RuntimeType.cpu:
        device_type = DeviceType.CPU
    return device_type


def sha256_checksum(fname):
    """Return the hex SHA-256 digest of file *fname*, read in 4 KiB chunks."""
    hash_func = hashlib.sha256()
    with open(fname, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_func.update(chunk)
    return hash_func.hexdigest()


def get_model_files(model_file_path,
                    model_sha256_checksum,
                    model_output_dir,
                    weight_file_path="",
                    weight_sha256_checksum=""):
    """Resolve (and download if remote) the model graph and weight files.

    http(s) paths are cached under *model_output_dir* keyed by the URL's
    MD5; local paths are used as-is. Both files are verified against the
    given SHA-256 checksums (errors are reported via MaceLogger.error).

    Returns:
        (model_file, weight_file) local paths; weight_file may be "".
    """
    model_file = model_file_path
    weight_file = weight_file_path

    if model_file_path.startswith("http://") or \
            model_file_path.startswith("https://"):
        # FIX: the download cache dir may not exist on a fresh checkout;
        # urlretrieve does not create parent directories.
        if not os.path.exists(model_output_dir):
            os.makedirs(model_output_dir)
        model_file = model_output_dir + "/" + md5sum(model_file_path) + ".pb"
        if not os.path.exists(model_file) or \
                sha256_checksum(model_file) != model_sha256_checksum:
            MaceLogger.info("Downloading model, please wait ...")
            six.moves.urllib.request.urlretrieve(model_file_path, model_file)
            MaceLogger.info("Model downloaded successfully.")

    if sha256_checksum(model_file) != model_sha256_checksum:
        MaceLogger.error(ModuleName.MODEL_CONVERTER,
                         "model file sha256checksum not match")

    if weight_file_path.startswith("http://") or \
            weight_file_path.startswith("https://"):
        if not os.path.exists(model_output_dir):
            os.makedirs(model_output_dir)
        weight_file = \
            model_output_dir + "/" + md5sum(weight_file_path) + ".caffemodel"
        if not os.path.exists(weight_file) or \
                sha256_checksum(weight_file) != weight_sha256_checksum:
            MaceLogger.info("Downloading model weight, please wait ...")
            six.moves.urllib.request.urlretrieve(weight_file_path, weight_file)
            MaceLogger.info("Model weight downloaded successfully.")

    if weight_file:
        if sha256_checksum(weight_file) != weight_sha256_checksum:
            MaceLogger.error(ModuleName.MODEL_CONVERTER,
                             "weight file sha256checksum not match")

    return model_file, weight_file


def get_opencl_binary_output_path(library_name, target_abi, device):
    """Return the output path of the compiled OpenCL kernel binary
    for *device* (named by its model and SoC)."""
    target_soc = device.target_socs
    device_model = device.models
    return '%s/%s/%s/%s/%s_%s.%s.%s.bin' % \
           (BUILD_OUTPUT_DIR,
            library_name,
            OUTPUT_OPENCL_BINARY_DIR_NAME,
            target_abi,
            library_name,
            OUTPUT_OPENCL_BINARY_FILE_NAME,
            device_model,
            target_soc)


def get_opencl_parameter_output_path(library_name, target_abi, device):
    """Return the output path of the tuned OpenCL parameter file
    for *device* (named by its model and SoC)."""
    target_soc = device.target_socs
    device_model = device.models
    return '%s/%s/%s/%s/%s_%s.%s.%s.bin' % \
           (BUILD_OUTPUT_DIR,
            library_name,
            OUTPUT_OPENCL_BINARY_DIR_NAME,
            target_abi,
            library_name,
            OUTPUT_OPENCL_PARAMETER_FILE_NAME,
            device_model,
            target_soc)


def get_build_model_dirs(library_name,
                         model_name,
                         target_abi,
                         device,
                         model_file_path):
    """Compute the per-model build directories.

    Returns:
        (model_output_base_dir, model_output_dir, mace_model_dir).
        The output dir is host-, general-, or device-specific depending
        on target_abi and whether the device has a SoC and an address.
    """
    models = device.models
    target_socs = device.target_socs
    model_path_digest = md5sum(model_file_path)
    model_output_base_dir = '{}/{}/{}/{}/{}'.format(
        BUILD_OUTPUT_DIR, library_name, BUILD_TMP_DIR_NAME,
        model_name, model_path_digest)

    if target_abi == ABIType.host:
        model_output_dir = '%s/%s' % (model_output_base_dir, target_abi)
    elif not target_socs or not device.address:
        model_output_dir = '%s/%s/%s' % (model_output_base_dir,
                                         BUILD_TMP_GENERAL_OUTPUT_DIR_NAME,
                                         target_abi)
    else:
        model_output_dir = '{}/{}_{}/{}'.format(
            model_output_base_dir,
            models,
            target_socs,
            target_abi
        )

    mace_model_dir = '{}/{}/{}'.format(
        BUILD_OUTPUT_DIR, library_name, MODEL_OUTPUT_DIR_NAME
    )

    return model_output_base_dir, model_output_dir, mace_model_dir


def abi_to_internal(abi):
    """Map a user-facing ABI name to the internal (bazel) ABI name.

    NOTE(review): falls through and returns None for ABIType.host and
    unknown ABIs — callers appear to only pass device ABIs; confirm.
    """
    if abi in [ABIType.armeabi_v7a, ABIType.arm64_v8a]:
        return abi
    if abi == ABIType.arm64:
        return ABIType.aarch64
    if abi == ABIType.armhf:
        return ABIType.armeabi_v7a


def infer_toolchain(abi):
    """Infer the cross-compile toolchain for *abi* ('' for host/unknown)."""
    if abi in [ABIType.armeabi_v7a, ABIType.arm64_v8a]:
        return ToolchainType.android
    if abi == ABIType.armhf:
        return ToolchainType.arm_linux_gnueabihf
    if abi == ABIType.arm64:
        return ToolchainType.aarch64_linux_gnu
    return ''


################################
# YAML key word
################################
class YAMLKeyword(object):
    # String constants for every key recognized in the deployment yml.
    library_name = 'library_name'
    target_abis = 'target_abis'
    target_socs = 'target_socs'
    model_graph_format = 'model_graph_format'
    model_data_format = 'model_data_format'
    models = 'models'
    platform = 'platform'
    device_name = 'device_name'
    system = 'system'
    address = 'address'
    username = 'username'
    password = 'password'
    model_file_path = 'model_file_path'
    model_sha256_checksum = 'model_sha256_checksum'
    weight_file_path = 'weight_file_path'
    weight_sha256_checksum = 'weight_sha256_checksum'
    subgraphs = 'subgraphs'
    input_tensors = 'input_tensors'
    input_shapes = 'input_shapes'
    input_ranges = 'input_ranges'
    output_tensors = 'output_tensors'
    output_shapes = 'output_shapes'
    check_tensors = 'check_tensors'
    check_shapes = 'check_shapes'
    runtime = 'runtime'
    data_type = 'data_type'
    input_data_types = 'input_data_types'
    input_data_formats = 'input_data_formats'
    output_data_formats = 'output_data_formats'
    limit_opencl_kernel_time = 'limit_opencl_kernel_time'
    nnlib_graph_mode = 'nnlib_graph_mode'
    obfuscate = 'obfuscate'
    winograd = 'winograd'
    quantize = 'quantize'
    quantize_range_file = 'quantize_range_file'
    change_concat_ranges = 'change_concat_ranges'
    validation_inputs_data = 'validation_inputs_data'
    validation_threshold = 'validation_threshold'
    graph_optimize_options = 'graph_optimize_options'  # internal use for now
    cl_mem_type = 'cl_mem_type'


################################
# SystemType
################################
class SystemType:
    # Operating system kind of a target device.
    host = 'host'
    android = 'android'
    arm_linux = 'arm_linux'


################################
# common device str
################################

PHONE_DATA_DIR = '/data/local/tmp/mace_run'
DEVICE_DATA_DIR = '/tmp/data/mace_run'
DEVICE_INTERIOR_DIR = PHONE_DATA_DIR + "/interior"
BUILD_OUTPUT_DIR = 'builds'
BUILD_TMP_DIR_NAME = '_tmp'
BUILD_DOWNLOADS_DIR = BUILD_OUTPUT_DIR + '/downloads'
BUILD_TMP_GENERAL_OUTPUT_DIR_NAME = 'general'
MODEL_OUTPUT_DIR_NAME = 'model'
EXAMPLE_STATIC_NAME = "example_static"
EXAMPLE_DYNAMIC_NAME = "example_dynamic"
EXAMPLE_STATIC_TARGET = "//mace/examples/cli:" + EXAMPLE_STATIC_NAME
EXAMPLE_DYNAMIC_TARGET = "//mace/examples/cli:" + EXAMPLE_DYNAMIC_NAME
MACE_RUN_STATIC_NAME = "mace_run_static"
MACE_RUN_DYNAMIC_NAME = "mace_run_dynamic"
MACE_RUN_STATIC_TARGET = "//mace/tools/validation:" + MACE_RUN_STATIC_NAME
MACE_RUN_DYNAMIC_TARGET = "//mace/tools/validation:" + MACE_RUN_DYNAMIC_NAME
CL_COMPILED_BINARY_FILE_NAME = "mace_cl_compiled_program.bin"
BUILD_TMP_OPENCL_BIN_DIR = 'opencl_bin'
LIBMACE_DYNAMIC_PATH = "bazel-bin/mace/libmace/libmace.so"
CL_TUNED_PARAMETER_FILE_NAME = "mace_run.config"
MODEL_HEADER_DIR_PATH = 'include/mace/public'
OUTPUT_LIBRARY_DIR_NAME = 'lib'
OUTPUT_OPENCL_BINARY_DIR_NAME = 'opencl'
OUTPUT_OPENCL_BINARY_FILE_NAME = 'compiled_opencl_kernel'
OUTPUT_OPENCL_PARAMETER_FILE_NAME = 'tuned_opencl_parameter'
CODEGEN_BASE_DIR = 'mace/codegen'
MODEL_CODEGEN_DIR = CODEGEN_BASE_DIR + '/models'
ENGINE_CODEGEN_DIR = CODEGEN_BASE_DIR + '/engine'
LIB_CODEGEN_DIR = CODEGEN_BASE_DIR + '/lib'
LIBMACE_SO_TARGET = "//mace/libmace:libmace.so"
LIBMACE_STATIC_TARGET = "//mace/libmace:libmace_static"
LIBMACE_STATIC_PATH = "bazel-genfiles/mace/libmace/libmace.a"
MODEL_LIB_TARGET = "//mace/codegen:generated_models"
MODEL_LIB_PATH = "bazel-genfiles/mace/codegen/libgenerated_models.a"
QUANTIZE_STAT_TARGET = "//mace/tools/quantization:quantize_stat"
BM_MODEL_STATIC_NAME = "benchmark_model_static"
BM_MODEL_DYNAMIC_NAME = "benchmark_model_dynamic"
BM_MODEL_STATIC_TARGET = "//mace/benchmark:" + BM_MODEL_STATIC_NAME
BM_MODEL_DYNAMIC_TARGET = "//mace/benchmark:" + BM_MODEL_DYNAMIC_NAME
ALL_SOC_TAG = 'all'


################################
# Model File Format
################################
class ModelFormat(object):
    # Whether the converted model is stored as a file or generated code.
    file = 'file'
    code = 'code'


################################
# ABI Type
################################
class ABIType(object):
    armeabi_v7a = 'armeabi-v7a'
    arm64_v8a = 'arm64-v8a'
    arm64 = 'arm64'
    aarch64 = 'aarch64'
    armhf = 'armhf'
    host = 'host'


################################
# Module name
################################
class ModuleName(object):
    # Labels used by mace_check / MaceLogger to attribute errors.
    YAML_CONFIG = 'YAML CONFIG'
    MODEL_CONVERTER = 'Model Converter'
    RUN = 'RUN'
    BENCHMARK = 'Benchmark'


#################################
# mace lib type
#################################
class MACELibType(object):
    static = 0
    dynamic = 1


#################################
# Run time type
#################################
class RuntimeType(object):
    cpu = 'cpu'
    gpu = 'gpu'
    dsp = 'dsp'
    cpu_gpu = 'cpu+gpu'


#################################
# Tool chain Type
#################################
class ToolchainType:
    android = 'android'
    arm_linux_gnueabihf = 'arm_linux_gnueabihf'
    aarch64_linux_gnu = 'aarch64_linux_gnu'
-BUILD_TMP_GENERAL_OUTPUT_DIR_NAME = 'general' -OUTPUT_LIBRARY_DIR_NAME = 'lib' -OUTPUT_OPENCL_BINARY_DIR_NAME = 'opencl' -OUTPUT_OPENCL_BINARY_FILE_NAME = 'compiled_opencl_kernel' -OUTPUT_OPENCL_PARAMETER_FILE_NAME = 'tuned_opencl_parameter' -CL_COMPILED_BINARY_FILE_NAME = "mace_cl_compiled_program.bin" -CL_TUNED_PARAMETER_FILE_NAME = "mace_run.config" -CODEGEN_BASE_DIR = 'mace/codegen' -MODEL_CODEGEN_DIR = CODEGEN_BASE_DIR + '/models' -ENGINE_CODEGEN_DIR = CODEGEN_BASE_DIR + '/engine' -LIB_CODEGEN_DIR = CODEGEN_BASE_DIR + '/lib' -LIBMACE_SO_TARGET = "//mace/libmace:libmace.so" -LIBMACE_STATIC_TARGET = "//mace/libmace:libmace_static" -LIBMACE_STATIC_PATH = "bazel-genfiles/mace/libmace/libmace.a" -LIBMACE_DYNAMIC_PATH = "bazel-bin/mace/libmace/libmace.so" -MODEL_LIB_TARGET = "//mace/codegen:generated_models" -MODEL_LIB_PATH = "bazel-genfiles/mace/codegen/libgenerated_models.a" -MACE_RUN_STATIC_NAME = "mace_run_static" -MACE_RUN_DYNAMIC_NAME = "mace_run_dynamic" -MACE_RUN_STATIC_TARGET = "//mace/tools/validation:" + MACE_RUN_STATIC_NAME -MACE_RUN_DYNAMIC_TARGET = "//mace/tools/validation:" + MACE_RUN_DYNAMIC_NAME -EXAMPLE_STATIC_NAME = "example_static" -EXAMPLE_DYNAMIC_NAME = "example_dynamic" -EXAMPLE_STATIC_TARGET = "//mace/examples/cli:" + EXAMPLE_STATIC_NAME -EXAMPLE_DYNAMIC_TARGET = "//mace/examples/cli:" + EXAMPLE_DYNAMIC_NAME -BM_MODEL_STATIC_NAME = "benchmark_model_static" -BM_MODEL_DYNAMIC_NAME = "benchmark_model_dynamic" -BM_MODEL_STATIC_TARGET = "//mace/benchmark:" + BM_MODEL_STATIC_NAME -BM_MODEL_DYNAMIC_TARGET = "//mace/benchmark:" + BM_MODEL_DYNAMIC_NAME -DEVICE_INTERIOR_DIR = PHONE_DATA_DIR + "/interior" -BUILD_TMP_OPENCL_BIN_DIR = 'opencl_bin' -ALL_SOC_TAG = 'all' ABITypeStrs = [ 'armeabi-v7a', 'arm64-v8a', + 'arm64', + 'armhf', 'host', ] - -class ABIType(object): - armeabi_v7a = 'armeabi-v7a' - arm64_v8a = 'arm64-v8a' - host = 'host' - - ModelFormatStrs = [ "file", "code", ] - -class MACELibType(object): - static = 0 - dynamic = 1 - - 
PlatformTypeStrs = [ "tensorflow", "caffe", @@ -121,14 +66,6 @@ RuntimeTypeStrs = [ "cpu+gpu" ] - -class RuntimeType(object): - cpu = 'cpu' - gpu = 'gpu' - dsp = 'dsp' - cpu_gpu = 'cpu+gpu' - - InputDataTypeStrs = [ "int32", "float32", @@ -174,51 +111,6 @@ class DefaultValues(object): gpu_priority_hint = 3, -class YAMLKeyword(object): - library_name = 'library_name' - target_abis = 'target_abis' - target_socs = 'target_socs' - model_graph_format = 'model_graph_format' - model_data_format = 'model_data_format' - models = 'models' - platform = 'platform' - model_file_path = 'model_file_path' - model_sha256_checksum = 'model_sha256_checksum' - weight_file_path = 'weight_file_path' - weight_sha256_checksum = 'weight_sha256_checksum' - subgraphs = 'subgraphs' - input_tensors = 'input_tensors' - input_shapes = 'input_shapes' - input_ranges = 'input_ranges' - output_tensors = 'output_tensors' - output_shapes = 'output_shapes' - check_tensors = 'check_tensors' - check_shapes = 'check_shapes' - runtime = 'runtime' - data_type = 'data_type' - input_data_types = 'input_data_types' - input_data_formats = 'input_data_formats' - output_data_formats = 'output_data_formats' - limit_opencl_kernel_time = 'limit_opencl_kernel_time' - nnlib_graph_mode = 'nnlib_graph_mode' - obfuscate = 'obfuscate' - winograd = 'winograd' - quantize = 'quantize' - quantize_range_file = 'quantize_range_file' - change_concat_ranges = 'change_concat_ranges' - validation_inputs_data = 'validation_inputs_data' - validation_threshold = 'validation_threshold' - graph_optimize_options = 'graph_optimize_options' # internal use for now - cl_mem_type = 'cl_mem_type' - - -class ModuleName(object): - YAML_CONFIG = 'YAML CONFIG' - MODEL_CONVERTER = 'Model Converter' - RUN = 'RUN' - BENCHMARK = 'Benchmark' - - CPP_KEYWORDS = [ 'alignas', 'alignof', 'and', 'and_eq', 'asm', 'atomic_cancel', 'atomic_commit', 'atomic_noexcept', 'auto', 'bitand', 'bitor', @@ -260,7 +152,7 @@ def parse_device_type(runtime): def 
get_hexagon_mode(configs): runtime_list = [] for model_name in configs[YAMLKeyword.models]: - model_runtime =\ + model_runtime = \ configs[YAMLKeyword.models][model_name].get( YAMLKeyword.runtime, "") runtime_list.append(model_runtime.lower()) @@ -273,7 +165,7 @@ def get_hexagon_mode(configs): def get_opencl_mode(configs): runtime_list = [] for model_name in configs[YAMLKeyword.models]: - model_runtime =\ + model_runtime = \ configs[YAMLKeyword.models][model_name].get( YAMLKeyword.runtime, "") runtime_list.append(model_runtime.lower()) @@ -331,7 +223,7 @@ def format_model_config(flags): target_socs = configs.get(YAMLKeyword.target_socs, "") if flags.target_socs: configs[YAMLKeyword.target_socs] = \ - [soc.lower() for soc in flags.target_socs.split(',')] + [soc.lower() for soc in flags.target_socs.split(',')] elif not target_socs: configs[YAMLKeyword.target_socs] = [] elif not isinstance(target_socs, list): @@ -347,7 +239,9 @@ def format_model_config(flags): if ALL_SOC_TAG in target_socs: mace_check(available_socs, ModuleName.YAML_CONFIG, - "Build for all SOCs plugged in computer, " + "Android abi is listed in config file and " + "build for all SOCs plugged in computer, " + "But no android phone found, " "you at least plug in one phone") else: for soc in target_socs: @@ -412,7 +306,7 @@ def format_model_config(flags): weight_file_path = model_config.get(YAMLKeyword.weight_file_path, "") if weight_file_path: - weight_checksum =\ + weight_checksum = \ model_config.get(YAMLKeyword.weight_sha256_checksum, "") mace_check(weight_checksum != "", ModuleName.YAML_CONFIG, "'%s' is necessary" % @@ -538,14 +432,14 @@ def format_model_config(flags): YAMLKeyword.validation_threshold, {}) if not isinstance(validation_threshold, dict): raise argparse.ArgumentTypeError( - 'similarity threshold must be a dict.') + 'similarity threshold must be a dict.') threshold_dict = { - DeviceType.CPU: 0.999, - DeviceType.GPU: 0.995, - DeviceType.HEXAGON: 0.930, - DeviceType.CPU + "_QUANTIZE": 
0.980, - } + DeviceType.CPU: 0.999, + DeviceType.GPU: 0.995, + DeviceType.HEXAGON: 0.930, + DeviceType.CPU + "_QUANTIZE": 0.980, + } for k, v in six.iteritems(validation_threshold): if k.upper() == 'DSP': k = DeviceType.HEXAGON @@ -554,7 +448,7 @@ def format_model_config(flags): DeviceType.HEXAGON, DeviceType.CPU + "_QUANTIZE"): raise argparse.ArgumentTypeError( - 'Unsupported validation threshold runtime: %s' % k) + 'Unsupported validation threshold runtime: %s' % k) threshold_dict[k.upper()] = v subgraph[YAMLKeyword.validation_threshold] = threshold_dict @@ -573,7 +467,7 @@ def format_model_config(flags): subgraph[YAMLKeyword.input_ranges] = [input_ranges] else: subgraph[YAMLKeyword.input_ranges] = input_ranges - subgraph[YAMLKeyword.input_ranges] =\ + subgraph[YAMLKeyword.input_ranges] = \ [str(v) for v in subgraph[YAMLKeyword.input_ranges]] for key in [YAMLKeyword.limit_opencl_kernel_time, @@ -598,67 +492,6 @@ def format_model_config(flags): return configs -def get_build_binary_dir(library_name, target_abi): - return "%s/%s/%s/%s" % ( - BUILD_OUTPUT_DIR, library_name, BUILD_TMP_DIR_NAME, target_abi) - - -def get_build_model_dirs(library_name, model_name, target_abi, target_soc, - serial_num, model_file_path): - model_path_digest = md5sum(model_file_path) - model_output_base_dir = "%s/%s/%s/%s/%s" % ( - BUILD_OUTPUT_DIR, library_name, BUILD_TMP_DIR_NAME, - model_name, model_path_digest) - - if target_abi == ABIType.host: - model_output_dir = "%s/%s" % (model_output_base_dir, target_abi) - elif not target_soc or not serial_num: - model_output_dir = "%s/%s/%s" % ( - model_output_base_dir, BUILD_TMP_GENERAL_OUTPUT_DIR_NAME, - target_abi) - else: - device_name = \ - sh_commands.adb_get_device_name_by_serialno(serial_num) - model_output_dir = "%s/%s_%s/%s" % ( - model_output_base_dir, device_name, - target_soc, target_abi) - - mace_model_dir = \ - '%s/%s/%s' % (BUILD_OUTPUT_DIR, library_name, MODEL_OUTPUT_DIR_NAME) - - return model_output_base_dir, model_output_dir, 
mace_model_dir - - -def get_opencl_binary_output_path(library_name, target_abi, - target_soc, serial_num): - device_name = \ - sh_commands.adb_get_device_name_by_serialno(serial_num) - return '%s/%s/%s/%s/%s_%s.%s.%s.bin' % \ - (BUILD_OUTPUT_DIR, - library_name, - OUTPUT_OPENCL_BINARY_DIR_NAME, - target_abi, - library_name, - OUTPUT_OPENCL_BINARY_FILE_NAME, - device_name, - target_soc) - - -def get_opencl_parameter_output_path(library_name, target_abi, - target_soc, serial_num): - device_name = \ - sh_commands.adb_get_device_name_by_serialno(serial_num) - return '%s/%s/%s/%s/%s_%s.%s.%s.bin' % \ - (BUILD_OUTPUT_DIR, - library_name, - OUTPUT_OPENCL_BINARY_DIR_NAME, - target_abi, - library_name, - OUTPUT_OPENCL_PARAMETER_FILE_NAME, - device_name, - target_soc) - - def clear_build_dirs(library_name): # make build dir if not os.path.exists(BUILD_OUTPUT_DIR): @@ -676,27 +509,6 @@ def clear_build_dirs(library_name): sh.rm('-rf', lib_output_dir) -def check_model_converted(library_name, model_name, - model_graph_format, model_data_format, - abi): - model_output_dir = \ - '%s/%s/%s' % (BUILD_OUTPUT_DIR, library_name, MODEL_OUTPUT_DIR_NAME) - if model_graph_format == ModelFormat.file: - mace_check(os.path.exists("%s/%s.pb" % (model_output_dir, model_name)), - ModuleName.RUN, - "You should convert model first.") - else: - model_lib_path = get_model_lib_output_path(library_name, abi) - mace_check(os.path.exists(model_lib_path), - ModuleName.RUN, - "You should convert model first.") - if model_data_format == ModelFormat.file: - mace_check(os.path.exists("%s/%s.data" % - (model_output_dir, model_name)), - ModuleName.RUN, - "You should convert model first.") - - ################################ # convert ################################ @@ -883,13 +695,6 @@ def convert_model(configs, cl_mem_type): StringFormatter.block("Model %s converted" % model_name)) -def get_model_lib_output_path(library_name, abi): - lib_output_path = os.path.join(BUILD_OUTPUT_DIR, library_name, - 
def build_quantize_stat(configs):
    """Build the quantize_stat host binary into the library's temp dir.

    Args:
        configs: parsed deployment yml dict (YAMLKeyword keys).
    """
    library_name = configs[YAMLKeyword.library_name]

    # Start from a clean host build dir.
    build_tmp_binary_dir = get_build_binary_dir(library_name, ABIType.host)
    if os.path.exists(build_tmp_binary_dir):
        sh.rm("-rf", build_tmp_binary_dir)
    os.makedirs(build_tmp_binary_dir)

    quantize_stat_target = QUANTIZE_STAT_TARGET
    build_arg = ""
    if configs[YAMLKeyword.model_graph_format] == ModelFormat.code:
        mace_check(os.path.exists(ENGINE_CODEGEN_DIR),
                   ModuleName.RUN,
                   "You should convert model first.")
        build_arg = "--per_file_copt=mace/tools/quantization/quantize_stat.cc@-DMODEL_GRAPH_FORMAT_CODE"  # noqa

    # FIX: original referenced `flags.toolchain`, but `flags` is not in
    # scope here (NameError at call time). Derive the toolchain the same
    # way as the other build helpers; for the host ABI this yields ''.
    sh_commands.bazel_build(
        quantize_stat_target,
        abi=ABIType.host,
        toolchain=infer_toolchain(ABIType.host),
        enable_openmp=True,
        symbol_hidden=True,
        extra_args=build_arg
    )

    quantize_stat_filepath = build_tmp_binary_dir + "/quantize_stat"
    if os.path.exists(quantize_stat_filepath):
        sh.rm("-rf", quantize_stat_filepath)
    sh.cp("-f", "bazel-bin/mace/tools/quantization/quantize_stat",
          build_tmp_binary_dir)
"/".join(sh_commands.bazel_target_to_bin(example_target)) @@ -1092,296 +935,6 @@ def build_example(configs, target_abi, enable_openmp, address_sanitizer, sh.rm("-rf", LIB_CODEGEN_DIR) -def tuning(library_name, model_name, model_config, - model_graph_format, model_data_format, - target_abi, target_soc, serial_num, - mace_lib_type): - six.print_('* Tuning, it may take some time...') - - build_tmp_binary_dir = get_build_binary_dir(library_name, target_abi) - mace_run_name = MACE_RUN_STATIC_NAME - link_dynamic = False - if mace_lib_type == MACELibType.dynamic: - mace_run_name = MACE_RUN_DYNAMIC_NAME - link_dynamic = True - - embed_model_data = model_data_format == ModelFormat.code - - model_output_base_dir, model_output_dir, mace_model_dir = \ - get_build_model_dirs(library_name, model_name, target_abi, - target_soc, serial_num, - model_config[YAMLKeyword.model_file_path]) - - # build for specified soc - sh_commands.clear_phone_data_dir(serial_num, PHONE_DATA_DIR) - - subgraphs = model_config[YAMLKeyword.subgraphs] - # generate input data - sh_commands.gen_random_input( - model_output_dir, - subgraphs[0][YAMLKeyword.input_tensors], - subgraphs[0][YAMLKeyword.input_shapes], - subgraphs[0][YAMLKeyword.validation_inputs_data], - input_ranges=subgraphs[0][YAMLKeyword.input_ranges], - input_data_types=subgraphs[0][YAMLKeyword.input_data_types]) - - sh_commands.tuning_run( - abi=target_abi, - serialno=serial_num, - target_dir=build_tmp_binary_dir, - target_name=mace_run_name, - vlog_level=0, - embed_model_data=embed_model_data, - model_output_dir=model_output_dir, - input_nodes=subgraphs[0][YAMLKeyword.input_tensors], - output_nodes=subgraphs[0][YAMLKeyword.output_tensors], - input_shapes=subgraphs[0][YAMLKeyword.input_shapes], - output_shapes=subgraphs[0][YAMLKeyword.output_shapes], - mace_model_dir=mace_model_dir, - model_tag=model_name, - device_type=DeviceType.GPU, - running_round=0, - restart_round=1, - 
limit_opencl_kernel_time=model_config[YAMLKeyword.limit_opencl_kernel_time], # noqa - tuning=True, - out_of_range_check=False, - phone_data_dir=PHONE_DATA_DIR, - model_graph_format=model_graph_format, - opencl_binary_file="", - opencl_parameter_file="", - libmace_dynamic_library_path=LIBMACE_DYNAMIC_PATH, - link_dynamic=link_dynamic, - ) - # pull opencl binary - sh_commands.pull_file_from_device( - serial_num, - DEVICE_INTERIOR_DIR, - CL_COMPILED_BINARY_FILE_NAME, - "%s/%s" % (model_output_dir, BUILD_TMP_OPENCL_BIN_DIR)) - - # pull opencl parameter - sh_commands.pull_file_from_device( - serial_num, - PHONE_DATA_DIR, - CL_TUNED_PARAMETER_FILE_NAME, - "%s/%s" % (model_output_dir, BUILD_TMP_OPENCL_BIN_DIR)) - - six.print_('Tuning done\n') - - -def run_specific_target(flags, configs, target_abi, - target_soc, serial_num): - library_name = configs[YAMLKeyword.library_name] - mace_lib_type = flags.mace_lib_type - embed_model_data = \ - configs[YAMLKeyword.model_data_format] == ModelFormat.code - build_tmp_binary_dir = get_build_binary_dir(library_name, target_abi) - - # get target name for run - if flags.example: - if mace_lib_type == MACELibType.static: - target_name = EXAMPLE_STATIC_NAME - else: - target_name = EXAMPLE_DYNAMIC_NAME - else: - if mace_lib_type == MACELibType.static: - target_name = MACE_RUN_STATIC_NAME - else: - target_name = MACE_RUN_DYNAMIC_NAME - - link_dynamic = mace_lib_type == MACELibType.dynamic - model_output_dirs = [] - - for model_name in configs[YAMLKeyword.models]: - check_model_converted(library_name, model_name, - configs[YAMLKeyword.model_graph_format], - configs[YAMLKeyword.model_data_format], - target_abi) - if target_abi == ABIType.host: - device_name = ABIType.host - else: - device_name = \ - sh_commands.adb_get_device_name_by_serialno(serial_num) - sh_commands.clear_phone_data_dir(serial_num, PHONE_DATA_DIR) - - MaceLogger.header( - StringFormatter.block( - "Run model %s on %s" % (model_name, device_name))) - - model_config = 
configs[YAMLKeyword.models][model_name] - model_runtime = model_config[YAMLKeyword.runtime] - subgraphs = model_config[YAMLKeyword.subgraphs] - - if not configs[YAMLKeyword.target_socs] or target_abi == ABIType.host: - model_output_base_dir, model_output_dir, mace_model_dir = \ - get_build_model_dirs(library_name, model_name, target_abi, - None, None, - model_config[YAMLKeyword.model_file_path]) - else: - model_output_base_dir, model_output_dir, mace_model_dir = \ - get_build_model_dirs(library_name, model_name, target_abi, - target_soc, serial_num, - model_config[YAMLKeyword.model_file_path]) - # clear temp model output dir - if os.path.exists(model_output_dir): - sh.rm("-rf", model_output_dir) - os.makedirs(model_output_dir) - - is_tuned = False - model_opencl_output_bin_path = "" - model_opencl_parameter_path = "" - # tuning for specified soc - if not flags.address_sanitizer \ - and not flags.example \ - and target_abi != ABIType.host \ - and configs[YAMLKeyword.target_socs] \ - and target_soc \ - and model_runtime in [RuntimeType.gpu, RuntimeType.cpu_gpu] \ - and not flags.disable_tuning: - tuning(library_name, model_name, model_config, - configs[YAMLKeyword.model_graph_format], - configs[YAMLKeyword.model_data_format], - target_abi, target_soc, serial_num, - mace_lib_type) - model_output_dirs.append(model_output_dir) - model_opencl_output_bin_path =\ - "%s/%s/%s" % (model_output_dir, - BUILD_TMP_OPENCL_BIN_DIR, - CL_COMPILED_BINARY_FILE_NAME) - model_opencl_parameter_path = \ - "%s/%s/%s" % (model_output_dir, - BUILD_TMP_OPENCL_BIN_DIR, - CL_TUNED_PARAMETER_FILE_NAME) - sh_commands.clear_phone_data_dir(serial_num, PHONE_DATA_DIR) - is_tuned = True - elif target_abi != ABIType.host and target_soc: - model_opencl_output_bin_path = get_opencl_binary_output_path( - library_name, target_abi, target_soc, serial_num - ) - model_opencl_parameter_path = get_opencl_parameter_output_path( - library_name, target_abi, target_soc, serial_num - ) - - # generate input data - 
sh_commands.gen_random_input( - model_output_dir, - subgraphs[0][YAMLKeyword.input_tensors], - subgraphs[0][YAMLKeyword.input_shapes], - subgraphs[0][YAMLKeyword.validation_inputs_data], - input_ranges=subgraphs[0][YAMLKeyword.input_ranges], - input_data_types=subgraphs[0][YAMLKeyword.input_data_types]) - - runtime_list = [] - if target_abi == ABIType.host: - runtime_list.extend([RuntimeType.cpu]) - elif model_runtime == RuntimeType.cpu_gpu: - runtime_list.extend([RuntimeType.cpu, RuntimeType.gpu]) - else: - runtime_list.extend([model_runtime]) - for runtime in runtime_list: - device_type = parse_device_type(runtime) - # run for specified soc - if not subgraphs[0][YAMLKeyword.check_tensors]: - output_nodes = subgraphs[0][YAMLKeyword.output_tensors] - output_shapes = subgraphs[0][YAMLKeyword.output_shapes] - else: - output_nodes = subgraphs[0][YAMLKeyword.check_tensors] - output_shapes = subgraphs[0][YAMLKeyword.check_shapes] - run_output = sh_commands.tuning_run( - abi=target_abi, - serialno=serial_num, - target_dir=build_tmp_binary_dir, - target_name=target_name, - vlog_level=flags.vlog_level, - embed_model_data=embed_model_data, - model_output_dir=model_output_dir, - input_nodes=subgraphs[0][YAMLKeyword.input_tensors], - output_nodes=output_nodes, - input_shapes=subgraphs[0][YAMLKeyword.input_shapes], - output_shapes=output_shapes, - mace_model_dir=mace_model_dir, - model_tag=model_name, - device_type=device_type, - running_round=flags.round, - restart_round=flags.restart_round, - limit_opencl_kernel_time=model_config[YAMLKeyword.limit_opencl_kernel_time], # noqa - tuning=False, - out_of_range_check=flags.gpu_out_of_range_check, - phone_data_dir=PHONE_DATA_DIR, - model_graph_format=configs[YAMLKeyword.model_graph_format], - omp_num_threads=flags.omp_num_threads, - cpu_affinity_policy=flags.cpu_affinity_policy, - gpu_perf_hint=flags.gpu_perf_hint, - gpu_priority_hint=flags.gpu_priority_hint, - input_dir=flags.input_dir, - output_dir=flags.output_dir, - 
runtime_failure_ratio=flags.runtime_failure_ratio, - address_sanitizer=flags.address_sanitizer, - opencl_binary_file=model_opencl_output_bin_path, - opencl_parameter_file=model_opencl_parameter_path, - libmace_dynamic_library_path=LIBMACE_DYNAMIC_PATH, - link_dynamic=link_dynamic, - quantize_stat=flags.quantize_stat, - ) - if flags.validate: - model_file_path, weight_file_path = get_model_files( - model_config[YAMLKeyword.model_file_path], - model_config[YAMLKeyword.model_sha256_checksum], - BUILD_DOWNLOADS_DIR, - model_config[YAMLKeyword.weight_file_path], - model_config[YAMLKeyword.weight_sha256_checksum]) - - validate_type = device_type - if model_config[YAMLKeyword.quantize] == 1 \ - and device_type == DeviceType.CPU: - validate_type = device_type + "_QUANTIZE" - - sh_commands.validate_model( - abi=target_abi, - serialno=serial_num, - model_file_path=model_file_path, - weight_file_path=weight_file_path, - platform=model_config[YAMLKeyword.platform], - device_type=device_type, - input_nodes=subgraphs[0][YAMLKeyword.input_tensors], - output_nodes=output_nodes, - input_shapes=subgraphs[0][YAMLKeyword.input_shapes], - output_shapes=output_shapes, - model_output_dir=model_output_dir, - phone_data_dir=PHONE_DATA_DIR, - input_data_types=subgraphs[0][YAMLKeyword.input_data_types], # noqa - caffe_env=flags.caffe_env, - validation_threshold=subgraphs[0][YAMLKeyword.validation_threshold][validate_type]) # noqa - if flags.report and flags.round > 0: - tuned = is_tuned and device_type == DeviceType.GPU - report_run_statistics( - run_output, target_abi, serial_num, - model_name, device_type, flags.report_dir, - tuned) - - if model_output_dirs: - opencl_output_bin_path = get_opencl_binary_output_path( - library_name, target_abi, target_soc, serial_num - ) - opencl_parameter_bin_path = get_opencl_parameter_output_path( - library_name, target_abi, target_soc, serial_num - ) - # clear opencl output dir - if os.path.exists(opencl_output_bin_path): - sh.rm('-rf', 
opencl_output_bin_path) - if os.path.exists(opencl_parameter_bin_path): - sh.rm('-rf', opencl_parameter_bin_path) - - # merge all models' OpenCL binaries together - sh_commands.merge_opencl_binaries( - model_output_dirs, CL_COMPILED_BINARY_FILE_NAME, - opencl_output_bin_path) - # merge all models' OpenCL parameters together - sh_commands.merge_opencl_parameters( - model_output_dirs, CL_TUNED_PARAMETER_FILE_NAME, - opencl_parameter_bin_path) - - def print_package_summary(package_path): title = "Library" header = ["key", "value"] @@ -1399,35 +952,38 @@ def run_mace(flags): target_socs = configs[YAMLKeyword.target_socs] if not target_socs or ALL_SOC_TAG in target_socs: - target_socs = sh_commands.adb_get_all_socs() - + device_list = DeviceManager.list_devices(flags.device_yml) + else: + device_list = DeviceManager.list_devices(flags.device_yml) + device_list = [dev for dev in device_list + if dev[YAMLKeyword.target_socs].lower() in target_socs] for target_abi in configs[YAMLKeyword.target_abis]: # build target - if flags.example: - build_example(configs, target_abi, - not flags.disable_openmp, - flags.address_sanitizer, - flags.mace_lib_type) - else: - build_mace_run(configs, target_abi, - not flags.disable_openmp, - flags.address_sanitizer, - flags.mace_lib_type) - - # run - if target_abi == ABIType.host: - run_specific_target(flags, configs, target_abi, None, None) - else: - for target_soc in target_socs: - serial_nums = \ - sh_commands.get_target_socs_serialnos([target_soc]) - mace_check(serial_nums, - ModuleName.RUN, - 'There is no device with soc: ' + target_soc) - for serial_num in serial_nums: - with sh_commands.device_lock(serial_num): - run_specific_target(flags, configs, target_abi, - target_soc, serial_num) + for dev in device_list: + if target_abi in dev[YAMLKeyword.target_abis]: + # get toolchain + toolchain = infer_toolchain(target_abi) + if flags.example: + build_example(configs, + target_abi, + toolchain, + not flags.disable_openmp, + 
flags.mace_lib_type) + else: + build_mace_run(configs, + target_abi, + toolchain, + not flags.disable_openmp, + flags.address_sanitizer, + flags.mace_lib_type) + # run + device = DeviceWrapper(dev) + with device.lock(): + device.run_specify_abi(flags, configs, target_abi) + elif dev[YAMLKeyword.device_name] != SystemType.host: + six.print_('The device with soc %s do not support abi %s' % + (dev[YAMLKeyword.target_socs], target_abi), + file=sys.stderr) # package the output files package_path = sh_commands.packaging_lib(BUILD_OUTPUT_DIR, @@ -1438,7 +994,11 @@ def run_mace(flags): ################################ # benchmark model ################################ -def build_benchmark_model(configs, target_abi, enable_openmp, mace_lib_type): +def build_benchmark_model(configs, + target_abi, + toolchain, + enable_openmp, + mace_lib_type): library_name = configs[YAMLKeyword.library_name] hexagon_mode = get_hexagon_mode(configs) @@ -1459,6 +1019,7 @@ def build_benchmark_model(configs, target_abi, enable_openmp, mace_lib_type): sh_commands.bazel_build(benchmark_target, abi=target_abi, + toolchain=toolchain, enable_openmp=enable_openmp, enable_opencl=get_opencl_mode(configs), enable_quantize=get_quantize_mode(configs), @@ -1475,105 +1036,6 @@ def build_benchmark_model(configs, target_abi, enable_openmp, mace_lib_type): sh.cp("-f", target_bin, build_tmp_binary_dir) -def bm_specific_target(flags, configs, target_abi, target_soc, serial_num): - library_name = configs[YAMLKeyword.library_name] - embed_model_data = \ - configs[YAMLKeyword.model_data_format] == ModelFormat.code - opencl_output_bin_path = "" - opencl_parameter_path = "" - link_dynamic = flags.mace_lib_type == MACELibType.dynamic - - if link_dynamic: - bm_model_binary_name = BM_MODEL_DYNAMIC_NAME - else: - bm_model_binary_name = BM_MODEL_STATIC_NAME - build_tmp_binary_dir = get_build_binary_dir(library_name, target_abi) - - if configs[YAMLKeyword.target_socs] and target_abi != ABIType.host: - opencl_output_bin_path 
= get_opencl_binary_output_path( - library_name, target_abi, target_soc, serial_num - ) - opencl_parameter_path = get_opencl_parameter_output_path( - library_name, target_abi, target_soc, serial_num - ) - - for model_name in configs[YAMLKeyword.models]: - check_model_converted(library_name, model_name, - configs[YAMLKeyword.model_graph_format], - configs[YAMLKeyword.model_data_format], - target_abi) - if target_abi == ABIType.host: - device_name = ABIType.host - else: - device_name = \ - sh_commands.adb_get_device_name_by_serialno(serial_num) - MaceLogger.header( - StringFormatter.block( - "Benchmark model %s on %s" % (model_name, device_name))) - model_config = configs[YAMLKeyword.models][model_name] - model_runtime = model_config[YAMLKeyword.runtime] - subgraphs = model_config[YAMLKeyword.subgraphs] - - if not configs[YAMLKeyword.target_socs] or target_abi == ABIType.host: - model_output_base_dir, model_output_dir, mace_model_dir = \ - get_build_model_dirs(library_name, model_name, target_abi, - None, None, - model_config[YAMLKeyword.model_file_path]) - else: - model_output_base_dir, model_output_dir, mace_model_dir = \ - get_build_model_dirs(library_name, model_name, target_abi, - target_soc, serial_num, - model_config[YAMLKeyword.model_file_path]) - if os.path.exists(model_output_dir): - sh.rm("-rf", model_output_dir) - os.makedirs(model_output_dir) - - if target_abi != ABIType.host: - sh_commands.clear_phone_data_dir(serial_num, PHONE_DATA_DIR) - - sh_commands.gen_random_input( - model_output_dir, - subgraphs[0][YAMLKeyword.input_tensors], - subgraphs[0][YAMLKeyword.input_shapes], - subgraphs[0][YAMLKeyword.validation_inputs_data], - input_ranges=subgraphs[0][YAMLKeyword.input_ranges], - input_data_types=subgraphs[0][YAMLKeyword.input_data_types]) - runtime_list = [] - if target_abi == ABIType.host: - runtime_list.extend([RuntimeType.cpu]) - elif model_runtime == RuntimeType.cpu_gpu: - runtime_list.extend([RuntimeType.cpu, RuntimeType.gpu]) - else: - 
runtime_list.extend([model_runtime]) - for runtime in runtime_list: - device_type = parse_device_type(runtime) - sh_commands.benchmark_model( - abi=target_abi, - serialno=serial_num, - benchmark_binary_dir=build_tmp_binary_dir, - benchmark_binary_name=bm_model_binary_name, - vlog_level=0, - embed_model_data=embed_model_data, - model_output_dir=model_output_dir, - input_nodes=subgraphs[0][YAMLKeyword.input_tensors], - output_nodes=subgraphs[0][YAMLKeyword.output_tensors], - input_shapes=subgraphs[0][YAMLKeyword.input_shapes], - output_shapes=subgraphs[0][YAMLKeyword.output_shapes], - mace_model_dir=mace_model_dir, - model_tag=model_name, - device_type=device_type, - phone_data_dir=PHONE_DATA_DIR, - model_graph_format=configs[YAMLKeyword.model_graph_format], - omp_num_threads=flags.omp_num_threads, - cpu_affinity_policy=flags.cpu_affinity_policy, - gpu_perf_hint=flags.gpu_perf_hint, - gpu_priority_hint=flags.gpu_priority_hint, - opencl_binary_file=opencl_output_bin_path, - opencl_parameter_file=opencl_parameter_path, - libmace_dynamic_library_path=LIBMACE_DYNAMIC_PATH, - link_dynamic=link_dynamic) - - def benchmark_model(flags): configs = format_model_config(flags) @@ -1581,27 +1043,30 @@ def benchmark_model(flags): target_socs = configs[YAMLKeyword.target_socs] if not target_socs or ALL_SOC_TAG in target_socs: - target_socs = sh_commands.adb_get_all_socs() + device_list = DeviceManager.list_devices(flags.device_yml) + # target_socs = sh_commands.adb_get_all_socs() + else: + device_list = DeviceManager.list_devices(flags.device_yml) + device_list = [dev for dev in device_list + if dev[YAMLKeyword.target_socs] in target_socs] for target_abi in configs[YAMLKeyword.target_abis]: # build benchmark_model binary - build_benchmark_model(configs, target_abi, - not flags.disable_openmp, - flags.mace_lib_type) - - if target_abi == ABIType.host: - bm_specific_target(flags, configs, target_abi, None, None) - else: - for target_soc in target_socs: - serial_nums = \ - 
sh_commands.get_target_socs_serialnos([target_soc]) - mace_check(serial_nums, - ModuleName.BENCHMARK, - 'There is no device with soc: ' + target_soc) - for serial_num in serial_nums: - with sh_commands.device_lock(serial_num): - bm_specific_target(flags, configs, target_abi, - target_soc, serial_num) + for dev in device_list: + if target_abi in dev[YAMLKeyword.target_abis]: + toolchain = infer_toolchain(target_abi) + build_benchmark_model(configs, + target_abi, + toolchain, + not flags.disable_openmp, + flags.mace_lib_type) + device = DeviceWrapper(dev) + with device.lock(): + device.bm_specific_target(flags, configs, target_abi) + else: + six.print_('There is no abi %s with soc %s' % + (target_abi, dev[YAMLKeyword.target_socs]), + file=sys.stderr) ################################ @@ -1698,7 +1163,12 @@ def parse_args(): type=int, default=DefaultValues.gpu_priority_hint, help="0:DEFAULT/1:LOW/2:NORMAL/3:HIGH") - + run_bm_parent_parser.add_argument( + "--device_yml", + type=str, + default='', + help='embedded linux device config yml file' + ) parser = argparse.ArgumentParser() subparsers = parser.add_subparsers() convert = subparsers.add_parser( diff --git a/tools/device.py b/tools/device.py new file mode 100644 index 00000000..d04cfa64 --- /dev/null +++ b/tools/device.py @@ -0,0 +1,1004 @@ +# Copyright 2018 Xiaomi, Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import sys +import socket +import subprocess +import time + +import six +import sh +import yaml + +import common +from common import * + +import sh_commands + + +class DeviceWrapper: + allow_scheme = ('ssh', 'adb') + + def __init__(self, device_dict): + """ + init device with device dict info + :type device_dict: Device + :param device_dict: a key-value dict that holds the device information, + which attribute has: + target_abis, target_socs, models, system, address + password, username + """ + diff = set(device_dict.keys()) - set(YAMLKeyword.__dict__.keys()) + if len(diff) > 0: + six.print_('Wrong key detected: ') + six.print_(diff) + raise KeyError(str(diff)) + self.__dict__.update(device_dict) + if self.system == SystemType.android: + self.data_dir = PHONE_DATA_DIR + self.interior_dir = self.data_dir + '/interior' + elif self.system == SystemType.arm_linux: + try: + sh.ssh('-q', '{}@{}'.format(self.username, self.address), + 'exit') + except sh.ErrorReturnCode as e: + six.print_('device connect failed, ' + 'please check your authentication') + raise e + self.data_dir = DEVICE_DATA_DIR + self.interior_dir = self.data_dir + '/interior' + + ################## + # internal use # + ################## + + def exec_command(self, command, *args, **kwargs): + if self.system == SystemType.android: + sh.adb('-s', self.address, 'shell', command, *args, **kwargs) + elif self.system == SystemType.arm_linux: + sh.ssh('{}@{}'.format(self.username, self.address), + command, *args, **kwargs) + + ##################### + # public interface # + ##################### + + def is_lock(self): + return sh_commands.is_device_locked(self.address) + + def lock(self): + return sh_commands.device_lock(self.address) + + def clear_data_dir(self): + if self.system == SystemType.android: + sh_commands.clear_phone_data_dir(self.address, PHONE_DATA_DIR) + elif self.system == SystemType.arm_linux: + self.exec_command('rm -rf {}'.format(self.data_dir)) + + def pull_from_data_dir(self, 
filename, dst_path): + if self.system == SystemType.android: + self.pull(PHONE_DATA_DIR, filename, dst_path) + elif self.system == SystemType.arm_linux: + self.pull(DEVICE_DATA_DIR, filename, dst_path) + + def create_internal_storage_dir(self): + internal_storage_dir = '{}/interior/'.format(self.data_dir) + if self.system == SystemType.android: + sh_commands.create_internal_storage_dir(self.address, + internal_storage_dir) + elif self.system == SystemType.arm_linux: + self.exec_command('mkdir -p {}'.format(internal_storage_dir)) + return internal_storage_dir + + def rm(self, file): + if self.system == SystemType.android: + sh.adb('-s', self.address, 'shell', 'rm', '-rf', file, _fg=True) + elif self.system == SystemType.arm_linux: + self.exec_command('rm -rf {}'.format(file), _fg=True) + + def push(self, src_path, dst_path): + mace_check(os.path.exists(src_path), "Device", + '{} not found'.format(src_path)) + if self.system == SystemType.android: + sh_commands.adb_push(src_path, dst_path, self.address) + elif self.system == SystemType.arm_linux: + try: + sh.scp(src_path, '{}@{}:{}'.format(self.username, + self.address, + dst_path)) + except sh.ErrorReturnCode_1 as e: + six.print_('Push Failed !', e, file=sys.stderr) + raise e + + def pull(self, src_path, file_name, dst_path='.'): + if not os.path.exists(dst_path): + sh.mkdir("-p", dst_path) + src_file = "%s/%s" % (src_path, file_name) + dst_file = "%s/%s" % (dst_path, file_name) + if os.path.exists(dst_file): + sh.rm('-f', dst_file) + if self.system == SystemType.android: + sh_commands.adb_pull( + src_file, dst_file, self.address) + elif self.system == SystemType.arm_linux: + try: + sh.scp('-r', '%s@%s:%s' % (self.username, + self.address, + src_file), + dst_file) + print("pull file ", src_path, dst_path) + except sh.ErrorReturnCode_1 as e: + six.print_("Pull Failed !", file=sys.stderr) + raise e + + def tuning_run(self, + abi, + target_dir, + target_name, + vlog_level, + embed_model_data, + model_output_dir, + 
input_nodes, + output_nodes, + input_shapes, + output_shapes, + mace_model_dir, + model_tag, + device_type, + running_round, + restart_round, + limit_opencl_kernel_time, + tuning, + out_of_range_check, + model_graph_format, + opencl_binary_file, + opencl_parameter_file, + libmace_dynamic_library_path, + omp_num_threads=-1, + cpu_affinity_policy=1, + gpu_perf_hint=3, + gpu_priority_hint=3, + input_file_name='model_input', + output_file_name='model_out', + runtime_failure_ratio=0.0, + address_sanitizer=False, + link_dynamic=False + ): + six.print_("* Run '%s' with round=%s, restart_round=%s, tuning=%s, " + "out_of_range_check=%s, omp_num_threads=%s, " + "cpu_affinity_policy=%s, gpu_perf_hint=%s, " + "gpu_priority_hint=%s" % + (model_tag, running_round, restart_round, str(tuning), + str(out_of_range_check), omp_num_threads, + cpu_affinity_policy, gpu_perf_hint, gpu_priority_hint)) + mace_model_path = "" + if model_graph_format == ModelFormat.file: + mace_model_path = "%s/%s.pb" % (mace_model_dir, model_tag) + if self.system == SystemType.host: + libmace_dynamic_lib_path = \ + os.path.dirname(libmace_dynamic_library_path) + p = subprocess.Popen( + [ + "env", + "LD_LIBRARY_PATH=%s" % libmace_dynamic_lib_path, + "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level, + "MACE_RUNTIME_FAILURE_RATIO=%f" % runtime_failure_ratio, + "%s/%s" % (target_dir, target_name), + "--model_name=%s" % model_tag, + "--input_node=%s" % ",".join(input_nodes), + "--output_node=%s" % ",".join(output_nodes), + "--input_shape=%s" % ":".join(input_shapes), + "--output_shape=%s" % ":".join(output_shapes), + "--input_file=%s/%s" % (model_output_dir, + input_file_name), + "--output_file=%s/%s" % (model_output_dir, + output_file_name), + "--model_data_file=%s/%s.data" % (mace_model_dir, + model_tag), + "--device=%s" % device_type, + "--round=%s" % running_round, + "--restart_round=%s" % restart_round, + "--omp_num_threads=%s" % omp_num_threads, + "--cpu_affinity_policy=%s" % cpu_affinity_policy, + 
"--gpu_perf_hint=%s" % gpu_perf_hint, + "--gpu_priority_hint=%s" % gpu_priority_hint, + "--model_file=%s" % mace_model_path, + ], + stderr=subprocess.PIPE, + stdout=subprocess.PIPE) + out, err = p.communicate() + self.stdout = err + out + six.print_(self.stdout) + six.print_("Running finished!\n") + elif self.system in [SystemType.android, SystemType.arm_linux]: + self.rm(self.data_dir) + self.exec_command('mkdir -p {}'.format(self.data_dir)) + internal_storage_dir = self.create_internal_storage_dir() + + for input_name in input_nodes: + formatted_name = common.formatted_file_name(input_file_name, + input_name) + self.push("%s/%s" % (model_output_dir, formatted_name), + self.data_dir) + if self.system == SystemType.android and address_sanitizer: + self.push(sh_commands.find_asan_rt_library(abi), + self.data_dir) + + if not embed_model_data: + model_data_path = "%s/%s.data" % (mace_model_dir, model_tag) + mace_check(os.path.exists(model_data_path), "Device", + 'model data file not found,' + ' please convert model first') + self.push(model_data_path, self.data_dir) + + if device_type == common.DeviceType.GPU: + if os.path.exists(opencl_binary_file): + self.push(opencl_binary_file, self.data_dir) + if os.path.exists(opencl_parameter_file): + self.push(opencl_parameter_file, self.data_dir) + + self.push("third_party/nnlib/libhexagon_controller.so", + self.data_dir) + + mace_model_phone_path = "" + if model_graph_format == ModelFormat.file: + mace_model_phone_path = "%s/%s.pb" % (self.data_dir, + model_tag) + self.push(mace_model_path, + mace_model_phone_path) + if link_dynamic: + self.push(libmace_dynamic_library_path, self.data_dir) + self.push("%s/%s" % (target_dir, target_name), self.data_dir) + + stdout_buff = [] + process_output = sh_commands.make_output_processor(stdout_buff) + cmd = [ + "LD_LIBRARY_PATH=%s" % self.data_dir, + "MACE_TUNING=%s" % int(tuning), + "MACE_OUT_OF_RANGE_CHECK=%s" % int(out_of_range_check), + "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level, + 
"MACE_RUN_PARAMETER_PATH=%s/mace_run.config" % self.data_dir, + "MACE_INTERNAL_STORAGE_PATH=%s" % internal_storage_dir, + "MACE_LIMIT_OPENCL_KERNEL_TIME=%s" % limit_opencl_kernel_time, + "MACE_RUNTIME_FAILURE_RATIO=%f" % runtime_failure_ratio, + ] + if self.system == SystemType.android and address_sanitizer: + cmd.extend([ + "LD_PRELOAD=%s/%s" % + (self.data_dir, + sh_commands.asan_rt_library_names(abi)) + ]) + cmd.extend([ + "%s/%s" % (self.data_dir, target_name), + "--model_name=%s" % model_tag, + "--input_node=%s" % ",".join(input_nodes), + "--output_node=%s" % ",".join(output_nodes), + "--input_shape=%s" % ":".join(input_shapes), + "--output_shape=%s" % ":".join(output_shapes), + "--input_file=%s/%s" % (self.data_dir, input_file_name), + "--output_file=%s/%s" % (self.data_dir, output_file_name), + "--model_data_file=%s/%s.data" % (self.data_dir, model_tag), + "--device=%s" % device_type, + "--round=%s" % running_round, + "--restart_round=%s" % restart_round, + "--omp_num_threads=%s" % omp_num_threads, + "--cpu_affinity_policy=%s" % cpu_affinity_policy, + "--gpu_perf_hint=%s" % gpu_perf_hint, + "--gpu_priority_hint=%s" % gpu_priority_hint, + "--model_file=%s" % mace_model_phone_path, + "--opencl_binary_file=%s/%s" % + (self.data_dir, os.path.basename(opencl_binary_file)), + "--opencl_parameter_file=%s/%s" % + (self.data_dir, os.path.basename(opencl_parameter_file)), + ]) + cmd = ' '.join(cmd) + cmd_file_name = "%s-%s-%s" % ('cmd_file', + model_tag, + str(time.time())) + cmd_file = "%s/%s" % (self.data_dir, cmd_file_name) + tmp_cmd_file = "%s/%s" % ('/tmp', cmd_file_name) + with open(tmp_cmd_file, 'w') as file: + file.write(cmd) + self.push(tmp_cmd_file, cmd_file) + os.remove(tmp_cmd_file) + self.exec_command('sh {}'.format(cmd_file), + _tty_in=True, + _out=process_output, + _err_to_out=True) + self.stdout = "".join(stdout_buff) + if not sh_commands.stdout_success(self.stdout): + common.MaceLogger.error("Mace Run", "Mace run failed.") + + six.print_("Running 
finished!\n") + else: + six.print_('Unsupported system %s' % self.system, file=sys.stderr) + raise Exception('Wrong device') + + return self.stdout + + def tuning(self, library_name, model_name, model_config, + model_graph_format, model_data_format, + target_abi, mace_lib_type): + six.print_('* Tuning, it may take some time') + build_tmp_binary_dir = get_build_binary_dir(library_name, target_abi) + mace_run_name = MACE_RUN_STATIC_NAME + link_dynamic = False + if mace_lib_type == MACELibType.dynamic: + mace_run_name = MACE_RUN_DYNAMIC_NAME + link_dynamic = True + embed_model_data = model_data_format == ModelFormat.code + + # build for specified soc + # device_wrapper = DeviceWrapper(device) + + model_output_base_dir, model_output_dir, mace_model_dir = \ + get_build_model_dirs( + library_name, model_name, target_abi, self, + model_config[YAMLKeyword.model_file_path]) + + self.clear_data_dir() + + subgraphs = model_config[YAMLKeyword.subgraphs] + # generate input data + sh_commands.gen_random_input( + model_output_dir, + subgraphs[0][YAMLKeyword.input_tensors], + subgraphs[0][YAMLKeyword.input_shapes], + subgraphs[0][YAMLKeyword.validation_inputs_data], + input_ranges=subgraphs[0][YAMLKeyword.input_ranges], + input_data_types=subgraphs[0][YAMLKeyword.input_data_types] + ) + + self.tuning_run( + abi=target_abi, + target_dir=build_tmp_binary_dir, + target_name=mace_run_name, + vlog_level=0, + embed_model_data=embed_model_data, + model_output_dir=model_output_dir, + input_nodes=subgraphs[0][YAMLKeyword.input_tensors], + output_nodes=subgraphs[0][YAMLKeyword.output_tensors], + input_shapes=subgraphs[0][YAMLKeyword.input_shapes], + output_shapes=subgraphs[0][YAMLKeyword.output_shapes], + mace_model_dir=mace_model_dir, + model_tag=model_name, + device_type=DeviceType.GPU, + running_round=0, + restart_round=1, + limit_opencl_kernel_time=model_config[ + YAMLKeyword.limit_opencl_kernel_time], + tuning=True, + out_of_range_check=False, + model_graph_format=model_graph_format, + 
opencl_binary_file='', + opencl_parameter_file='', + libmace_dynamic_library_path=LIBMACE_DYNAMIC_PATH, + link_dynamic=link_dynamic, + ) + + # pull opencl library + self.pull(self.interior_dir, CL_COMPILED_BINARY_FILE_NAME, + '{}/{}'.format(model_output_dir, + BUILD_TMP_OPENCL_BIN_DIR)) + + # pull opencl parameter + self.pull_from_data_dir(CL_TUNED_PARAMETER_FILE_NAME, + '{}/{}'.format(model_output_dir, + BUILD_TMP_OPENCL_BIN_DIR)) + + six.print_('Tuning done! \n') + + def run_specify_abi(self, flags, configs, target_abi): + if target_abi not in self.target_abis: + six.print_('There is no device with soc: %s abi: %s' % + (self.target_socs, target_abi)) + return + library_name = configs[YAMLKeyword.library_name] + mace_lib_type = flags.mace_lib_type + embed_model_data = \ + configs[YAMLKeyword.model_data_format] == ModelFormat.code + build_tmp_binary_dir = get_build_binary_dir(library_name, target_abi) + + # get target name for run + if flags.example: + if mace_lib_type == MACELibType.static: + target_name = EXAMPLE_STATIC_NAME + else: + target_name = EXAMPLE_DYNAMIC_NAME + else: + if mace_lib_type == MACELibType.static: + target_name = MACE_RUN_STATIC_NAME + else: + target_name = MACE_RUN_DYNAMIC_NAME + link_dynamic = mace_lib_type == MACELibType.dynamic + model_output_dirs = [] + + for model_name in configs[YAMLKeyword.models]: + check_model_converted(library_name, model_name, + configs[YAMLKeyword.model_graph_format], + configs[YAMLKeyword.model_data_format], + target_abi) + if target_abi == ABIType.host: + device_model = ABIType.host + else: + device_model = self.models + self.clear_data_dir() + MaceLogger.header( + StringFormatter.block( + 'Run model {} on {}'.format(model_name, device_model))) + + model_config = configs[YAMLKeyword.models][model_name] + model_runtime = model_config[YAMLKeyword.runtime] + subgraphs = model_config[YAMLKeyword.subgraphs] + + if not configs[YAMLKeyword.target_socs] \ + or target_abi == ABIType.host: + model_output_base_dir, 
model_output_dir, mace_model_dir = \ + get_build_model_dirs( + library_name, model_name, target_abi, self, + model_config[YAMLKeyword.model_file_path]) + else: + model_output_base_dir, model_output_dir, mace_model_dir = \ + get_build_model_dirs( + library_name, model_name, target_abi, self, + model_config[YAMLKeyword.model_file_path]) + + # clear temp model output dir + if os.path.exists(model_output_dir): + sh.rm('-rf', model_output_dir) + os.makedirs(model_output_dir) + + is_tuned = False + model_opencl_output_bin_path = '' + model_opencl_parameter_path = '' + if not flags.address_sanitizer \ + and not flags.example \ + and target_abi != ABIType.host \ + and configs[YAMLKeyword.target_socs] \ + and self.target_socs \ + and model_runtime in [RuntimeType.gpu, + RuntimeType.cpu_gpu] \ + and not flags.disable_tuning: + self.tuning(library_name, model_name, model_config, + configs[YAMLKeyword.model_graph_format], + configs[YAMLKeyword.model_data_format], + target_abi, mace_lib_type) + model_output_dirs.append(model_output_dir) + model_opencl_output_bin_path = \ + '{}/{}/{}'.format(model_output_dir, + BUILD_TMP_OPENCL_BIN_DIR, + CL_COMPILED_BINARY_FILE_NAME) + model_opencl_parameter_path = \ + '{}/{}/{}'.format(model_output_dir, + BUILD_TMP_OPENCL_BIN_DIR, + CL_TUNED_PARAMETER_FILE_NAME) + self.clear_data_dir() + is_tuned = True + elif target_abi != ABIType.host and self.target_socs: + model_opencl_output_bin_path = get_opencl_binary_output_path( + library_name, target_abi, self + ) + model_opencl_parameter_path = get_opencl_parameter_output_path( + library_name, target_abi, self + ) + sh_commands.gen_random_input( + model_output_dir, + subgraphs[0][YAMLKeyword.input_tensors], + subgraphs[0][YAMLKeyword.input_shapes], + subgraphs[0][YAMLKeyword.validation_inputs_data], + input_ranges=subgraphs[0][YAMLKeyword.input_ranges], + input_data_types=subgraphs[0][YAMLKeyword.input_data_types] + ) + runtime_list = [] + if target_abi == ABIType.host: + 
runtime_list.append(RuntimeType.cpu) + elif model_runtime == RuntimeType.cpu_gpu: + runtime_list.extend([RuntimeType.cpu, RuntimeType.gpu]) + else: + runtime_list.append(model_runtime) + for runtime in runtime_list: + device_type = parse_device_type(runtime) + # run for specified soc + run_output = self.tuning_run( + abi=target_abi, + target_dir=build_tmp_binary_dir, + target_name=target_name, + vlog_level=flags.vlog_level, + embed_model_data=embed_model_data, + model_output_dir=model_output_dir, + input_nodes=subgraphs[0][YAMLKeyword.input_tensors], + output_nodes=subgraphs[0][YAMLKeyword.output_tensors], + input_shapes=subgraphs[0][YAMLKeyword.input_shapes], + output_shapes=subgraphs[0][YAMLKeyword.output_shapes], + mace_model_dir=mace_model_dir, + model_tag=model_name, + device_type=device_type, + running_round=flags.round, + restart_round=flags.restart_round, + limit_opencl_kernel_time=model_config[ + YAMLKeyword.limit_opencl_kernel_time], + tuning=False, + out_of_range_check=flags.gpu_out_of_range_check, + model_graph_format=configs[YAMLKeyword.model_graph_format], + omp_num_threads=flags.omp_num_threads, + cpu_affinity_policy=flags.cpu_affinity_policy, + gpu_perf_hint=flags.gpu_perf_hint, + gpu_priority_hint=flags.gpu_priority_hint, + runtime_failure_ratio=flags.runtime_failure_ratio, + address_sanitizer=flags.address_sanitizer, + opencl_binary_file=model_opencl_output_bin_path, + opencl_parameter_file=model_opencl_parameter_path, + libmace_dynamic_library_path=LIBMACE_DYNAMIC_PATH, + link_dynamic=link_dynamic + ) + if flags.validate: + model_file_path, weight_file_path = get_model_files( + model_config[YAMLKeyword.model_file_path], + model_config[YAMLKeyword.model_sha256_checksum], + BUILD_DOWNLOADS_DIR, + model_config[YAMLKeyword.weight_file_path], + model_config[YAMLKeyword.weight_sha256_checksum] + ) + + validate_type = device_type + if model_config[YAMLKeyword.quantize] == 1: + validate_type = device_type + '_QUANTIZE' + sh_commands.validate_model( + 
abi=target_abi, + device=self, + model_file_path=model_file_path, + weight_file_path=weight_file_path, + platform=model_config[YAMLKeyword.platform], + device_type=device_type, + input_nodes=subgraphs[0][YAMLKeyword.input_tensors], + output_nodes=subgraphs[0][YAMLKeyword.output_tensors], + input_shapes=subgraphs[0][YAMLKeyword.input_shapes], + output_shapes=subgraphs[0][YAMLKeyword.output_shapes], + model_output_dir=model_output_dir, + input_data_types=subgraphs[0][ + YAMLKeyword.input_data_types], + caffe_env=flags.caffe_env, + validation_threshold=subgraphs[0][ + YAMLKeyword.validation_threshold][validate_type] + ) + if flags.report and flags.round > 0: + tuned = is_tuned and device_type == DeviceType.GPU + self.report_run_statistics( + target_abi=target_abi, + model_name=model_name, + device_type=device_type, + output_dir=flags.report_dir, + tuned=tuned + ) + if model_output_dirs: + opencl_output_bin_path = get_opencl_binary_output_path( + library_name, target_abi, self + ) + opencl_parameter_bin_path = get_opencl_parameter_output_path( + library_name, target_abi, self + ) + + # clear opencl output dir + if os.path.exists(opencl_output_bin_path): + sh.rm('-rf', opencl_output_bin_path) + if os.path.exists(opencl_parameter_bin_path): + sh.rm('-rf', opencl_parameter_bin_path) + + # merge all model's opencl binaries together + sh_commands.merge_opencl_binaries( + model_output_dirs, CL_COMPILED_BINARY_FILE_NAME, + opencl_output_bin_path + ) + # merge all model's opencl parameter together + sh_commands.merge_opencl_parameters( + model_output_dirs, CL_TUNED_PARAMETER_FILE_NAME, + opencl_parameter_bin_path + ) + + def report_run_statistics(self, + target_abi, + model_name, + device_type, + output_dir, + tuned): + metrics = [0] * 3 + for line in self.stdout.split('\n'): + line = line.strip() + parts = line.split() + if len(parts) == 4 and parts[0].startswith('time'): + metrics[0] = str(float(parts[1])) + metrics[1] = str(float(parts[2])) + metrics[2] = 
str(float(parts[3])) + break + report_filename = output_dir + '/report.csv' + if not os.path.exists(report_filename): + with open(report_filename, 'w') as f: + f.write('model_name,device_name,soc,abi,runtime,' + 'init(ms),warmup(ms),run_avg(ms),tuned\n') + + data_str = '{model_name},{device_name},{soc},{abi},{device_type},' \ + '{init},{warmup},{run_avg},{tuned}\n'.format( + model_name=model_name, + device_name=self.models, + soc=self.target_socs, + abi=target_abi, + device_type=device_type, + init=metrics[0], + warmup=metrics[1], + run_avg=metrics[2], + tuned=tuned) + with open(report_filename, 'a') as f: + f.write(data_str) + + def benchmark_model(self, + abi, + benchmark_binary_dir, + benchmark_binary_name, + vlog_level, + embed_model_data, + model_output_dir, + mace_model_dir, + input_nodes, + output_nodes, + input_shapes, + output_shapes, + model_tag, + device_type, + model_graph_format, + opencl_binary_file, + opencl_parameter_file, + libmace_dynamic_library_path, + omp_num_threads=-1, + cpu_affinity_policy=1, + gpu_perf_hint=3, + gpu_priority_hint=3, + input_file_name='model_input', + link_dynamic=False): + six.print_('* Benchmark for %s' % model_tag) + + mace_model_path = '' + if model_graph_format == ModelFormat.file: + mace_model_path = '%s/%s.pb' % (mace_model_dir, model_tag) + if abi == 'host': + libmace_dynamic_lib_dir_path = \ + os.path.dirname(libmace_dynamic_library_path) + p = subprocess.Popen( + [ + 'env', + 'LD_LIBRARY_PATH=%s' % libmace_dynamic_lib_dir_path, + 'MACE_CPP_MIN_VLOG_LEVEL=%s' % vlog_level, + '%s/%s' % (benchmark_binary_dir, benchmark_binary_name), + '--model_name=%s' % model_tag, + '--input_node=%s' % ','.join(input_nodes), + '--output_node=%s' % ','.join(output_nodes), + '--input_shape=%s' % ':'.join(input_shapes), + '--output_shapes=%s' % ':'.join(output_shapes), + '--input_file=%s/%s' % (model_output_dir, input_file_name), + '--model_data_file=%s/%s.data' % (mace_model_dir, + model_tag), + '--device=%s' % device_type, + 
'--omp_num_threads=%s' % omp_num_threads, + '--cpu_addinity_policy=%s' % cpu_affinity_policy, + '--gpu_perf_hint=%s' % gpu_perf_hint, + '--gpu_priority_hint=%s' % gpu_priority_hint, + '--model_file=%s' % mace_model_path + ]) + p.wait() + elif self.system in [SystemType.android, SystemType.arm_linux]: + self.exec_command('mkdir -p %s' % self.data_dir) + internal_storage_dir = self.create_internal_storage_dir() + for input_name in input_nodes: + formatted_name = formatted_file_name(input_file_name, + input_name) + self.push('%s/%s' % (model_output_dir, formatted_name), + self.data_dir) + if not embed_model_data: + self.push('%s/%s.data' % (mace_model_dir, model_tag), + self.data_dir) + if device_type == common.DeviceType.GPU: + if os.path.exists(opencl_binary_file): + self.push(opencl_binary_file, self.data_dir) + if os.path.exists(opencl_parameter_file): + self.push(opencl_parameter_file, self.data_dir) + mace_model_device_path = '' + if model_graph_format == ModelFormat.file: + mace_model_device_path = '%s/%s.pb' % \ + (self.data_dir, model_tag) + self.push(mace_model_path, mace_model_device_path) + if link_dynamic: + self.push(libmace_dynamic_library_path, self.data_dir) + self.rm('%s/%s' % (self.data_dir, benchmark_binary_name)) + self.push('%s/%s' % (benchmark_binary_dir, benchmark_binary_name), + self.data_dir) + + cmd = [ + 'LD_LIBRARY_PATH=%s' % self.data_dir, + 'MACE_CPP_MIN_VLOG_LEVEL=%s' % vlog_level, + 'MACE_RUN_PARAMETER_PATH=%s/mace_run.config' % self.data_dir, + 'MACE_INTERNAL_STORAGE_PATH=%s' % internal_storage_dir, + 'MACE_OPENCL_PROFILING=1', + '%s/%s' % (self.data_dir, benchmark_binary_name), + '--model_name=%s' % model_tag, + '--input_node=%s' % ','.join(input_nodes), + '--output_node=%s' % ','.join(output_nodes), + '--input_shape=%s' % ':'.join(input_shapes), + '--output_shape=%s' % ':'.join(output_shapes), + '--input_file=%s/%s' % (self.data_dir, input_file_name), + '--model_data_file=%s/%s.data' % (self.data_dir, model_tag), + '--device=%s' % 
device_type, + '--omp_num_threads=%s' % omp_num_threads, + '--cpu_affinity_policy=%s' % cpu_affinity_policy, + '--gpu_perf_hint=%s' % gpu_perf_hint, + '--gpu_priority_hint=%s' % gpu_priority_hint, + '--model_file=%s' % mace_model_device_path, + '--opencl_binary_file=%s/%s' % + (self.data_dir, os.path.basename(opencl_binary_file)), + '--opencl_parameter_file=%s/%s' % + (self.data_dir, os.path.basename(opencl_parameter_file)) + ] + + cmd = ' '.join(cmd) + cmd_file_name = '%s-%s-%s' % \ + ('cmd_file', model_tag, str(time.time())) + + cmd_file_path = '%s/%s' % (self.data_dir, cmd_file_name) + tmp_cmd_file = '%s/%s' % ('/tmp', cmd_file_name) + with open(tmp_cmd_file, 'w') as f: + f.write(cmd) + self.push(tmp_cmd_file, cmd_file_path) + os.remove(tmp_cmd_file) + + if self.system == SystemType.android: + sh.adb( + '-s', + self.address, + 'shell', + 'sh', + cmd_file_path, + _fg=True + ) + elif self.system == SystemType.arm_linux: + sh.ssh('%s@%s' % (self.username, self.address), + 'sh', + cmd_file_path, + _fg=True) + self.rm(cmd_file_path) + six.print_('Benchmark done! 
\n') + + def bm_specific_target(self, flags, configs, target_abi): + library_name = configs[YAMLKeyword.library_name] + embed_model_data = \ + configs[YAMLKeyword.model_data_format] == ModelFormat.code + opencl_output_bin_path = '' + opencl_parameter_path = '' + link_dynamic = flags.mace_lib_type == MACELibType.dynamic + + if link_dynamic: + bm_model_binary_name = BM_MODEL_DYNAMIC_NAME + else: + bm_model_binary_name = BM_MODEL_STATIC_NAME + build_tmp_binary_dir = get_build_binary_dir(library_name, target_abi) + if configs[YAMLKeyword.target_socs] and target_abi != ABIType.host: + opencl_output_bin_path = get_opencl_binary_output_path( + library_name, target_abi, self + ) + opencl_parameter_path = get_opencl_parameter_output_path( + library_name, target_abi, self + ) + + for model_name in configs[YAMLKeyword.models]: + check_model_converted(library_name, + model_name, + configs[YAMLKeyword.model_graph_format], + configs[YAMLKeyword.model_data_format], + target_abi) + if target_abi == ABIType.host: + device_name = ABIType.host + else: + device_name = self.models + MaceLogger.header( + StringFormatter.block( + 'Benchmark model %s on %s' % (model_name, device_name))) + model_config = configs[YAMLKeyword.models][model_name] + model_runtime = model_config[YAMLKeyword.runtime] + subgraphs = model_config[YAMLKeyword.subgraphs] + + model_output_base_dir, model_output_dir, mace_model_dir = \ + get_build_model_dirs(library_name, model_name, + target_abi, self, + model_config[YAMLKeyword.model_file_path]) + if os.path.exists(model_output_dir): + sh.rm('-rf', model_output_dir) + os.makedirs(model_output_dir) + + if target_abi != ABIType.host: + self.clear_data_dir() + sh_commands.gen_random_input( + model_output_dir, + subgraphs[0][YAMLKeyword.input_tensors], + subgraphs[0][YAMLKeyword.input_shapes], + subgraphs[0][YAMLKeyword.validation_inputs_data], + input_ranges=subgraphs[0][YAMLKeyword.input_ranges], + input_data_types=subgraphs[0][YAMLKeyword.input_data_types] + ) + 
runtime_list = [] + if target_abi == ABIType.host: + runtime_list.append(RuntimeType.cpu) + elif model_runtime == RuntimeType.cpu_gpu: + runtime_list.extend([RuntimeType.cpu, RuntimeType.cpu_gpu]) + else: + runtime_list.append(model_runtime) + for runtime in runtime_list: + device_type = parse_device_type(runtime) + self.benchmark_model( + abi=target_abi, + benchmark_binary_dir=build_tmp_binary_dir, + benchmark_binary_name=bm_model_binary_name, + vlog_level=0, + embed_model_data=embed_model_data, + model_output_dir=model_output_dir, + input_nodes=subgraphs[0][YAMLKeyword.input_tensors], + output_nodes=subgraphs[0][YAMLKeyword.output_tensors], + input_shapes=subgraphs[0][YAMLKeyword.input_shapes], + output_shapes=subgraphs[0][YAMLKeyword.output_shapes], + mace_model_dir=mace_model_dir, + model_tag=model_name, + device_type=device_type, + model_graph_format=configs[YAMLKeyword.model_graph_format], + omp_num_threads=flags.omp_num_threads, + cpu_affinity_policy=flags.cpu_affinity_policy, + gpu_perf_hint=flags.gpu_perf_hint, + gpu_priority_hint=flags.gpu_priority_hint, + opencl_binary_file=opencl_output_bin_path, + opencl_parameter_file=opencl_parameter_path, + libmace_dynamic_library_path=LIBMACE_DYNAMIC_PATH, + link_dynamic=link_dynamic + ) + + def run(self, + abi, + host_bin_path, + bin_name, + args='', + opencl_profiling=True, + vlog_level=0, + out_of_range_check=True, + address_sanitizer=False, + simpleperf=False): + host_bin_full_path = '%s/%s' % (host_bin_path, bin_name) + device_bin_full_path = '%s/%s' % (self.data_dir, bin_name) + print( + '================================================================' + ) + print('Trying to lock device %s' % self.address) + with self.lock(): + print('Run on device: %s, %s, %s' % + (self.address, self.target_socs, self.models)) + self.rm(self.data_dir) + self.exec_command('mkdir -p %s' % self.data_dir) + self.push(host_bin_full_path, device_bin_full_path) + ld_preload = '' + if address_sanitizer: + 
self.push(sh_commands.find_asan_rt_library(abi), + self.data_dir) + ld_preload = 'LD_PRELOAD=%s/%s' % \ + (self.data_dir, + sh_commands.asan_rt_library_names(abi)) + opencl_profiling = 1 if opencl_profiling else 0 + out_of_range_check = 1 if out_of_range_check else 0 + print('Run %s' % device_bin_full_path) + stdout_buf = [] + process_output = sh_commands.make_output_processor(stdout_buf) + + if simpleperf and self.system == SystemType.android: + self.push(sh_commands.find_simpleperf_library(abi), + self.data_dir) + simpleperf_cmd = '%s/simpleperf' % self.data_dir + exec_cmd = [ + ld_preload, + 'MACE_OUT_OF_RANGE_CHECK=%s' % out_of_range_check, + 'MACE_OPENCL_PROFILING=%d' % opencl_profiling, + 'MACE_CPP_MIN_VLOG_LEVEL=%d' % vlog_level, + simpleperf_cmd, + 'stat', + '--group', + 'raw-l1-dcache,raw-l1-dcache-refill', + '--group', + 'raw-l2-dcache,raw-l2-dcache-refill', + '--group', + 'raw-l1-dtlb,raw-l1-dtlb-refill', + '--group', + 'raw-l2-dtlb,raw-l2-dtlb-refill', + device_bin_full_path, + args, + ] + else: + exec_cmd = [ + ld_preload, + 'MACE_OUT_OF_RANGE_CHECK=%d' % out_of_range_check, + 'MACE_OPENCL_PROFILNG=%d' % opencl_profiling, + 'MACE_CPP_MIN_VLOG_LEVEL=%d' % vlog_level, + device_bin_full_path, + args + ] + exec_cmd = ' '.join(exec_cmd) + self.exec_command(exec_cmd, _tty_in=True, + _out=process_output, _err_to_out=True) + return ''.join(stdout_buf) + + +class DeviceManager: + @classmethod + def list_adb_device(cls): + adb_list = sh.adb('devices').stdout.decode('utf-8'). 
\ + strip().split('\n')[1:] + adb_list = [tuple(pair.split('\t')) for pair in adb_list] + devices = [] + for adb in adb_list: + prop = sh_commands.adb_getprop_by_serialno(adb[0]) + android = { + YAMLKeyword.device_name: adb[1], + YAMLKeyword.target_abis: + prop['ro.product.cpu.abilist'].split(','), + YAMLKeyword.target_socs: prop['ro.board.platform'], + YAMLKeyword.models: prop['ro.product.model'].replace(' ', '_'), + YAMLKeyword.system: SystemType.android, + YAMLKeyword.address: adb[0], + YAMLKeyword.username: '', + } + devices.append(android) + return devices + + @classmethod + def list_ssh_device(cls, yml): + with open(yml) as f: + devices = yaml.load(f.read()) + devices = devices['devices'] + device_list = [] + for name, dev in six.iteritems(devices): + dev[YAMLKeyword.device_name] = name + dev[YAMLKeyword.system] = SystemType.arm_linux + dev[YAMLKeyword.models] = dev[YAMLKeyword.models].replace(' ', '_') + device_list.append(dev) + return device_list + + @classmethod + def list_devices(cls, yml): + devices_list = [] + devices_list.extend(cls.list_adb_device()) + if not yml: + if os.path.exists('devices.yml'): + devices_list.extend(cls.list_ssh_device('devices.yml')) + else: + if os.path.exists(yml): + devices_list.extend(cls.list_ssh_device(yml)) + else: + MaceLogger.error(ModuleName.RUN, + 'no ARM linux device config file found') + host = { + YAMLKeyword.device_name: SystemType.host, + YAMLKeyword.target_abis: [ABIType.host], + YAMLKeyword.target_socs: '', + YAMLKeyword.system: SystemType.host, + YAMLKeyword.models: None, + YAMLKeyword.address: None, + + } + devices_list.append(host) + return devices_list + + +if __name__ == '__main__': + pass diff --git a/tools/image/image_to_tensor.py b/tools/image/image_to_tensor.py index d39c07c3..8dabe6db 100644 --- a/tools/image/image_to_tensor.py +++ b/tools/image/image_to_tensor.py @@ -1,6 +1,9 @@ import argparse import os import sys + +import six + import tensorflow as tf # TODO(liyin): use dataset api and estimator 
with distributed strategy @@ -70,7 +73,7 @@ def images_to_tensors(input_files, image_shape, mean_values=None): def main(unused_args): if not os.path.exists(FLAGS.input): - print ("input does not exist: %s" % FLAGS.input) + print("input does not exist: %s" % FLAGS.input) sys.exit(-1) input_files = [] diff --git a/tools/image/tensor_to_image.py b/tools/image/tensor_to_image.py index ce18628e..e04dde7e 100644 --- a/tools/image/tensor_to_image.py +++ b/tools/image/tensor_to_image.py @@ -1,6 +1,9 @@ import argparse import os import sys + +import six + import numpy as np import tensorflow as tf @@ -53,7 +56,7 @@ def tensors_to_images(input_files, image_shape): def main(unused_args): if not os.path.exists(FLAGS.input): - print ("input does not exist: %s" % FLAGS.input) + print("input does not exist: %s" % FLAGS.input) sys.exit(-1) input_files = [] diff --git a/tools/sh_commands.py b/tools/sh_commands.py index 601f5b2c..38f2b60a 100644 --- a/tools/sh_commands.py +++ b/tools/sh_commands.py @@ -23,13 +23,16 @@ import struct import subprocess import sys import time -import urllib import platform -from enum import Enum import six import common +from common import ModelFormat +from common import ABIType +from common import SystemType +from common import YAMLKeyword +from common import abi_to_internal sys.path.insert(0, "mace/python/tools") try: @@ -89,11 +92,6 @@ class BuildType(object): code = 'code' -class ModelFormat(object): - file = 'file' - code = 'code' - - def stdout_success(stdout): stdout_lines = stdout.split("\n") for line in stdout_lines: @@ -190,7 +188,7 @@ def adb_pull(src_path, dst_path, serialno): try: sh.adb("-s", serialno, "pull", src_path, dst_path) except Exception as e: - six.print_("Error msg: %s" % e.stderr) + six.print_("Error msg: %s" % e, file=sys.stderr) def adb_run(abi, @@ -293,7 +291,7 @@ def find_asan_rt_library(abi, asan_rt_path=''): if len(candidates) == 0: common.MaceLogger.error( "Toolchain", - "Can't find AddressSanitizer runtime library in % 
s" % + "Can't find AddressSanitizer runtime library in %s" % find_path) elif len(candidates) > 1: common.MaceLogger.info( @@ -338,6 +336,7 @@ def find_simpleperf_library(abi, simpleperf_path=''): ################################ def bazel_build(target, abi="armeabi-v7a", + toolchain='android', hexagon_mode=False, enable_openmp=True, enable_neon=True, @@ -361,8 +360,8 @@ def bazel_build(target, "build", target, "--config", - "android", - "--cpu=%s" % abi, + toolchain, + "--cpu=%s" % abi_to_internal(abi), "--define", "neon=%s" % str(enable_neon).lower(), "--define", @@ -694,230 +693,20 @@ def push_depended_so_libs(libmace_dynamic_library_path, for dep in split_stdout(dep_so_libs): if dep == "libgnustl_shared.so": adb_push( - "%s/sources/cxx-stl/gnu-libstdc++/4.9/libs/%s/libgnustl_shared.so" # noqa - % (os.environ["ANDROID_NDK_HOME"], abi), - phone_data_dir, - serialno) + "%s/sources/cxx-stl/gnu-libstdc++/4.9/libs/%s/libgnustl_shared.so" # noqa + % (os.environ["ANDROID_NDK_HOME"], abi), + phone_data_dir, + serialno) elif dep == "libc++_shared.so": adb_push( - "%s/sources/cxx-stl/llvm-libc++/libs/%s/libc++_shared.so" # noqa - % (os.environ["ANDROID_NDK_HOME"], abi), - phone_data_dir, - serialno) - - -def tuning_run(abi, - serialno, - target_dir, - target_name, - vlog_level, - embed_model_data, - model_output_dir, - input_nodes, - output_nodes, - input_shapes, - output_shapes, - mace_model_dir, - model_tag, - device_type, - running_round, - restart_round, - limit_opencl_kernel_time, - tuning, - out_of_range_check, - phone_data_dir, - model_graph_format, - opencl_binary_file, - opencl_parameter_file, - libmace_dynamic_library_path, - omp_num_threads=-1, - cpu_affinity_policy=1, - gpu_perf_hint=3, - gpu_priority_hint=3, - input_file_name="model_input", - output_file_name="model_out", - input_dir="", - output_dir="", - runtime_failure_ratio=0.0, - address_sanitizer=False, - link_dynamic=False, - quantize_stat=False): - six.print_("* Run '%s' with round=%s, 
restart_round=%s, tuning=%s, " - "out_of_range_check=%s, omp_num_threads=%s, " - "cpu_affinity_policy=%s, gpu_perf_hint=%s, " - "gpu_priority_hint=%s" % - (model_tag, running_round, restart_round, str(tuning), - str(out_of_range_check), omp_num_threads, cpu_affinity_policy, - gpu_perf_hint, gpu_priority_hint)) - sys.stdout.flush() - - mace_model_path = "" - if model_graph_format == ModelFormat.file: - mace_model_path = "%s/%s.pb" % (mace_model_dir, model_tag) - if abi == "host": - libmace_dynamic_lib_path = \ - os.path.dirname(libmace_dynamic_library_path) - cmd = [ - "env", - "LD_LIBRARY_PATH=%s" % libmace_dynamic_lib_path, - "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level, - "MACE_RUNTIME_FAILURE_RATIO=%f" % runtime_failure_ratio, - ] - if quantize_stat: - cmd.append("MACE_LOG_TENSOR_RANGE=1") - cmd.extend([ - "%s/%s" % (target_dir, target_name), - "--model_name=%s" % model_tag, - "--input_node=%s" % ",".join(input_nodes), - "--output_node=%s" % ",".join(output_nodes), - "--input_shape=%s" % ":".join(input_shapes), - "--output_shape=%s" % ":".join(output_shapes), - "--input_file=%s/%s" % (model_output_dir, input_file_name), - "--output_file=%s/%s" % (model_output_dir, output_file_name), - "--input_dir=%s" % input_dir, - "--output_dir=%s" % output_dir, - "--model_data_file=%s/%s.data" % (mace_model_dir, model_tag), - "--device=%s" % device_type, - "--round=%s" % running_round, - "--restart_round=%s" % restart_round, - "--omp_num_threads=%s" % omp_num_threads, - "--cpu_affinity_policy=%s" % cpu_affinity_policy, - "--gpu_perf_hint=%s" % gpu_perf_hint, - "--gpu_priority_hint=%s" % gpu_priority_hint, - "--model_file=%s" % mace_model_path, - ]) - p = subprocess.Popen( - cmd, - stderr=subprocess.PIPE, - stdout=subprocess.PIPE) - out, err = p.communicate() - stdout = err + out - six.print_(stdout) - six.print_("Running finished!\n") - else: - sh.adb("-s", serialno, "shell", "mkdir", "-p", phone_data_dir) - internal_storage_dir = create_internal_storage_dir( - serialno, 
phone_data_dir) - - for input_name in input_nodes: - formatted_name = common.formatted_file_name(input_file_name, - input_name) - adb_push("%s/%s" % (model_output_dir, formatted_name), - phone_data_dir, serialno) - if address_sanitizer: - adb_push(find_asan_rt_library(abi), phone_data_dir, serialno) - - if not embed_model_data: - adb_push("%s/%s.data" % (mace_model_dir, model_tag), - phone_data_dir, serialno) - - if device_type == common.DeviceType.GPU: - if os.path.exists(opencl_binary_file): - adb_push(opencl_binary_file, phone_data_dir, serialno) - if os.path.exists(opencl_parameter_file): - adb_push(opencl_parameter_file, phone_data_dir, serialno) - - adb_push("third_party/nnlib/libhexagon_controller.so", - phone_data_dir, serialno) - - mace_model_phone_path = "" - if model_graph_format == ModelFormat.file: - mace_model_phone_path = "%s/%s.pb" % (phone_data_dir, model_tag) - adb_push(mace_model_path, - mace_model_phone_path, - serialno) - - if link_dynamic: - adb_push(libmace_dynamic_library_path, phone_data_dir, - serialno) - push_depended_so_libs(libmace_dynamic_library_path, abi, - phone_data_dir, serialno) - - adb_push("%s/%s" % (target_dir, target_name), phone_data_dir, - serialno) - - stdout_buff = [] - process_output = make_output_processor(stdout_buff) - adb_cmd = [ - "LD_LIBRARY_PATH=%s" % phone_data_dir, - "MACE_TUNING=%s" % int(tuning), - "MACE_OUT_OF_RANGE_CHECK=%s" % int(out_of_range_check), - "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level, - "MACE_RUN_PARAMETER_PATH=%s/mace_run.config" % phone_data_dir, - "MACE_INTERNAL_STORAGE_PATH=%s" % internal_storage_dir, - "MACE_LIMIT_OPENCL_KERNEL_TIME=%s" % limit_opencl_kernel_time, - "MACE_RUNTIME_FAILURE_RATIO=%f" % runtime_failure_ratio, - ] - if quantize_stat: - adb_cmd.append("MACE_LOG_TENSOR_RANGE=1") - if address_sanitizer: - adb_cmd.extend([ - "LD_PRELOAD=%s/%s" % (phone_data_dir, - asan_rt_library_names(abi)) - ]) - adb_cmd.extend([ - "%s/%s" % (phone_data_dir, target_name), - "--model_name=%s" % 
model_tag, - "--input_node=%s" % ",".join(input_nodes), - "--output_node=%s" % ",".join(output_nodes), - "--input_shape=%s" % ":".join(input_shapes), - "--output_shape=%s" % ":".join(output_shapes), - "--input_file=%s/%s" % (phone_data_dir, input_file_name), - "--output_file=%s/%s" % (phone_data_dir, output_file_name), - "--input_dir=%s" % input_dir, - "--output_dir=%s" % output_dir, - "--model_data_file=%s/%s.data" % (phone_data_dir, model_tag), - "--device=%s" % device_type, - "--round=%s" % running_round, - "--restart_round=%s" % restart_round, - "--omp_num_threads=%s" % omp_num_threads, - "--cpu_affinity_policy=%s" % cpu_affinity_policy, - "--gpu_perf_hint=%s" % gpu_perf_hint, - "--gpu_priority_hint=%s" % gpu_priority_hint, - "--model_file=%s" % mace_model_phone_path, - "--opencl_binary_file=%s/%s" % - (phone_data_dir, os.path.basename(opencl_binary_file)), - "--opencl_parameter_file=%s/%s" % - (phone_data_dir, os.path.basename(opencl_parameter_file)), - ]) - adb_cmd = ' '.join(adb_cmd) - cmd_file_name = "%s-%s-%s" % ('cmd_file', model_tag, str(time.time())) - adb_cmd_file = "%s/%s" % (phone_data_dir, cmd_file_name) - tmp_cmd_file = "%s/%s" % ('/tmp', cmd_file_name) - with open(tmp_cmd_file, 'w') as cmd_file: - cmd_file.write(adb_cmd) - adb_push(tmp_cmd_file, adb_cmd_file, serialno) - os.remove(tmp_cmd_file) - - sh.adb( - "-s", - serialno, - "shell", - "sh", - adb_cmd_file, - _tty_in=True, - _out=process_output, - _err_to_out=True) - stdout = "".join(stdout_buff) - if not stdout_success(stdout): - common.MaceLogger.error("Mace Run", "Mace run failed.") - - sh.adb( - "-s", - serialno, - "shell", - "rm", - adb_cmd_file, - _fg=True) - - six.print_("Running finished!\n") - - sys.stdout.flush() - return stdout + "%s/sources/cxx-stl/llvm-libc++/libs/%s/libc++_shared.so" # noqa + % (os.environ["ANDROID_NDK_HOME"], abi), + phone_data_dir, + serialno) def validate_model(abi, - serialno, + device, model_file_path, weight_file_path, platform, @@ -927,7 +716,6 @@ def 
validate_model(abi, input_shapes, output_shapes, model_output_dir, - phone_data_dir, input_data_types, caffe_env, input_file_name="model_input", @@ -941,8 +729,7 @@ def validate_model(abi, if os.path.exists("%s/%s" % (model_output_dir, formatted_name)): sh.rm("-rf", "%s/%s" % (model_output_dir, formatted_name)) - adb_pull("%s/%s" % (phone_data_dir, formatted_name), - model_output_dir, serialno) + device.pull_from_data_dir(formatted_name, model_output_dir) if platform == "tensorflow": validate(platform, model_file_path, "", @@ -956,11 +743,10 @@ def validate_model(abi, container_name = "mace_caffe_validator" if caffe_env == common.CaffeEnvType.LOCAL: - import imp try: - imp.find_module('caffe') + import caffe except ImportError: - logger.error('There is no caffe python module.') + logging.error('There is no caffe python module.') validate(platform, model_file_path, weight_file_path, "%s/%s" % (model_output_dir, input_file_name), "%s/%s" % (model_output_dir, output_file_name), @@ -1157,8 +943,8 @@ def benchmark_model(abi, if link_dynamic: adb_push(libmace_dynamic_library_path, phone_data_dir, serialno) - push_depended_so_lib(libmace_dynamic_library_path, abi, - phone_data_dir, serialno) + push_depended_so_libs(libmace_dynamic_library_path, abi, + phone_data_dir, serialno) adb_push("%s/%s" % (benchmark_binary_dir, benchmark_binary_name), phone_data_dir, -- GitLab