提交 51b14100 编写于 作者: L liuqi

feature: support arm linux device

1. Abstract android and arm linux devices into one format
2. Support cross compilation for ARM linux
3. Related issue #36
上级 66cf184f
......@@ -47,8 +47,13 @@ ops_test:
stage: ops_test
script:
- if [ -z "$TARGET_SOCS" ]; then TARGET_SOCS=random; fi
- python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS
- python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS --enable_neon=false
- >
if ping -c 1 v9.git.n.xiaomi.com 1>/dev/null 2>&1; then
GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git
DEVICE_CONF_FILE=generic-mobile-devices/devices.yml
fi
- python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64,armhf --target_socs=$TARGET_SOCS
- python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64,armhf --target_socs=$TARGET_SOCS --enable_neon=false
api_test:
stage: api_test
......@@ -68,14 +73,19 @@ extra_tests:
stage: extra_tests
script:
- if [ -z "$TARGET_SOCS" ]; then TARGET_SOCS=random; fi
- python tools/bazel_adb_run.py --target="//mace/utils:tuner_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS
- >
if ping -c 1 v9.git.n.xiaomi.com 1>/dev/null 2>&1; then
GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git
DEVICE_CONF_FILE=generic-mobile-devices/devices.yml
fi
- python tools/bazel_adb_run.py --target="//mace/utils:tuner_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64 --target_socs=$TARGET_SOCS
platform_compatible_tests:
stage: platform_compatible_tests
script:
- bazel build mace/core:core --define openmp=true
- bazel build --config arm_linux --define openmp=true --define opencl=true --define neon=true //mace/libmace:libmace.so
- bazel build --config aarch64_linux --define openmp=true --define opencl=true --define neon=true //mace/libmace:libmace.so
- bazel build --config arm_linux_gnueabihf --define openmp=true --define opencl=true --define neon=true //mace/libmace:libmace.so
- bazel build --config aarch64_linux_gnu --define openmp=true --define opencl=true --define neon=true //mace/libmace:libmace.so
build_libraries:
stage: build_libraries
......@@ -87,6 +97,11 @@ ndk_versions_compatible_tests:
script:
- DEFAULT_NDK_PATH=$ANDROID_NDK_HOME
- prefix_path=${DEFAULT_NDK_PATH%android-ndk-*}
- >
if ping -c 1 v9.git.n.xiaomi.com 1>/dev/null 2>&1; then
GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git
DEVICE_CONF_FILE=generic-mobile-devices/devices.yml
fi
- >
for ndk in android-ndk-r12b android-ndk-r15c android-ndk-r16 android-ndk-r17b;
do
......@@ -96,8 +111,8 @@ ndk_versions_compatible_tests:
export PATH=$ANDROID_NDK_HOME:$PATH;
echo "ndk path: $ANDROID_NDK_HOME";
if [ -z "$TARGET_SOCS" ]; then TARGET_SOCS=random; fi
python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS --args="--gtest_filter=ActivationOpTest*";
python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a --target_socs=$TARGET_SOCS --enable_neon=false --args="--gtest_filter=ActivationOpTest*";
python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64 --target_socs=$TARGET_SOCS --args="--gtest_filter=ActivationOpTest*";
python tools/bazel_adb_run.py --target="//mace/ops:ops_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64 --target_socs=$TARGET_SOCS --enable_neon=false --args="--gtest_filter=ActivationOpTest*";
fi
done
- export ANDROID_NDK_HOME=$DEFAULT_NDK_PATH
......@@ -111,16 +126,27 @@ python_tools_tests:
- GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@github.com:XiaoMi/mace-models.git
- CONF_FILE=mace-models/mobilenet-v2/mobilenet-v2.yml
- >
python tools/converter.py convert --config=${CONF_FILE} --target_abis=armeabi-v7a --model_graph_format=file --model_data_format=file || exit 1;
python tools/converter.py run --config=${CONF_FILE} --round=1 --target_abis=armeabi-v7a --validate --model_graph_format=file --model_data_format=file || exit 1;
python tools/converter.py run --config=${CONF_FILE} --example --target_abis=armeabi-v7a --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
if ping -c 1 v9.git.n.xiaomi.com 1>/dev/null 2>&1; then
GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git
DEVICE_CONF_FILE=generic-mobile-devices/devices.yml
fi
- >
python tools/converter.py convert --config=${CONF_FILE} --target_abis=armeabi-v7a,arm64 --model_graph_format=file --model_data_format=file || exit 1;
python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a,arm64 --validate --model_graph_format=file --model_data_format=file || exit 1;
python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --target_abis=armeabi-v7a,arm64 --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
model_tests:
stage: model_tests
script:
- pwd
- rm -rf mace-models
- rm -rf generic-mobile-devices
- GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@github.com:XiaoMi/mace-models.git
- >
if ping -c 1 v9.git.n.xiaomi.com 1>/dev/null 2>&1; then
GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git
DEVICE_CONF_FILE=generic-mobile-devices/devices.yml
fi
- >
for CONF_FILE in mace-models/mobilenet-v1/mobilenet-v1.yml mace-models/mobilenet-v1/mobilenet-v1-quantize-retrain.yml;
do
......@@ -131,8 +157,8 @@ model_tests:
- CONF_FILE=mace-models/mobilenet-v2/mobilenet-v2-host.yml
- >
python tools/converter.py convert --config=${CONF_FILE} --model_graph_format=file --model_data_format=file || exit 1;
python tools/converter.py run --config=${CONF_FILE} --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
python tools/converter.py run --config=${CONF_FILE} --example --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
python tools/converter.py run --config=${CONF_FILE} --device_yml=${DEVICE_CONF_FILE} --example --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
- rm -rf mace-models
build_android_demo:
......
......@@ -35,7 +35,7 @@ Required dependencies
- Required by model validation
* - six
- pip install -I six==1.11.0
- Required for Python 2 and 3 compatibility (TODO)
- Required for Python 2 and 3 compatibility
Optional dependencies
---------------------
......
......@@ -109,13 +109,75 @@ in one deployment file.
sha256sum /path/to/your/file
Advanced usage
--------------
There are two common advanced use cases:
There are three common advanced use cases:
- run your model on the embedded device
- converting model to C++ code.
- tuning GPU kernels for a specific SoC.
Run your model on the embedded device
------------------
MACE uses ssh to connect to the embedded device; we recommend that you push ``$HOME/.ssh/id_rsa.pub``
to the device's ``$HOME/.ssh/authorized_keys``
.. code:: bash
cat ~/.ssh/id_rsa.pub | ssh -q {user}@{ip} "cat >> ~/.ssh/authorized_keys"
This part will show you how to write your own device yaml config file.
**Device yaml config file**
Running your model on the embedded device is nearly the same as running on Android, except that you need to provide a device yaml config file.
MACE reads this yaml config via the ``--device_yml`` argument; the default value is ``devices.yml``.
When the yaml config file is not found, MACE assumes there is no available ARM Linux device, prints a message,
and continues on other devices such as plugged-in Android phones.
* **Example**
Here is an device yaml config demo.
.. literalinclude:: devices/demo_device_nanopi.yml
:language: yaml
* **Configuration**
.. list-table::
:header-rows: 1
* - Options
- Usage
* - target_abis
- Device supported abis, you can get it via ``dpkg --print-architecture`` and
``dpkg --print-foreign-architectures`` command, if more than one abi is supported,
separate them by commas.
* - target_socs
- Device SoC. You can get it from the device manual; we haven't found a way to get it in the shell.
* - models
- Device model's full name. You can get it via the ``lshw`` command (a third-party package; install it via your package manager) —
see its product value.
* - address
- Since we use ssh to connect to the device, the IP address is required.
* - username
- login username, required.
* - password
- Login password; optional when you can log in to the device without a password
.. note::
Some command tools:
.. code:: bash
# specify device yaml config file via --device_yml argument or put the file under working directory
python tools/converter.py run --config=/path/to/mace-models/mobilenet-v2/mobilenet-v2.yml --device_yml=/path/to/devices.yml
Convert model(s) to C++ code
--------------------------------
......@@ -403,6 +465,7 @@ Reduce Library Size
- It is recommended to use ``version script`` and ``strip`` feature when linking mace static library. The effect is remarkable.
* Remove the unused ops.
Remove the registration of the ops unused for your models in the ``mace/ops/ops_register.cc``,
which will reduce the library size significantly. the final binary just link the registered ops' code.
......
......@@ -68,7 +68,8 @@ Here we use the mobilenet-v2 model as an example.
.. note::
If you want to run on device/phone, please plug in at least one device/phone.
If you want to run on phone, please plug in at least one phone.
Or if you want to run on an embedded device, please refer to :doc:`advanced_usage`.
.. code:: sh
......@@ -245,7 +246,10 @@ to run and validate your model.
# Test model run time
python tools/converter.py run --config=/path/to/your/model_deployment_file.yml --round=100
# Validate the correctness by comparing the results against the
# If you want to run model on specified arm linux device, you should put device config file in the working directory or run with flag `--device_yml`
python tools/converter.py run --config=/path/to/mace-models/mobilenet-v2/mobilenet-v2.yml --device_yml=/path/to/devices.yml --example
# Validate the correctness by comparing the results against the
# original model and framework, measured with cosine distance for similarity.
python tools/converter.py run --config=/path/to/your/model_deployment_file.yml --validate
......
......@@ -24,6 +24,24 @@ config_setting(
visibility = ["//visibility:public"],
)
config_setting(
name = "arm_linux_aarch64",
values = {
"crosstool_top": "//tools/aarch64_compiler:toolchain",
"cpu": "aarch64",
},
visibility = ["//visibility:public"],
)
config_setting(
name = "arm_linux_armhf",
values = {
"crosstool_top": "//tools/arm_compiler:toolchain",
"cpu": "armeabi-v7a",
},
visibility = ["//visibility:public"],
)
config_setting(
name = "neon_enabled",
define_values = {
......
# Examples
load(
"//mace:mace.bzl",
"if_openmp_enabled",
"if_android",
"if_hexagon_enabled",
"if_opencl_enabled",
"if_openmp_enabled",
)
cc_binary(
......@@ -18,8 +18,9 @@ cc_binary(
]),
linkopts = [
"-lm",
"-ldl",
] + if_openmp_enabled([
"-fopenmp"
"-fopenmp",
]) + if_android([
"-ldl",
"-pie",
......@@ -47,6 +48,7 @@ cc_binary(
]),
linkopts = [
"-lm",
"-ldl",
] + if_android([
"-ldl",
"-pie",
......@@ -55,8 +57,7 @@ cc_binary(
linkstatic = 0,
deps = [
"//external:gflags_nothreads",
"//mace/codegen:generated_mace_engine_factory",
"//mace/codegen:generated_libmace",
"//mace/codegen:generated_mace_engine_factory",
],
)
......@@ -24,6 +24,18 @@ def if_android_arm64(a):
"//conditions:default": [],
})
def if_arm_linux_aarch64(a):
return select({
"//mace:arm_linux_aarch64": a,
"//conditions:default": [],
})
def if_arm_linux_armhf(a):
return select({
"//mace:arm_linux_armhf": a,
"//conditions:default": []
})
def if_neon_enabled(a):
return select({
"//mace:neon_enabled": a,
......@@ -81,4 +93,3 @@ def encrypt_opencl_kernel_genrule():
outs = ["opencl/encrypt_opencl_kernel.cc"],
cmd = "cat $(SRCS) > $@;"
)
......@@ -233,7 +233,7 @@ void TestNxNS12(const index_t height, const index_t width) {
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) {
// generate random input
static unsigned int seed = time(NULL);
// static unsigned int seed = time(NULL);
index_t batch = 1;
index_t channel = 32;
index_t multiplier = 1;
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include <algorithm>
#include <cmath>
#include "mace/core/operator.h"
......
......@@ -15,6 +15,7 @@
#include "mace/ops/resize_bicubic.h"
#include <algorithm>
#include <cmath>
#include <memory>
#include <vector>
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include <algorithm>
#include <cmath>
#include <vector>
#include "mace/core/operator.h"
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include <algorithm>
#include <cmath>
#include <limits>
#include <memory>
#include <vector>
......@@ -106,7 +107,7 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
float sum = 0;
for (index_t c = 0; c < class_count; ++c) {
float exp_value = ::exp(input_ptr[c] - max_val);
float exp_value = std::exp(input_ptr[c] - max_val);
sum += exp_value;
output_ptr[c] = exp_value;
}
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include <algorithm>
#include <cmath>
#include <vector>
#include "mace/core/operator.h"
......
......@@ -16,8 +16,9 @@
#include <arm_neon.h>
#endif
#include <vector>
#include <algorithm>
#include <cmath>
#include <vector>
#include "mace/core/operator.h"
#include "mace/ops/transpose.h"
......
......@@ -112,6 +112,8 @@ TFSupportedOps = [
TFOpType = Enum('TFOpType', [(op, op) for op in TFSupportedOps], type=str)
TFSupportedOps = [six.b(op) for op in TFSupportedOps]
class TensorflowConverter(base_converter.ConverterInterface):
"""A class for convert tensorflow frozen model to mace model.
......
# Copyright 2018 Xiaomi, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import operator
import six
from six.moves import reduce
from mace.proto import mace_pb2
from mace.python.tools.converter_tool import base_converter as cvt
from mace.python.tools.converter_tool.base_converter import DeviceType
from mace.python.tools.converter_tool.base_converter import ConverterUtil
from mace.python.tools.converter_tool.base_converter import MaceKeyword
from mace.python.tools.convert_util import calculate_image_shape
from mace.python.tools.convert_util import OpenCLBufferType
def MemoryTypeToStr(mem_type):
    """Return the display name of a mace_pb2 memory-type enum value.

    Unrecognized values map to ``'UNKNOWN'``.
    """
    names = {
        mace_pb2.CPU_BUFFER: 'CPU_BUFFER',
        mace_pb2.GPU_BUFFER: 'GPU_BUFFER',
        mace_pb2.GPU_IMAGE: 'GPU_IMAGE',
    }
    return names.get(mem_type, 'UNKNOWN')
class MemoryBlock(object):
    """Immutable description of one planned memory allocation.

    ``mem_type`` is a mace_pb2 memory-type enum value; ``block`` holds the
    allocation dimensions (``[size]`` for buffers, ``[x, y]`` for 2-D images,
    as used by the optimizers below).
    """

    def __init__(self, mem_type, block):
        # Store both fields together; they are exposed read-only via the
        # properties below.
        self._spec = (mem_type, block)

    @property
    def mem_type(self):
        """Memory-type enum value this block was created with."""
        return self._spec[0]

    @property
    def block(self):
        """Dimension list this block was created with."""
        return self._spec[1]
class MemoryOptimizer(object):
    """Greedy, liveness-based memory planner for a mace NetDef.

    Walks the ops in graph order, assigns every output tensor a memory
    block id, and reuses blocks that have become idle (all consumers
    visited) instead of allocating new ones.  This base class plans
    1-D CPU buffers; GPUMemoryOptimizer overrides the block-shape and
    sizing hooks for GPU buffers/images.
    """

    def __init__(self, net_def):
        """Build consumer reference counts for *net_def*'s tensors.

        Args:
            net_def: mace_pb2 NetDef proto; mutated later by optimize()
                (op.mem_id entries and net_def.mem_arena blocks are added).
        """
        self.net_def = net_def
        # mem ids currently free for reuse
        self.idle_mem = set()
        self.op_mem = {}  # op_name->mem_id
        self.mem_block = {}  # mem_id->[size] or mem_id->[x, y]
        self.total_mem_count = 0
        # tensor_name -> number of not-yet-visited consumer ops
        self.input_ref_counter = {}
        # mem_id -> number of live tensors mapped onto that block
        self.mem_ref_counter = {}

        # Remember the OpenCL memory type (if any) recorded on the net;
        # only used by the GPU subclass to pick image vs. buffer blocks.
        ocl_mem_type_arg = ConverterUtil.get_arg(
            net_def, MaceKeyword.mace_opencl_mem_type)
        self.cl_mem_type = ocl_mem_type_arg.i if ocl_mem_type_arg is not None \
            else None

        # tensor_name -> list of ops that consume it
        consumers = {}
        for op in net_def.op:
            if not self.op_need_optimize_memory(op):
                continue
            for ipt in op.input:
                if ipt not in consumers:
                    consumers[ipt] = []
                consumers[ipt].append(op)
        # only ref op's output tensor
        for op in net_def.op:
            if not self.op_need_optimize_memory(op):
                continue
            for output in op.output:
                tensor_name = output
                if tensor_name in consumers:
                    self.input_ref_counter[tensor_name] = \
                        len(consumers[tensor_name])
                else:
                    # output with no consumer inside the optimized set
                    self.input_ref_counter[tensor_name] = 0

    def op_need_optimize_memory(self, op):
        """Hook: whether *op*'s outputs take part in planning (CPU: all do)."""
        return True

    def get_op_mem_block(self, op_type, output_shape, output_type):
        """Return the MemoryBlock an output of *output_shape* needs.

        CPU blocks are flat byte buffers: element count times the element
        size (1 byte for uint8, 4 bytes otherwise).
        """
        data_type_size = 4
        if output_type == mace_pb2.DT_UINT8:
            data_type_size = 1
        return MemoryBlock(mace_pb2.CPU_BUFFER,
                           [reduce(operator.mul, output_shape, 1) *
                            data_type_size])

    def mem_size(self, memory_block):
        """Size metric of a block (CPU: its single dimension)."""
        return memory_block.block[0]

    def sub_mem_block(self, mem_block1, mem_block2):
        """Difference of the two blocks' size metrics."""
        return self.mem_size(mem_block1) - self.mem_size(mem_block2)

    def resize_mem_block(self, old_mem_block, op_mem_block):
        """Smallest block (same type as *old_mem_block*) that fits both."""
        return MemoryBlock(
            old_mem_block.mem_type,
            [max(old_mem_block.block[0], op_mem_block.block[0])])

    def add_net_mem_blocks(self):
        """Append every planned block to the net's memory arena (CPU device)."""
        for mem in self.mem_block:
            arena = self.net_def.mem_arena
            block = arena.mem_block.add()
            block.mem_id = mem
            block.device_type = DeviceType.CPU.value
            block.mem_type = self.mem_block[mem].mem_type
            block.x = self.mem_block[mem].block[0]
            block.y = 1

    def get_total_origin_mem_size(self):
        """Total element count of all optimized ops' first outputs (pre-reuse)."""
        origin_mem_size = 0
        for op in self.net_def.op:
            if not self.op_need_optimize_memory(op):
                continue
            origin_mem_size += reduce(operator.mul,
                                      op.output_shape[0].dims,
                                      1)
        return origin_mem_size

    def get_total_optimized_mem_size(self):
        """Sum of planned block sizes; also prints each block for debugging."""
        optimized_mem_size = 0
        for mem in self.mem_block:
            print(mem, MemoryTypeToStr(self.mem_block[mem].mem_type),
                  self.mem_block[mem].block)
            optimized_mem_size += self.mem_size(self.mem_block[mem])
        return optimized_mem_size

    @staticmethod
    def is_memory_reuse_op(op):
        """Ops whose output aliases their input memory (no new block needed)."""
        return op.type == 'Reshape' or op.type == 'Identity' \
            or op.type == 'Squeeze' or op.type == 'ExpandDims'

    def optimize(self):
        """Plan memory for the whole net and write the result into net_def.

        Each output tensor either aliases its input's block (reuse ops),
        grabs the cheapest-to-grow idle block of a matching memory type,
        or gets a fresh block.  Blocks return to the idle pool once all of
        a tensor's consumers have been visited.  Bails out (leaving the
        arena empty) when shape information is missing or inconsistent.
        """
        for op in self.net_def.op:
            if not self.op_need_optimize_memory(op):
                continue
            if not op.output_shape:
                six.print_("WARNING: There is no output shape information to "
                           "do memory optimization. %s (%s)" %
                           (op.name, op.type), file=sys.stderr)
                return
            if len(op.output_shape) != len(op.output):
                six.print_('WARNING: the number of output shape is '
                           'not equal to the number of output.',
                           file=sys.stderr)
                return
            for i in range(len(op.output)):
                if self.is_memory_reuse_op(op):
                    # make these ops reuse memory of input tensor
                    mem_id = self.op_mem.get(op.input[0], -1)
                else:
                    # Element type: 'T' arg by default, overridden by an
                    # explicit per-output type when present.
                    output_type = mace_pb2.DT_FLOAT
                    for arg in op.arg:
                        if arg.name == 'T':
                            output_type = arg.i
                    if len(op.output_type) > i:
                        output_type = op.output_type[i]
                    op_mem_block = self.get_op_mem_block(
                        op.type,
                        op.output_shape[i].dims,
                        output_type)
                    mem_id = -1
                    if len(self.idle_mem) > 0:
                        # Scan idle blocks of the same memory type for the
                        # one that needs the least growth (ties broken by
                        # least wasted space).
                        best_mem_add_size = six.MAXSIZE
                        best_mem_waste_size = six.MAXSIZE
                        for mid in self.idle_mem:
                            old_mem_block = self.mem_block[mid]
                            if old_mem_block.mem_type != op_mem_block.mem_type:
                                continue
                            new_mem_block = self.resize_mem_block(
                                old_mem_block, op_mem_block)
                            add_mem_size = self.sub_mem_block(new_mem_block,
                                                              old_mem_block)
                            waste_mem_size = self.sub_mem_block(new_mem_block,
                                                                op_mem_block)
                            # minimize add_mem_size; if best_mem_add_size is 0,
                            # then minimize waste_mem_size
                            if (best_mem_add_size > 0 and
                                    add_mem_size < best_mem_add_size) \
                                    or (best_mem_add_size == 0 and
                                        waste_mem_size < best_mem_waste_size):
                                best_mem_id = mid
                                best_mem_add_size = add_mem_size
                                best_mem_waste_size = waste_mem_size
                                best_mem_block = new_mem_block

                        # if add mem size < op mem size, then reuse it
                        if best_mem_add_size <= self.mem_size(op_mem_block):
                            self.mem_block[best_mem_id] = best_mem_block
                            mem_id = best_mem_id
                            self.idle_mem.remove(mem_id)

                    if mem_id == -1:
                        # No reusable block: allocate a fresh one.
                        mem_id = self.total_mem_count
                        self.total_mem_count += 1
                        self.mem_block[mem_id] = op_mem_block

                if mem_id != -1:
                    op.mem_id.extend([mem_id])
                    self.op_mem[op.output[i]] = mem_id
                    if mem_id not in self.mem_ref_counter:
                        self.mem_ref_counter[mem_id] = 1
                    else:
                        self.mem_ref_counter[mem_id] += 1

            # de-ref input tensor mem
            for idx in six.moves.range(len(op.input)):
                ipt = op.input[idx]
                if ipt in self.input_ref_counter:
                    self.input_ref_counter[ipt] -= 1
                    if self.input_ref_counter[ipt] == 0 \
                            and ipt in self.op_mem:
                        mem_id = self.op_mem[ipt]
                        self.mem_ref_counter[mem_id] -= 1
                        if self.mem_ref_counter[mem_id] == 0:
                            # last live tensor on this block: free it
                            self.idle_mem.add(self.op_mem[ipt])
                    elif self.input_ref_counter[ipt] < 0:
                        raise Exception('ref count is less than 0')

        self.add_net_mem_blocks()

        print("total op: %d" % len(self.net_def.op))
        print("origin mem: %d, optimized mem: %d" % (
            self.get_total_origin_mem_size(),
            self.get_total_optimized_mem_size()))
class GPUMemoryOptimizer(MemoryOptimizer):
    """Memory planner for GPU nets: 2-D image or buffer blocks.

    Overrides the block-shape hooks of MemoryOptimizer; the choice
    between GPU images and GPU buffers follows the net's recorded
    OpenCL memory type (self.cl_mem_type).
    """

    def op_need_optimize_memory(self, op):
        """Skip buffer-transform plumbing ops that manage memory themselves."""
        if op.type == MaceKeyword.mace_buffer_transform:
            for arg in op.arg:
                # mode == 0 buffer transforms are excluded from planning
                if arg.name == 'mode' and arg.i == 0:
                    return False
        return op.type != MaceKeyword.mace_buffer_inverse_transform

    def get_op_image_mem_block(self, op_type, output_shape):
        """Return the 2-D block for *op_type*'s output when using GPU images.

        Raises:
            Exception: when *output_shape*'s rank is not supported for
                the given op type.
        """
        if op_type == 'WinogradTransform' or op_type == 'MatMul':
            buffer_shape = list(output_shape) + [1]
            mem_block = MemoryBlock(
                mace_pb2.GPU_IMAGE,
                calculate_image_shape(OpenCLBufferType.IN_OUT_HEIGHT,
                                      buffer_shape))
        elif op_type in ['Shape',
                         'InferConv2dShape',
                         'StridedSlice',
                         'Stack',
                         'ScalarMath']:
            # Small shape/scalar outputs live in plain CPU buffers.
            if len(output_shape) == 1:
                mem_block = MemoryBlock(mace_pb2.CPU_BUFFER,
                                        [output_shape[0], 1])
            elif len(output_shape) == 0:
                mem_block = MemoryBlock(mace_pb2.CPU_BUFFER,
                                        [1, 1])
            else:
                raise Exception('%s output shape dim size is not 0 or 1.' %
                                op_type)
        else:
            if len(output_shape) == 2:  # only support fc/softmax
                buffer_shape = [output_shape[0], output_shape[1]]
            elif len(output_shape) == 4:
                buffer_shape = output_shape
            else:
                raise Exception('%s output shape dim size is not 2 or 4.' %
                                op_type)
            mem_block = MemoryBlock(
                mace_pb2.GPU_IMAGE,
                calculate_image_shape(OpenCLBufferType.IN_OUT_CHANNEL,
                                      buffer_shape))
        return mem_block

    def get_op_buffer_mem_block(self, output_shape):
        """Flat GPU buffer block: total element count by 1."""
        return MemoryBlock(mace_pb2.GPU_BUFFER,
                           [reduce(operator.mul, output_shape, 1), 1])

    def get_op_mem_block(self, op_type, output_shape, output_type):
        """Dispatch to image or buffer planning; *output_type* is unused here."""
        if self.cl_mem_type == mace_pb2.GPU_IMAGE:
            return self.get_op_image_mem_block(op_type, output_shape)
        else:
            return self.get_op_buffer_mem_block(output_shape)

    def mem_size(self, memory_block):
        """Size metric: image area times 4 (RGBA channels), else width."""
        if memory_block.mem_type == mace_pb2.GPU_IMAGE:
            return memory_block.block[0] * memory_block.block[1] * 4
        else:
            return memory_block.block[0]

    def resize_mem_block(self, old_mem_block, op_mem_block):
        """Smallest block covering both blocks in each of the two dimensions."""
        resize_mem_block = MemoryBlock(
            old_mem_block.mem_type,
            [
                max(old_mem_block.block[0], op_mem_block.block[0]),
                max(old_mem_block.block[1], op_mem_block.block[1])
            ])
        return resize_mem_block

    def add_net_mem_blocks(self):
        """Write planned blocks into the arena and update the net's
        recorded OpenCL max image size when images are in use."""
        max_image_size_x = 0
        max_image_size_y = 0
        for mem in self.mem_block:
            arena = self.net_def.mem_arena
            block = arena.mem_block.add()
            block.mem_id = mem
            block.device_type = DeviceType.GPU.value
            block.mem_type = self.mem_block[mem].mem_type
            block.x = self.mem_block[mem].block[0]
            block.y = self.mem_block[mem].block[1]
            if self.mem_block[mem].mem_type == mace_pb2.GPU_IMAGE:
                max_image_size_x = max(max_image_size_x, block.x)
                max_image_size_y = max(max_image_size_y, block.y)

        if self.cl_mem_type == mace_pb2.GPU_IMAGE:
            # Update OpenCL max image size
            net_ocl_max_img_size_arg = None
            for arg in self.net_def.arg:
                if arg.name == cvt.MaceKeyword.mace_opencl_max_image_size:
                    net_ocl_max_img_size_arg = arg
                    # Never shrink an existing recorded maximum.
                    max_image_size_x = max(arg.ints[0], max_image_size_x)
                    max_image_size_y = max(arg.ints[1], max_image_size_y)
                    break
            if net_ocl_max_img_size_arg is None:
                net_ocl_max_img_size_arg = self.net_def.arg.add()
                net_ocl_max_img_size_arg.name = \
                    cvt.MaceKeyword.mace_opencl_max_image_size
            net_ocl_max_img_size_arg.ints[:] = [max_image_size_x,
                                                max_image_size_y]
def optimize_gpu_memory(net_def):
    """Run GPU memory planning on *net_def* in place."""
    GPUMemoryOptimizer(net_def).optimize()
def optimize_cpu_memory(net_def):
    """Run CPU memory planning on *net_def* in place."""
    MemoryOptimizer(net_def).optimize()
......@@ -14,6 +14,7 @@
import datetime
import os
import six
import uuid
import numpy as np
import hashlib
......@@ -34,8 +35,8 @@ class ModelFormat(object):
def generate_obfuscated_name(namespace, name):
md5 = hashlib.md5()
md5.update(namespace)
md5.update(name)
md5.update(six.b(namespace))
md5.update(six.b(name))
md5_digest = md5.hexdigest()
name = md5_digest[:8]
......
......@@ -15,8 +15,9 @@
#ifndef MACE_UTILS_QUANTIZE_H_
#define MACE_UTILS_QUANTIZE_H_
#include <limits>
#include <algorithm>
#include <cmath>
#include <limits>
namespace mace {
......
......@@ -21,30 +21,29 @@ build:android --config=cross_compile
build:android --crosstool_top=//external:android/crosstool
build:android --host_crosstool_top=@bazel_tools//tools/cpp:toolchain
# Usage example: bazel build --config arm_linux
build:arm_linux --config=cross_compile
build:arm_linux --crosstool_top=//tools/arm_compiler:toolchain
build:arm_linux --host_crosstool_top=@bazel_tools//tools/cpp:toolchain
build:arm_linux --cpu=armeabi-v7a
build:arm_linux --copt -mfloat-abi=hard
build:arm_linux --copt -mfpu=neon
build:arm_linux --copt -Wno-ignored-attributes
build:arm_linux --copt -Wno-unused-function
build:arm_linux --copt -Wno-sequence-point
build:arm_linux --copt -Wno-implicit-fallthrough
build:arm_linux --copt -Wno-psabi
# Usage example: bazel build --config arm_linux_gnueabihf
build:arm_linux_gnueabihf --config=cross_compile
build:arm_linux_gnueabihf --crosstool_top=//tools/arm_compiler:toolchain
build:arm_linux_gnueabihf --host_crosstool_top=@bazel_tools//tools/cpp:toolchain
build:arm_linux_gnueabihf --cpu=armeabi-v7a
build:arm_linux_gnueabihf --copt -mfloat-abi=hard
build:arm_linux_gnueabihf --copt -mfpu=neon
build:arm_linux_gnueabihf --copt -Wno-ignored-attributes
build:arm_linux_gnueabihf --copt -Wno-unused-function
build:arm_linux_gnueabihf --copt -Wno-sequence-point
build:arm_linux_gnueabihf --copt -Wno-implicit-fallthrough
# Usage example: bazel build --config aarch64_linux
build:aarch64_linux --config=cross_compile
build:aarch64_linux --crosstool_top=//tools/aarch64_compiler:toolchain
build:aarch64_linux --host_crosstool_top=@bazel_tools//tools/cpp:toolchain
build:aarch64_linux --cpu=aarch64
build:aarch64_linux --copt -Wno-ignored-attributes
build:aarch64_linux --copt -Wno-unused-function
build:aarch64_linux --copt -Wno-sequence-point
build:aarch64_linux --copt -Wno-implicit-fallthrough
# Usage example: bazel build --config aarch64_linux_gnu
build:aarch64_linux_gnu --config=cross_compile
build:aarch64_linux_gnu --crosstool_top=//tools/aarch64_compiler:toolchain
build:aarch64_linux_gnu --host_crosstool_top=@bazel_tools//tools/cpp:toolchain
build:aarch64_linux_gnu --cpu=aarch64
build:aarch64_linux_gnu --copt -Wno-ignored-attributes
build:aarch64_linux_gnu --copt -Wno-unused-function
build:aarch64_linux_gnu --copt -Wno-sequence-point
build:aarch64_linux_gnu --copt -Wno-implicit-fallthrough
# Usage example: bazel build --config optimization
# Usage example: bazel build --config optimization
build:optimization -c opt
build:optimization --copt=-O3
build:optimization --linkopt=-Wl,--strip-all
......
......@@ -26,9 +26,9 @@ import sys
import sh_commands
from common import *
def stdout_processor(stdout, device_properties, abi):
pass
from device import DeviceWrapper, DeviceManager
def unittest_stdout_processor(stdout, device_properties, abi):
......@@ -39,7 +39,7 @@ def unittest_stdout_processor(stdout, device_properties, abi):
raise Exception("Command failed")
def ops_benchmark_stdout_processor(stdout, device_properties, abi):
def ops_benchmark_stdout_processor(stdout, dev, abi):
stdout_lines = stdout.split("\n")
metrics = {}
for line in stdout_lines:
......@@ -52,8 +52,8 @@ def ops_benchmark_stdout_processor(stdout, device_properties, abi):
metrics["%s.input_mb_per_sec" % parts[0]] = parts[3]
metrics["%s.gmacc_per_sec" % parts[0]] = parts[4]
platform = device_properties["ro.board.platform"].replace(" ", "-")
model = device_properties["ro.product.model"].replace(" ", "-")
platform = dev[YAMLKeyword.target_socs]
model = dev[YAMLKeyword.models]
tags = {
"ro.board.platform": platform,
"ro.product.model": model,
......@@ -87,7 +87,7 @@ def parse_args():
type=str,
default="all",
help="SoCs (ro.board.platform from getprop) to build, "
"comma seperated list or all/random")
"comma seperated list or all/random")
parser.add_argument(
"--target", type=str, default="//...", help="Bazel target to build")
parser.add_argument(
......@@ -115,14 +115,22 @@ def parse_args():
type=str2bool,
default=False,
help="Whether to use simpleperf stat")
parser.add_argument(
'--device_yml',
type=str,
default='',
help='embedded linux device config yml file'
)
return parser.parse_known_args()
def main(unused_args):
target_socs = None
target_devices = DeviceManager.list_devices(FLAGS.device_yml)
if FLAGS.target_socs != "all" and FLAGS.target_socs != "random":
target_socs = set(FLAGS.target_socs.split(','))
target_devices = sh_commands.get_target_socs_serialnos(target_socs)
target_devices = [dev for dev in target_devices
if dev[YAMLKeyword.target_socs] in target_socs]
if FLAGS.target_socs == "random":
unlocked_devices = \
[d for d in target_devices if not sh_commands.is_device_locked(d)]
......@@ -136,31 +144,29 @@ def main(unused_args):
target_abis = FLAGS.target_abis.split(',')
for target_abi in target_abis:
toolchain = infer_toolchain(target_abi)
sh_commands.bazel_build(target, abi=target_abi,
toolchain=toolchain,
enable_neon=FLAGS.enable_neon,
address_sanitizer=FLAGS.address_sanitizer)
if FLAGS.run_target:
for serialno in target_devices:
if target_abi not in set(
sh_commands.adb_supported_abis(serialno)):
for dev in target_devices:
if target_abi not in dev[YAMLKeyword.target_abis]:
print("Skip device %s which does not support ABI %s" %
(serialno, target_abi))
(dev, target_abi))
continue
stdouts = sh_commands.adb_run(
device_wrapper = DeviceWrapper(dev)
stdouts = device_wrapper.run(
target_abi,
serialno,
host_bin_path,
bin_name,
args=FLAGS.args,
opencl_profiling=True,
vlog_level=0,
device_bin_path="/data/local/tmp/mace",
out_of_range_check=True,
address_sanitizer=FLAGS.address_sanitizer,
simpleperf=FLAGS.simpleperf)
device_properties = sh_commands.adb_getprop_by_serialno(
serialno)
globals()[FLAGS.stdout_processor](stdouts, device_properties,
globals()[FLAGS.stdout_processor](stdouts, dev,
target_abi)
......
......@@ -22,6 +22,14 @@ mkdir -p $LIB_DIR/arm64-v8a/cpu_gpu
rm -rf $LIB_DIR/linux-x86-64
mkdir -p $LIB_DIR/linux-x86-64
rm -rf $LIB_DIR/arm_linux_gnueabihf
mkdir -p $LIB_DIR/arm_linux_gnueabihf/cpu_gpu
rm -rf $LIB_DIR/aarch64_linux_gnu
mkdir -p $LIB_DIR/aarch64_linux_gnu/cpu_gpu
# build shared libraries
echo "build shared lib for armeabi-v7a + cpu_gpu_dsp"
bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=true --define opencl=true --define hexagon=true --define quantize=true --cpu=armeabi-v7a
......@@ -36,6 +44,14 @@ echo "build shared lib for arm64-v8a + cpu_gpu"
bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=true --define opencl=true --define quantize=true --cpu=arm64-v8a
cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/arm64-v8a/cpu_gpu/
echo "build shared lib for arm_linux_gnueabihf + cpu_gpu"
bazel build --config arm_linux_gnueabihf --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=true --define opencl=true
cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/arm_linux_gnueabihf/cpu_gpu/
echo "build shared lib for aarch64_linux_gnu + cpu_gpu"
bazel build --config aarch64_linux_gnu --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=true --define opencl=true
cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/aarch64_linux_gnu/cpu_gpu/
if [[ "$OSTYPE" != "darwin"* ]];then
echo "build shared lib for linux-x86-64"
bazel build mace/libmace:libmace_dynamic --config optimization --define quantize=true --define openmp=true
......@@ -56,6 +72,14 @@ echo "build static lib for arm64-v8a + cpu_gpu"
bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=true --define opencl=true --define quantize=true --cpu=arm64-v8a
cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/arm64-v8a/cpu_gpu/
echo "build static lib for arm_linux_gnueabihf + cpu_gpu"
bazel build --config arm_linux_gnueabihf --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=true --define opencl=true
cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/arm_linux_gnueabihf/cpu_gpu/
echo "build static lib for aarch64_linux_gnu + cpu_gpu"
bazel build --config aarch64_linux_gnu --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=true --define opencl=true
cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/aarch64_linux_gnu/cpu_gpu/
if [[ "$OSTYPE" != "darwin"* ]];then
echo "build static lib for linux-x86-64"
bazel build mace/libmace:libmace_static --config optimization --define quantize=true --define openmp=true
......
......@@ -13,7 +13,9 @@
# limitations under the License.
import enum
import hashlib
import re
import os
import six
......@@ -135,3 +137,340 @@ def formatted_file_name(input_file_name, input_name):
for c in input_name:
res += c if c.isalnum() else '_'
return res
def md5sum(s):
    """Return the hexadecimal MD5 digest of string *s* (UTF-8 encoded)."""
    return hashlib.md5(s.encode('utf-8')).hexdigest()
def get_build_binary_dir(library_name, target_abi):
    """Return the temporary build-output directory for one library/ABI pair."""
    return "/".join([BUILD_OUTPUT_DIR, library_name,
                     BUILD_TMP_DIR_NAME, target_abi])
def get_model_lib_output_path(library_name, abi):
    """Return the path of the generated model static library for *abi*."""
    return os.path.join(BUILD_OUTPUT_DIR, library_name,
                        MODEL_OUTPUT_DIR_NAME, abi,
                        library_name + ".a")
def check_model_converted(library_name, model_name,
                          model_graph_format, model_data_format,
                          abi):
    """Abort (via mace_check) unless the converted model artifacts exist.

    The graph artifact is either a ``<model_name>.pb`` file or the model
    static library, depending on *model_graph_format*; the weights are
    checked only when *model_data_format* is file-based.
    """
    output_dir = '%s/%s/%s' % (BUILD_OUTPUT_DIR, library_name,
                               MODEL_OUTPUT_DIR_NAME)
    if model_graph_format == ModelFormat.file:
        graph_path = "%s/%s.pb" % (output_dir, model_name)
        mace_check(os.path.exists(graph_path),
                   ModuleName.RUN,
                   "You should convert model first.")
    else:
        lib_path = get_model_lib_output_path(library_name, abi)
        mace_check(os.path.exists(lib_path),
                   ModuleName.RUN,
                   "You should convert model first.")
    if model_data_format == ModelFormat.file:
        data_path = "%s/%s.data" % (output_dir, model_name)
        mace_check(os.path.exists(data_path),
                   ModuleName.RUN,
                   "You should convert model first.")
def parse_device_type(runtime):
    """Map a RuntimeType name onto the matching DeviceType.

    Unrecognized runtimes yield an empty string, as before.
    """
    runtime_to_device = {
        RuntimeType.dsp: DeviceType.HEXAGON,
        RuntimeType.gpu: DeviceType.GPU,
        RuntimeType.cpu: DeviceType.CPU,
    }
    return runtime_to_device.get(runtime, "")
def sha256_checksum(fname):
    """Return the hex SHA-256 digest of file *fname*, read in 4 KiB chunks."""
    digest = hashlib.sha256()
    with open(fname, "rb") as fp:
        while True:
            chunk = fp.read(4096)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()
def get_model_files(model_file_path,
                    model_sha256_checksum,
                    model_output_dir,
                    weight_file_path="",
                    weight_sha256_checksum=""):
    """Resolve the model graph and (optional) weight files, downloading
    remote ones into *model_output_dir*.

    Remote files (http/https URLs) are cached under a name derived from
    the MD5 of the URL, so re-runs skip the download when the cached copy
    already matches *model_sha256_checksum*.  A checksum mismatch after
    download is reported through MaceLogger.error.

    Returns:
        (model_file, weight_file) local paths; weight_file is "" when no
        weight path was given.
    """
    model_file = model_file_path
    weight_file = weight_file_path

    if model_file_path.startswith("http://") or \
            model_file_path.startswith("https://"):
        model_file = model_output_dir + "/" + md5sum(model_file_path) + ".pb"
        # Re-download only when the cache is missing or stale.
        if not os.path.exists(model_file) or \
                sha256_checksum(model_file) != model_sha256_checksum:
            MaceLogger.info("Downloading model, please wait ...")
            six.moves.urllib.request.urlretrieve(model_file_path, model_file)
            MaceLogger.info("Model downloaded successfully.")
        if sha256_checksum(model_file) != model_sha256_checksum:
            MaceLogger.error(ModuleName.MODEL_CONVERTER,
                             "model file sha256checksum not match")

    if weight_file_path.startswith("http://") or \
            weight_file_path.startswith("https://"):
        weight_file = \
            model_output_dir + "/" + md5sum(weight_file_path) + ".caffemodel"
        if not os.path.exists(weight_file) or \
                sha256_checksum(weight_file) != weight_sha256_checksum:
            MaceLogger.info("Downloading model weight, please wait ...")
            six.moves.urllib.request.urlretrieve(weight_file_path, weight_file)
            MaceLogger.info("Model weight downloaded successfully.")

    # Unlike the graph, the weight checksum is validated for local files
    # too -- any non-empty weight path is checked.
    if weight_file:
        if sha256_checksum(weight_file) != weight_sha256_checksum:
            MaceLogger.error(ModuleName.MODEL_CONVERTER,
                             "weight file sha256checksum not match")
    return model_file, weight_file
def get_opencl_binary_output_path(library_name, target_abi, device):
    """Return the output path for the compiled OpenCL kernel binary of
    *library_name* on the given device (keyed by device model and SoC)."""
    return '%s/%s/%s/%s/%s_%s.%s.%s.bin' % (
        BUILD_OUTPUT_DIR,
        library_name,
        OUTPUT_OPENCL_BINARY_DIR_NAME,
        target_abi,
        library_name,
        OUTPUT_OPENCL_BINARY_FILE_NAME,
        device.models,
        device.target_socs)
def get_opencl_parameter_output_path(library_name, target_abi, device):
    """Return the output path for the tuned OpenCL parameter file of
    *library_name* on the given device (keyed by device model and SoC)."""
    return '%s/%s/%s/%s/%s_%s.%s.%s.bin' % (
        BUILD_OUTPUT_DIR,
        library_name,
        OUTPUT_OPENCL_BINARY_DIR_NAME,
        target_abi,
        library_name,
        OUTPUT_OPENCL_PARAMETER_FILE_NAME,
        device.models,
        device.target_socs)
def get_build_model_dirs(library_name,
                         model_name,
                         target_abi,
                         device,
                         model_file_path):
    """Compute the build directories for one model.

    Returns:
        (model_output_base_dir, model_output_dir, mace_model_dir) where
        the base dir is keyed by the MD5 of *model_file_path*, and the
        per-run output dir additionally depends on the ABI and, when a
        concrete device is known, on its model/SoC.
    """
    digest = md5sum(model_file_path)
    base_dir = '/'.join([BUILD_OUTPUT_DIR, library_name,
                         BUILD_TMP_DIR_NAME, model_name, digest])
    if target_abi == ABIType.host:
        output_dir = '%s/%s' % (base_dir, target_abi)
    elif not device.target_socs or not device.address:
        # No concrete device to key on: use the generic output dir.
        output_dir = '%s/%s/%s' % (base_dir,
                                   BUILD_TMP_GENERAL_OUTPUT_DIR_NAME,
                                   target_abi)
    else:
        output_dir = '%s/%s_%s/%s' % (base_dir,
                                      device.models,
                                      device.target_socs,
                                      target_abi)
    mace_model_dir = '%s/%s/%s' % (BUILD_OUTPUT_DIR, library_name,
                                   MODEL_OUTPUT_DIR_NAME)
    return base_dir, output_dir, mace_model_dir
def abi_to_internal(abi):
    """Translate a user-facing ABI name to the internal bazel ``--cpu`` value.

    Android ABIs pass through unchanged; the generic ARM-Linux aliases
    are mapped onto the CPU names the cross toolchains expect
    (``arm64`` -> ``aarch64``, ``armhf`` -> ``armeabi-v7a``).

    Fix: the original fell off the end and returned None for any other
    ABI (e.g. ABIType.host), which downstream would render as a bogus
    ``--cpu=None`` flag.  Unknown ABIs now map to themselves.
    """
    if abi in (ABIType.armeabi_v7a, ABIType.arm64_v8a):
        return abi
    if abi == ABIType.arm64:
        return ABIType.aarch64
    if abi == ABIType.armhf:
        return ABIType.armeabi_v7a
    # Fallback for host and any future ABI: pass through unchanged
    # instead of silently returning None.
    return abi
def infer_toolchain(abi):
    """Pick the bazel toolchain config matching *abi* ('' when none applies)."""
    if abi in (ABIType.armeabi_v7a, ABIType.arm64_v8a):
        return ToolchainType.android
    linux_toolchains = {
        ABIType.armhf: ToolchainType.arm_linux_gnueabihf,
        ABIType.arm64: ToolchainType.aarch64_linux_gnu,
    }
    return linux_toolchains.get(abi, '')
################################
# YAML key word
################################
class YAMLKeyword(object):
    """String constants for the keys recognized in the model/device YAML
    configuration files."""
    # Library-level build settings.
    library_name = 'library_name'
    target_abis = 'target_abis'
    target_socs = 'target_socs'
    model_graph_format = 'model_graph_format'
    model_data_format = 'model_data_format'
    models = 'models'
    platform = 'platform'
    # Device description keys (devices.yml).
    device_name = 'device_name'
    system = 'system'
    address = 'address'
    username = 'username'
    password = 'password'
    # Per-model source files and their checksums.
    model_file_path = 'model_file_path'
    model_sha256_checksum = 'model_sha256_checksum'
    weight_file_path = 'weight_file_path'
    weight_sha256_checksum = 'weight_sha256_checksum'
    # Subgraph (input/output tensor) description.
    subgraphs = 'subgraphs'
    input_tensors = 'input_tensors'
    input_shapes = 'input_shapes'
    input_ranges = 'input_ranges'
    output_tensors = 'output_tensors'
    output_shapes = 'output_shapes'
    check_tensors = 'check_tensors'
    check_shapes = 'check_shapes'
    # Runtime / conversion options.
    runtime = 'runtime'
    data_type = 'data_type'
    input_data_types = 'input_data_types'
    input_data_formats = 'input_data_formats'
    output_data_formats = 'output_data_formats'
    limit_opencl_kernel_time = 'limit_opencl_kernel_time'
    nnlib_graph_mode = 'nnlib_graph_mode'
    obfuscate = 'obfuscate'
    winograd = 'winograd'
    quantize = 'quantize'
    quantize_range_file = 'quantize_range_file'
    change_concat_ranges = 'change_concat_ranges'
    # Validation settings.
    validation_inputs_data = 'validation_inputs_data'
    validation_threshold = 'validation_threshold'
    graph_optimize_options = 'graph_optimize_options'  # internal use for now
    cl_mem_type = 'cl_mem_type'
################################
# SystemType
################################
class SystemType:
    """Operating-system category of a target device."""
    host = 'host'            # the build machine itself
    android = 'android'      # Android device (adb)
    arm_linux = 'arm_linux'  # generic ARM-Linux device
################################
# common device str
################################
# Working directory pushed to on Android devices via adb.
PHONE_DATA_DIR = '/data/local/tmp/mace_run'
# Working directory on non-Android devices (presumably ARM-Linux boards,
# matching the DEVICE_* naming -- confirm against device.py usage).
DEVICE_DATA_DIR = '/tmp/data/mace_run'
# On-device internal storage dir (see MACE_INTERNAL_STORAGE_PATH usage).
DEVICE_INTERIOR_DIR = PHONE_DATA_DIR + "/interior"

# Layout of the local `builds/` output tree.
BUILD_OUTPUT_DIR = 'builds'
BUILD_TMP_DIR_NAME = '_tmp'
BUILD_DOWNLOADS_DIR = BUILD_OUTPUT_DIR + '/downloads'
BUILD_TMP_GENERAL_OUTPUT_DIR_NAME = 'general'
MODEL_OUTPUT_DIR_NAME = 'model'

# Bazel targets / binary names of the example, mace_run, and benchmark tools.
EXAMPLE_STATIC_NAME = "example_static"
EXAMPLE_DYNAMIC_NAME = "example_dynamic"
EXAMPLE_STATIC_TARGET = "//mace/examples/cli:" + EXAMPLE_STATIC_NAME
EXAMPLE_DYNAMIC_TARGET = "//mace/examples/cli:" + EXAMPLE_DYNAMIC_NAME
MACE_RUN_STATIC_NAME = "mace_run_static"
MACE_RUN_DYNAMIC_NAME = "mace_run_dynamic"
MACE_RUN_STATIC_TARGET = "//mace/tools/validation:" + MACE_RUN_STATIC_NAME
MACE_RUN_DYNAMIC_TARGET = "//mace/tools/validation:" + MACE_RUN_DYNAMIC_NAME

# OpenCL compiled-kernel / tuning artifact names and directories.
CL_COMPILED_BINARY_FILE_NAME = "mace_cl_compiled_program.bin"
BUILD_TMP_OPENCL_BIN_DIR = 'opencl_bin'
LIBMACE_DYNAMIC_PATH = "bazel-bin/mace/libmace/libmace.so"
CL_TUNED_PARAMETER_FILE_NAME = "mace_run.config"
MODEL_HEADER_DIR_PATH = 'include/mace/public'
OUTPUT_LIBRARY_DIR_NAME = 'lib'
OUTPUT_OPENCL_BINARY_DIR_NAME = 'opencl'
OUTPUT_OPENCL_BINARY_FILE_NAME = 'compiled_opencl_kernel'
OUTPUT_OPENCL_PARAMETER_FILE_NAME = 'tuned_opencl_parameter'

# Code-generation directories and library bazel targets / output paths.
CODEGEN_BASE_DIR = 'mace/codegen'
MODEL_CODEGEN_DIR = CODEGEN_BASE_DIR + '/models'
ENGINE_CODEGEN_DIR = CODEGEN_BASE_DIR + '/engine'
LIB_CODEGEN_DIR = CODEGEN_BASE_DIR + '/lib'
LIBMACE_SO_TARGET = "//mace/libmace:libmace.so"
LIBMACE_STATIC_TARGET = "//mace/libmace:libmace_static"
LIBMACE_STATIC_PATH = "bazel-genfiles/mace/libmace/libmace.a"
MODEL_LIB_TARGET = "//mace/codegen:generated_models"
MODEL_LIB_PATH = "bazel-genfiles/mace/codegen/libgenerated_models.a"
QUANTIZE_STAT_TARGET = "//mace/tools/quantization:quantize_stat"
BM_MODEL_STATIC_NAME = "benchmark_model_static"
BM_MODEL_DYNAMIC_NAME = "benchmark_model_dynamic"
BM_MODEL_STATIC_TARGET = "//mace/benchmark:" + BM_MODEL_STATIC_NAME
BM_MODEL_DYNAMIC_TARGET = "//mace/benchmark:" + BM_MODEL_DYNAMIC_NAME

# Wildcard SoC tag used in place of a concrete target_socs value.
ALL_SOC_TAG = 'all'
################################
# Model File Format
################################
class ModelFormat(object):
    """How a converted model's graph/data are packaged."""
    file = 'file'  # standalone .pb / .data files
    code = 'code'  # built into the model static library (see check_model_converted)
################################
# ABI Type
################################
class ABIType(object):
    """ABI names accepted from the command line / YAML config."""
    armeabi_v7a = 'armeabi-v7a'  # Android 32-bit ARM
    arm64_v8a = 'arm64-v8a'      # Android 64-bit ARM
    arm64 = 'arm64'              # ARM-Linux 64-bit (internal name: aarch64)
    aarch64 = 'aarch64'          # internal --cpu name for arm64 (abi_to_internal)
    armhf = 'armhf'              # ARM-Linux 32-bit hard-float
    host = 'host'                # the build machine
################################
# Module name
################################
class ModuleName(object):
    """Module tags used when reporting errors (e.g. via mace_check)."""
    YAML_CONFIG = 'YAML CONFIG'
    MODEL_CONVERTER = 'Model Converter'
    RUN = 'RUN'
    BENCHMARK = 'Benchmark'
#################################
# mace lib type
#################################
class MACELibType(object):
    """Linkage type of the MACE library used by a build."""
    static = 0
    dynamic = 1
#################################
# Run time type
#################################
class RuntimeType(object):
    """Compute runtime names accepted in the YAML 'runtime' key."""
    cpu = 'cpu'
    gpu = 'gpu'
    dsp = 'dsp'          # maps to DeviceType.HEXAGON (see parse_device_type)
    cpu_gpu = 'cpu+gpu'
#################################
# Tool chain Type
#################################
class ToolchainType:
    """Bazel --config toolchain names used for cross compilation
    (see infer_toolchain)."""
    android = 'android'
    arm_linux_gnueabihf = 'arm_linux_gnueabihf'
    aarch64_linux_gnu = 'aarch64_linux_gnu'
This diff is collapsed.
This diff is collapsed.
import argparse
import os
import sys
import six
import tensorflow as tf
# TODO(liyin): use dataset api and estimator with distributed strategy
......@@ -70,7 +73,7 @@ def images_to_tensors(input_files, image_shape, mean_values=None):
def main(unused_args):
if not os.path.exists(FLAGS.input):
print ("input does not exist: %s" % FLAGS.input)
print("input does not exist: %s" % FLAGS.input)
sys.exit(-1)
input_files = []
......
import argparse
import os
import sys
import six
import numpy as np
import tensorflow as tf
......@@ -53,7 +56,7 @@ def tensors_to_images(input_files, image_shape):
def main(unused_args):
if not os.path.exists(FLAGS.input):
print ("input does not exist: %s" % FLAGS.input)
print("input does not exist: %s" % FLAGS.input)
sys.exit(-1)
input_files = []
......
......@@ -23,13 +23,16 @@ import struct
import subprocess
import sys
import time
import urllib
import platform
from enum import Enum
import six
import common
from common import ModelFormat
from common import ABIType
from common import SystemType
from common import YAMLKeyword
from common import abi_to_internal
sys.path.insert(0, "mace/python/tools")
try:
......@@ -89,11 +92,6 @@ class BuildType(object):
code = 'code'
class ModelFormat(object):
file = 'file'
code = 'code'
def stdout_success(stdout):
stdout_lines = stdout.split("\n")
for line in stdout_lines:
......@@ -190,7 +188,7 @@ def adb_pull(src_path, dst_path, serialno):
try:
sh.adb("-s", serialno, "pull", src_path, dst_path)
except Exception as e:
six.print_("Error msg: %s" % e.stderr)
six.print_("Error msg: %s" % e, file=sys.stderr)
def adb_run(abi,
......@@ -293,7 +291,7 @@ def find_asan_rt_library(abi, asan_rt_path=''):
if len(candidates) == 0:
common.MaceLogger.error(
"Toolchain",
"Can't find AddressSanitizer runtime library in % s" %
"Can't find AddressSanitizer runtime library in %s" %
find_path)
elif len(candidates) > 1:
common.MaceLogger.info(
......@@ -338,6 +336,7 @@ def find_simpleperf_library(abi, simpleperf_path=''):
################################
def bazel_build(target,
abi="armeabi-v7a",
toolchain='android',
hexagon_mode=False,
enable_openmp=True,
enable_neon=True,
......@@ -361,8 +360,8 @@ def bazel_build(target,
"build",
target,
"--config",
"android",
"--cpu=%s" % abi,
toolchain,
"--cpu=%s" % abi_to_internal(abi),
"--define",
"neon=%s" % str(enable_neon).lower(),
"--define",
......@@ -694,230 +693,20 @@ def push_depended_so_libs(libmace_dynamic_library_path,
for dep in split_stdout(dep_so_libs):
if dep == "libgnustl_shared.so":
adb_push(
"%s/sources/cxx-stl/gnu-libstdc++/4.9/libs/%s/libgnustl_shared.so" # noqa
% (os.environ["ANDROID_NDK_HOME"], abi),
phone_data_dir,
serialno)
"%s/sources/cxx-stl/gnu-libstdc++/4.9/libs/%s/libgnustl_shared.so" # noqa
% (os.environ["ANDROID_NDK_HOME"], abi),
phone_data_dir,
serialno)
elif dep == "libc++_shared.so":
adb_push(
"%s/sources/cxx-stl/llvm-libc++/libs/%s/libc++_shared.so" # noqa
% (os.environ["ANDROID_NDK_HOME"], abi),
phone_data_dir,
serialno)
def tuning_run(abi,
               serialno,
               target_dir,
               target_name,
               vlog_level,
               embed_model_data,
               model_output_dir,
               input_nodes,
               output_nodes,
               input_shapes,
               output_shapes,
               mace_model_dir,
               model_tag,
               device_type,
               running_round,
               restart_round,
               limit_opencl_kernel_time,
               tuning,
               out_of_range_check,
               phone_data_dir,
               model_graph_format,
               opencl_binary_file,
               opencl_parameter_file,
               libmace_dynamic_library_path,
               omp_num_threads=-1,
               cpu_affinity_policy=1,
               gpu_perf_hint=3,
               gpu_priority_hint=3,
               input_file_name="model_input",
               output_file_name="model_out",
               input_dir="",
               output_dir="",
               runtime_failure_ratio=0.0,
               address_sanitizer=False,
               link_dynamic=False,
               quantize_stat=False):
    """Run (and optionally tune) a converted model and return its stdout.

    For ``abi == "host"`` the runner binary is executed locally through
    subprocess.  Otherwise all required artifacts (input files, model
    data, OpenCL binaries, shared libraries, and the runner binary) are
    pushed to the device over adb and executed there via a generated
    shell script.  Calls common.MaceLogger.error (which aborts) when the
    on-device run does not report success.
    """
    six.print_("* Run '%s' with round=%s, restart_round=%s, tuning=%s, "
               "out_of_range_check=%s, omp_num_threads=%s, "
               "cpu_affinity_policy=%s, gpu_perf_hint=%s, "
               "gpu_priority_hint=%s" %
               (model_tag, running_round, restart_round, str(tuning),
                str(out_of_range_check), omp_num_threads, cpu_affinity_policy,
                gpu_perf_hint, gpu_priority_hint))
    sys.stdout.flush()

    mace_model_path = ""
    if model_graph_format == ModelFormat.file:
        mace_model_path = "%s/%s.pb" % (mace_model_dir, model_tag)

    if abi == "host":
        # Host run: invoke the binary directly, pointing LD_LIBRARY_PATH
        # at the directory containing the dynamic libmace.
        libmace_dynamic_lib_path = \
            os.path.dirname(libmace_dynamic_library_path)
        cmd = [
            "env",
            "LD_LIBRARY_PATH=%s" % libmace_dynamic_lib_path,
            "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level,
            "MACE_RUNTIME_FAILURE_RATIO=%f" % runtime_failure_ratio,
        ]
        if quantize_stat:
            cmd.append("MACE_LOG_TENSOR_RANGE=1")
        cmd.extend([
            "%s/%s" % (target_dir, target_name),
            "--model_name=%s" % model_tag,
            "--input_node=%s" % ",".join(input_nodes),
            "--output_node=%s" % ",".join(output_nodes),
            "--input_shape=%s" % ":".join(input_shapes),
            "--output_shape=%s" % ":".join(output_shapes),
            "--input_file=%s/%s" % (model_output_dir, input_file_name),
            "--output_file=%s/%s" % (model_output_dir, output_file_name),
            "--input_dir=%s" % input_dir,
            "--output_dir=%s" % output_dir,
            "--model_data_file=%s/%s.data" % (mace_model_dir, model_tag),
            "--device=%s" % device_type,
            "--round=%s" % running_round,
            "--restart_round=%s" % restart_round,
            "--omp_num_threads=%s" % omp_num_threads,
            "--cpu_affinity_policy=%s" % cpu_affinity_policy,
            "--gpu_perf_hint=%s" % gpu_perf_hint,
            "--gpu_priority_hint=%s" % gpu_priority_hint,
            "--model_file=%s" % mace_model_path,
        ])
        p = subprocess.Popen(
            cmd,
            stderr=subprocess.PIPE,
            stdout=subprocess.PIPE)
        out, err = p.communicate()
        # Combine the streams (stderr first) so errors show up in the log.
        stdout = err + out
        six.print_(stdout)
        six.print_("Running finished!\n")
    else:
        # Device run: stage all artifacts under phone_data_dir via adb.
        sh.adb("-s", serialno, "shell", "mkdir", "-p", phone_data_dir)
        internal_storage_dir = create_internal_storage_dir(
            serialno, phone_data_dir)

        # Push one formatted input file per input node.
        for input_name in input_nodes:
            formatted_name = common.formatted_file_name(input_file_name,
                                                        input_name)
            adb_push("%s/%s" % (model_output_dir, formatted_name),
                     phone_data_dir, serialno)
        if address_sanitizer:
            adb_push(find_asan_rt_library(abi), phone_data_dir, serialno)

        if not embed_model_data:
            adb_push("%s/%s.data" % (mace_model_dir, model_tag),
                     phone_data_dir, serialno)

        if device_type == common.DeviceType.GPU:
            if os.path.exists(opencl_binary_file):
                adb_push(opencl_binary_file, phone_data_dir, serialno)
            if os.path.exists(opencl_parameter_file):
                adb_push(opencl_parameter_file, phone_data_dir, serialno)

        # NOTE(review): the Hexagon DSP controller library is pushed
        # unconditionally here; confirm whether a
        # `device_type == HEXAGON` guard was intended.
        adb_push("third_party/nnlib/libhexagon_controller.so",
                 phone_data_dir, serialno)

        mace_model_phone_path = ""
        if model_graph_format == ModelFormat.file:
            mace_model_phone_path = "%s/%s.pb" % (phone_data_dir, model_tag)
            adb_push(mace_model_path,
                     mace_model_phone_path,
                     serialno)

        if link_dynamic:
            adb_push(libmace_dynamic_library_path, phone_data_dir,
                     serialno)
            push_depended_so_libs(libmace_dynamic_library_path, abi,
                                  phone_data_dir, serialno)

        adb_push("%s/%s" % (target_dir, target_name), phone_data_dir,
                 serialno)

        stdout_buff = []
        process_output = make_output_processor(stdout_buff)

        # Environment variables for the on-device process.
        adb_cmd = [
            "LD_LIBRARY_PATH=%s" % phone_data_dir,
            "MACE_TUNING=%s" % int(tuning),
            "MACE_OUT_OF_RANGE_CHECK=%s" % int(out_of_range_check),
            "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level,
            "MACE_RUN_PARAMETER_PATH=%s/mace_run.config" % phone_data_dir,
            "MACE_INTERNAL_STORAGE_PATH=%s" % internal_storage_dir,
            "MACE_LIMIT_OPENCL_KERNEL_TIME=%s" % limit_opencl_kernel_time,
            "MACE_RUNTIME_FAILURE_RATIO=%f" % runtime_failure_ratio,
        ]
        if quantize_stat:
            adb_cmd.append("MACE_LOG_TENSOR_RANGE=1")
        if address_sanitizer:
            adb_cmd.extend([
                "LD_PRELOAD=%s/%s" % (phone_data_dir,
                                      asan_rt_library_names(abi))
            ])
        adb_cmd.extend([
            "%s/%s" % (phone_data_dir, target_name),
            "--model_name=%s" % model_tag,
            "--input_node=%s" % ",".join(input_nodes),
            "--output_node=%s" % ",".join(output_nodes),
            "--input_shape=%s" % ":".join(input_shapes),
            "--output_shape=%s" % ":".join(output_shapes),
            "--input_file=%s/%s" % (phone_data_dir, input_file_name),
            "--output_file=%s/%s" % (phone_data_dir, output_file_name),
            "--input_dir=%s" % input_dir,
            "--output_dir=%s" % output_dir,
            "--model_data_file=%s/%s.data" % (phone_data_dir, model_tag),
            "--device=%s" % device_type,
            "--round=%s" % running_round,
            "--restart_round=%s" % restart_round,
            "--omp_num_threads=%s" % omp_num_threads,
            "--cpu_affinity_policy=%s" % cpu_affinity_policy,
            "--gpu_perf_hint=%s" % gpu_perf_hint,
            "--gpu_priority_hint=%s" % gpu_priority_hint,
            "--model_file=%s" % mace_model_phone_path,
            "--opencl_binary_file=%s/%s" %
            (phone_data_dir, os.path.basename(opencl_binary_file)),
            "--opencl_parameter_file=%s/%s" %
            (phone_data_dir, os.path.basename(opencl_parameter_file)),
        ])
        adb_cmd = ' '.join(adb_cmd)
        # Execute through a pushed script file: sidesteps shell-quoting
        # issues and command-length limits of a direct `adb shell <cmd>`.
        cmd_file_name = "%s-%s-%s" % ('cmd_file', model_tag, str(time.time()))
        adb_cmd_file = "%s/%s" % (phone_data_dir, cmd_file_name)
        tmp_cmd_file = "%s/%s" % ('/tmp', cmd_file_name)
        with open(tmp_cmd_file, 'w') as cmd_file:
            cmd_file.write(adb_cmd)
        adb_push(tmp_cmd_file, adb_cmd_file, serialno)
        os.remove(tmp_cmd_file)

        sh.adb(
            "-s",
            serialno,
            "shell",
            "sh",
            adb_cmd_file,
            _tty_in=True,
            _out=process_output,
            _err_to_out=True)
        stdout = "".join(stdout_buff)
        if not stdout_success(stdout):
            common.MaceLogger.error("Mace Run", "Mace run failed.")

        # Clean up the pushed command script.
        sh.adb(
            "-s",
            serialno,
            "shell",
            "rm",
            adb_cmd_file,
            _fg=True)

        six.print_("Running finished!\n")

    sys.stdout.flush()
    return stdout
"%s/sources/cxx-stl/llvm-libc++/libs/%s/libc++_shared.so" # noqa
% (os.environ["ANDROID_NDK_HOME"], abi),
phone_data_dir,
serialno)
def validate_model(abi,
serialno,
device,
model_file_path,
weight_file_path,
platform,
......@@ -927,7 +716,6 @@ def validate_model(abi,
input_shapes,
output_shapes,
model_output_dir,
phone_data_dir,
input_data_types,
caffe_env,
input_file_name="model_input",
......@@ -941,8 +729,7 @@ def validate_model(abi,
if os.path.exists("%s/%s" % (model_output_dir,
formatted_name)):
sh.rm("-rf", "%s/%s" % (model_output_dir, formatted_name))
adb_pull("%s/%s" % (phone_data_dir, formatted_name),
model_output_dir, serialno)
device.pull_from_data_dir(formatted_name, model_output_dir)
if platform == "tensorflow":
validate(platform, model_file_path, "",
......@@ -956,11 +743,10 @@ def validate_model(abi,
container_name = "mace_caffe_validator"
if caffe_env == common.CaffeEnvType.LOCAL:
import imp
try:
imp.find_module('caffe')
import caffe
except ImportError:
logger.error('There is no caffe python module.')
logging.error('There is no caffe python module.')
validate(platform, model_file_path, weight_file_path,
"%s/%s" % (model_output_dir, input_file_name),
"%s/%s" % (model_output_dir, output_file_name),
......@@ -1157,8 +943,8 @@ def benchmark_model(abi,
if link_dynamic:
adb_push(libmace_dynamic_library_path, phone_data_dir,
serialno)
push_depended_so_lib(libmace_dynamic_library_path, abi,
phone_data_dir, serialno)
push_depended_so_libs(libmace_dynamic_library_path, abi,
phone_data_dir, serialno)
adb_push("%s/%s" % (benchmark_binary_dir, benchmark_binary_name),
phone_data_dir,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
To comment, please register.