diff --git a/docs/user_guide/advanced_usage.rst b/docs/user_guide/advanced_usage.rst index 44d8b788653a1084b8278edcc6f67fd129489afb..3cae38d7df6930f6512aa4ac8f1b2d7e0934720a 100644 --- a/docs/user_guide/advanced_usage.rst +++ b/docs/user_guide/advanced_usage.rst @@ -114,69 +114,60 @@ Advanced usage -------------- There are three common advanced use cases: - - run your model on the embedded device + - run your model on the embedded device (ARM Linux) - converting model to C++ code. - tuning GPU kernels for a specific SoC. -Run you model on the embedded device ------------------- +Run your model on the embedded device (ARM Linux) +-------------------------------------------------- -MACE use ssh to connect embedded device, in this case we recommend you to push ``$HOME/.ssh/id_rsa.pub`` -to your device ``$HOME/.ssh/authorized_keys`` +The way to run your model on ARM Linux is nearly the same as on Android, except that you need to specify a device config file. .. code:: bash - cat ~/.ssh/id_rsa.pub | ssh -q {user}@{ip} "cat >> ~/.ssh/authorized_keys" + python tools/converter.py run --config=/path/to/your/model_deployment_file.yml --device_yml=/path/to/devices.yml -This part will show you how to write your own device yaml config file. +There are two steps to do before running: -**Device yaml config file** +1. configure passwordless login -The way to run your model on the embedded device is nearly the same as run on android, except you need give a device yaml config file. + MACE uses SSH to connect to the embedded device, so you should copy your public key to the device with the command below. -MACE get this yaml config via ``--device_yml`` argument, default config value is ``devices.yml`` -, when the yaml config file is not found. we treat as there is no available arm linux device, give a message -and continue on other device such as plugged android phone. - -* **Example** + .. code:: bash - Here is an device yaml config demo. + cat ~/.ssh/id_rsa.pub | ssh -q {user}@{ip} "cat >> ~/.ssh/authorized_keys" - .. literalinclude:: devices/demo_device_nanopi.yml - :language: yaml +2. write your own device YAML configuration file. -* **Configuration** - -.. list-table:: - :header-rows: 1 + * **Example** - * - Options - - Usage - * - target_abis - - Device supported abis, you can get it via ``dpkg --print-architecture`` and - ``dpkg --print-foreign-architectures`` command, if more than one abi is supported, - separate them by commas. - * - target_socs - - device soc, you can get it from device manual, we haven't found a way to get it in shell. - * - models - - device models full name, you can get via get ``lshw`` command (third party package, install it via your package manager). - see it's product value. - * - address - - Since we use ssh to connect device, ip address is required. - * - username - - login username, required. - * - password - - login password, optional when you can login into device without password + Here is a device YAML config demo. + .. literalinclude:: devices/demo_device_nanopi.yml + :language: yaml -.. note:: + * **Configuration** + The detailed explanation is listed in the table below. - Some command tools: + .. list-table:: + :header-rows: 1 - .. code:: bash + * - Options + - Usage + * - target_abis + - The ABIs supported by the device; you can get them via the ``dpkg --print-architecture`` and + ``dpkg --print-foreign-architectures`` commands. If more than one ABI is supported, + separate them by commas. + * - target_socs + - The device SoC; you can get it from the device manual (we haven't found a way to get it in the shell).
+ * - models + - The device model's full name; you can get it via the ``lshw`` command (a third-party package, install it via your package manager). + See its product value. + * - address + - Since we use SSH to connect to the device, the IP address is required. + * - username + - Login username, required. - - # specify device yaml config file via --device_yml argument or put the file under working directory - python tools/converter.py run --config=/path/to/mace-models/mobilenet-v2/mobilenet-v2.yml --device_yml=/path/to/devices.yml Convert model(s) to C++ code -------------------------------- diff --git a/docs/user_guide/basic_usage.rst b/docs/user_guide/basic_usage.rst index 63b8968b6f1579b6a7685fd0f32905372819f508..0ceef451864f7bbda8c996ffe5ccff2e77dbd650 100644 --- a/docs/user_guide/basic_usage.rst +++ b/docs/user_guide/basic_usage.rst @@ -246,13 +246,14 @@ to run and validate your model. # Test model run time python tools/converter.py run --config=/path/to/your/model_deployment_file.yml --round=100 - # If you want to run model on specified arm linux device, you should put device config file in the working directory or run with flag `--device_yml` - python tools/converter.py run --config=/path/to/mace-models/mobilenet-v2/mobilenet-v2.yml --device_yml=/path/to/devices.yml --example - # Validate the correctness by comparing the results against the # original model and framework, measured with cosine distance for similarity. python tools/converter.py run --config=/path/to/your/model_deployment_file.yml --validate + # If you want to run the model on a specified ARM Linux device, you should put the device config file in the working directory or run with the flag `--device_yml` + python tools/converter.py run --config=/path/to/your/model_deployment_file.yml --device_yml=/path/to/devices.yml + + * **benchmark** benchmark and profile the model.
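For reference, a minimal ``devices.yml`` matching the fields documented in the advanced_usage hunk above might look like the sketch below. It is assembled from the demo device file changed in the next hunk; the address and username are placeholders.

.. code:: yaml

    devices:
      raspberry:
        # ABIs supported by the device (dpkg --print-architecture)
        target_abis: [armv7l]
        # SoC name, taken from the device manual
        target_socs: BCM2837
        # full device model name, as reported by lshw
        models: Raspberry Pi 3 Model B Plus Rev 1.3
        # IP address reachable over SSH
        address: 10.0.0.1
        # SSH login user; passwordless login must already be configured
        username: user

Pass it to the tools with ``--device_yml=/path/to/devices.yml``, or place it as ``devices.yml`` in the working directory.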
diff --git a/docs/user_guide/devices/demo_device_nanopi.yml b/docs/user_guide/devices/demo_device_nanopi.yml index 567f7c7e1ce08af39134527d9eae825a688cb76f..a2558624a0f343191deb94f926770bcf29794a5a 100644 --- a/docs/user_guide/devices/demo_device_nanopi.yml +++ b/docs/user_guide/devices/demo_device_nanopi.yml @@ -12,12 +12,9 @@ devices: address: 10.0.0.0 # login username username: user - # login password, is required when you can login into device without password - password: 1234567 raspberry: target_abis: [armv7l] target_socs: BCM2837 models: Raspberry Pi 3 Model B Plus Rev 1.3 address: 10.0.0.1 username: user - password: 123456 diff --git a/mace/core/runtime/cpu/cpu_runtime.cc b/mace/core/runtime/cpu/cpu_runtime.cc index ce50595412c7b24a148c02b7b261d20f344a9c72..67215b8dd430ff05d013f2d71c4e6fc7f1533d7e 100644 --- a/mace/core/runtime/cpu/cpu_runtime.cc +++ b/mace/core/runtime/cpu/cpu_runtime.cc @@ -42,7 +42,7 @@ struct CPUFreq { }; namespace { -#if defined(__ANDROID__) + int GetCPUCount() { int cpu_count = 0; std::string cpu_sys_conf = "/proc/cpuinfo"; @@ -69,10 +69,8 @@ int GetCPUCount() { VLOG(2) << "CPU cores: " << cpu_count; return cpu_count; } -#endif int GetCPUMaxFreq(std::vector *max_freqs) { -#if defined(__ANDROID__) int cpu_count = GetCPUCount(); for (int cpu_id = 0; cpu_id < cpu_count; ++cpu_id) { std::string cpuinfo_max_freq_sys_conf = MakeString( @@ -94,34 +92,6 @@ int GetCPUMaxFreq(std::vector *max_freqs) { } f.close(); } -#else - std::string cpu_sys_conf = "/proc/cpuinfo"; - std::ifstream f(cpu_sys_conf); - if (!f.is_open()) { - LOG(ERROR) << "failed to open " << cpu_sys_conf; - return -1; - } - std::string line; - const std::string freq_key = "cpu MHz"; - while (std::getline(f, line)) { - if (line.size() >= freq_key.size() - && line.compare(0, freq_key.size(), freq_key) == 0) { - size_t pos = line.find(":"); - if (pos != std::string::npos) { - std::string freq_str = line.substr(pos + 1); - float freq = atof(freq_str.c_str()); - max_freqs->push_back(freq); - } - } - } - if (f.bad()) { - LOG(ERROR) << "failed to read " << cpu_sys_conf; - } - if (!f.eof()) { - LOG(ERROR) << "failed to read end of " << cpu_sys_conf; - } - f.close(); -#endif for (float freq : *max_freqs) { VLOG(2) << "CPU freq: " << freq; diff --git a/mace/python/tools/memory_optimizer.py b/mace/python/tools/memory_optimizer.py deleted file mode 100644 index 5b644779470c0271689b2d341116f648a316fe86..0000000000000000000000000000000000000000 --- a/mace/python/tools/memory_optimizer.py +++ /dev/null @@ -1,350 +0,0 @@ -# Copyright 2018 Xiaomi, Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import sys -import operator - -import six -from six.moves import reduce - -from mace.proto import mace_pb2 - -from mace.python.tools.converter_tool import base_converter as cvt -from mace.python.tools.converter_tool.base_converter import DeviceType -from mace.python.tools.converter_tool.base_converter import ConverterUtil -from mace.python.tools.converter_tool.base_converter import MaceKeyword -from mace.python.tools.convert_util import calculate_image_shape -from mace.python.tools.convert_util import OpenCLBufferType - - -def MemoryTypeToStr(mem_type): - if mem_type == mace_pb2.CPU_BUFFER: - return 'CPU_BUFFER' - elif mem_type == mace_pb2.GPU_BUFFER: - return 'GPU_BUFFER' - elif mem_type == mace_pb2.GPU_IMAGE: - return 'GPU_IMAGE' - else: - return 'UNKNOWN' - - -class MemoryBlock(object): - def __init__(self, mem_type, block): - self._mem_type = mem_type - self._block = block - - @property - def mem_type(self): - return self._mem_type - - @property - def block(self): - return self._block - - -class MemoryOptimizer(object): - def __init__(self, net_def): - self.net_def = net_def - self.idle_mem = set() - self.op_mem = {} # op_name->mem_id - self.mem_block = {} # mem_id->[size] or mem_id->[x, y] - self.total_mem_count = 0 - self.input_ref_counter = {} - self.mem_ref_counter = {} - ocl_mem_type_arg = ConverterUtil.get_arg( - net_def, MaceKeyword.mace_opencl_mem_type) - self.cl_mem_type = ocl_mem_type_arg.i if ocl_mem_type_arg is not None \ - else None - - consumers = {} - for op in net_def.op: - if not self.op_need_optimize_memory(op): - continue - for ipt in op.input: - if ipt not in consumers: - consumers[ipt] = [] - consumers[ipt].append(op) - # only ref op's output tensor - for op in net_def.op: - if not self.op_need_optimize_memory(op): - continue - for output in op.output: - tensor_name = output - if tensor_name in consumers: - self.input_ref_counter[tensor_name] = \ - len(consumers[tensor_name]) - else: - self.input_ref_counter[tensor_name] = 0 - - def op_need_optimize_memory(self, op): - return True - - def get_op_mem_block(self, op_type, output_shape, output_type): - data_type_size = 4 - if output_type == mace_pb2.DT_UINT8: - data_type_size = 1 - return MemoryBlock(mace_pb2.CPU_BUFFER, - [reduce(operator.mul, output_shape, 1) * - data_type_size]) - - def mem_size(self, memory_block): - return memory_block.block[0] - - def sub_mem_block(self, mem_block1, mem_block2): - return self.mem_size(mem_block1) - self.mem_size(mem_block2) - - def resize_mem_block(self, old_mem_block, op_mem_block): - return MemoryBlock( - old_mem_block.mem_type, - [max(old_mem_block.block[0], op_mem_block.block[0])]) - - def add_net_mem_blocks(self): - for mem in self.mem_block: - arena = self.net_def.mem_arena - block = arena.mem_block.add() - block.mem_id = mem - block.device_type = DeviceType.CPU.value - block.mem_type = self.mem_block[mem].mem_type - block.x = self.mem_block[mem].block[0] - block.y = 1 - - def get_total_origin_mem_size(self): - origin_mem_size = 0 - for op in self.net_def.op: - if not self.op_need_optimize_memory(op): - continue - origin_mem_size += reduce(operator.mul, - op.output_shape[0].dims, - 1) - return origin_mem_size - - def get_total_optimized_mem_size(self): - optimized_mem_size = 0 - for mem in self.mem_block: - print(mem, MemoryTypeToStr(self.mem_block[mem].mem_type), - self.mem_block[mem].block) - optimized_mem_size += self.mem_size(self.mem_block[mem]) - return optimized_mem_size - - @staticmethod - def is_memory_reuse_op(op): - return op.type == 'Reshape' or op.type == 
'Identity' \ - or op.type == 'Squeeze' or op.type == 'ExpandDims' - - def optimize(self): - for op in self.net_def.op: - if not self.op_need_optimize_memory(op): - continue - if not op.output_shape: - six.print_("WARNING: There is no output shape information to " - "do memory optimization. %s (%s)" % - (op.name, op.type), file=sys.stderr) - return - if len(op.output_shape) != len(op.output): - six.print_('WARNING: the number of output shape is ' - 'not equal to the number of output.', - file=sys.stderr) - return - for i in range(len(op.output)): - if self.is_memory_reuse_op(op): - # make these ops reuse memory of input tensor - mem_id = self.op_mem.get(op.input[0], -1) - else: - output_type = mace_pb2.DT_FLOAT - for arg in op.arg: - if arg.name == 'T': - output_type = arg.i - if len(op.output_type) > i: - output_type = op.output_type[i] - op_mem_block = self.get_op_mem_block( - op.type, - op.output_shape[i].dims, - output_type) - mem_id = -1 - if len(self.idle_mem) > 0: - best_mem_add_size = six.MAXSIZE - best_mem_waste_size = six.MAXSIZE - for mid in self.idle_mem: - old_mem_block = self.mem_block[mid] - if old_mem_block.mem_type != op_mem_block.mem_type: - continue - new_mem_block = self.resize_mem_block( - old_mem_block, op_mem_block) - add_mem_size = self.sub_mem_block(new_mem_block, - old_mem_block) - waste_mem_size = self.sub_mem_block(new_mem_block, - op_mem_block) - - # minimize add_mem_size; if best_mem_add_size is 0, - # then minimize waste_mem_size - if (best_mem_add_size > 0 and - add_mem_size < best_mem_add_size) \ - or (best_mem_add_size == 0 and - waste_mem_size < best_mem_waste_size): - best_mem_id = mid - best_mem_add_size = add_mem_size - best_mem_waste_size = waste_mem_size - best_mem_block = new_mem_block - - # if add mem size < op mem size, then reuse it - if best_mem_add_size <= self.mem_size(op_mem_block): - self.mem_block[best_mem_id] = best_mem_block - mem_id = best_mem_id - self.idle_mem.remove(mem_id) - - if mem_id == -1: - mem_id = self.total_mem_count - self.total_mem_count += 1 - self.mem_block[mem_id] = op_mem_block - - if mem_id != -1: - op.mem_id.extend([mem_id]) - self.op_mem[op.output[i]] = mem_id - if mem_id not in self.mem_ref_counter: - self.mem_ref_counter[mem_id] = 1 - else: - self.mem_ref_counter[mem_id] += 1 - - # de-ref input tensor mem - for idx in six.moves.range(len(op.input)): - ipt = op.input[idx] - if ipt in self.input_ref_counter: - self.input_ref_counter[ipt] -= 1 - if self.input_ref_counter[ipt] == 0 \ - and ipt in self.op_mem: - mem_id = self.op_mem[ipt] - self.mem_ref_counter[mem_id] -= 1 - if self.mem_ref_counter[mem_id] == 0: - self.idle_mem.add(self.op_mem[ipt]) - elif self.input_ref_counter[ipt] < 0: - raise Exception('ref count is less than 0') - - self.add_net_mem_blocks() - - print("total op: %d" % len(self.net_def.op)) - print("origin mem: %d, optimized mem: %d" % ( - self.get_total_origin_mem_size(), - self.get_total_optimized_mem_size())) - - -class GPUMemoryOptimizer(MemoryOptimizer): - def op_need_optimize_memory(self, op): - if op.type == MaceKeyword.mace_buffer_transform: - for arg in op.arg: - if arg.name == 'mode' and arg.i == 0: - return False - return op.type != MaceKeyword.mace_buffer_inverse_transform - - def get_op_image_mem_block(self, op_type, output_shape): - if op_type == 'WinogradTransform' or op_type == 'MatMul': - buffer_shape = list(output_shape) + [1] - mem_block = MemoryBlock( - mace_pb2.GPU_IMAGE, - calculate_image_shape(OpenCLBufferType.IN_OUT_HEIGHT, - buffer_shape)) - elif op_type in ['Shape', - 
'InferConv2dShape', - 'StridedSlice', - 'Stack', - 'ScalarMath']: - if len(output_shape) == 1: - mem_block = MemoryBlock(mace_pb2.CPU_BUFFER, - [output_shape[0], 1]) - elif len(output_shape) == 0: - mem_block = MemoryBlock(mace_pb2.CPU_BUFFER, - [1, 1]) - else: - raise Exception('%s output shape dim size is not 0 or 1.' % - op_type) - else: - if len(output_shape) == 2: # only support fc/softmax - buffer_shape = [output_shape[0], output_shape[1]] - elif len(output_shape) == 4: - buffer_shape = output_shape - else: - raise Exception('%s output shape dim size is not 2 or 4.' % - op_type) - mem_block = MemoryBlock( - mace_pb2.GPU_IMAGE, - calculate_image_shape(OpenCLBufferType.IN_OUT_CHANNEL, - buffer_shape)) - return mem_block - - def get_op_buffer_mem_block(self, output_shape): - return MemoryBlock(mace_pb2.GPU_BUFFER, - [reduce(operator.mul, output_shape, 1), 1]) - - def get_op_mem_block(self, op_type, output_shape, output_type): - if self.cl_mem_type == mace_pb2.GPU_IMAGE: - return self.get_op_image_mem_block(op_type, output_shape) - else: - return self.get_op_buffer_mem_block(output_shape) - - def mem_size(self, memory_block): - if memory_block.mem_type == mace_pb2.GPU_IMAGE: - return memory_block.block[0] * memory_block.block[1] * 4 - else: - return memory_block.block[0] - - def resize_mem_block(self, old_mem_block, op_mem_block): - resize_mem_block = MemoryBlock( - old_mem_block.mem_type, - [ - max(old_mem_block.block[0], op_mem_block.block[0]), - max(old_mem_block.block[1], op_mem_block.block[1]) - ]) - - return resize_mem_block - - def add_net_mem_blocks(self): - max_image_size_x = 0 - max_image_size_y = 0 - for mem in self.mem_block: - arena = self.net_def.mem_arena - block = arena.mem_block.add() - block.mem_id = mem - block.device_type = DeviceType.GPU.value - block.mem_type = self.mem_block[mem].mem_type - block.x = self.mem_block[mem].block[0] - block.y = self.mem_block[mem].block[1] - if self.mem_block[mem].mem_type == mace_pb2.GPU_IMAGE: - max_image_size_x = max(max_image_size_x, block.x) - max_image_size_y = max(max_image_size_y, block.y) - - if self.cl_mem_type == mace_pb2.GPU_IMAGE: - # Update OpenCL max image size - net_ocl_max_img_size_arg = None - for arg in self.net_def.arg: - if arg.name == cvt.MaceKeyword.mace_opencl_max_image_size: - net_ocl_max_img_size_arg = arg - max_image_size_x = max(arg.ints[0], max_image_size_x) - max_image_size_y = max(arg.ints[1], max_image_size_y) - break - if net_ocl_max_img_size_arg is None: - net_ocl_max_img_size_arg = self.net_def.arg.add() - net_ocl_max_img_size_arg.name = \ - cvt.MaceKeyword.mace_opencl_max_image_size - - net_ocl_max_img_size_arg.ints[:] = [max_image_size_x, - max_image_size_y] - - -def optimize_gpu_memory(net_def): - mem_optimizer = GPUMemoryOptimizer(net_def) - mem_optimizer.optimize() - - -def optimize_cpu_memory(net_def): - mem_optimizer = MemoryOptimizer(net_def) - mem_optimizer.optimize() diff --git a/tools/bazel.rc b/tools/bazel.rc index 15273b311c6c9aa2a50401cd135e2e2622ad9280..93442944babbe618cbab78171b4b197a14e9805b 100644 --- a/tools/bazel.rc +++ b/tools/bazel.rc @@ -1,9 +1,7 @@ # Partially borrowed from tensorflow tools/bazel.rc # By default, we don't distinct target and host platfroms. -# When doing cross compilation, use --config=cross_compile to distinct them. 
build --distinct_host_configuration=false -build:cross_compile --distinct_host_configuration=true build --verbose_failures build --copt=-std=c++11 @@ -17,12 +15,12 @@ build --copt=-DMACE_USE_NNLIB_CAF build:symbol_hidden --copt=-fvisibility=hidden # Usage example: bazel build --config android -build:android --config=cross_compile +build:android --distinct_host_configuration=true build:android --crosstool_top=//external:android/crosstool build:android --host_crosstool_top=@bazel_tools//tools/cpp:toolchain # Usage example: bazel build --config arm_linux_gnueabihf -build:arm_linux_gnueabihf --config=cross_compile +build:arm_linux_gnueabihf --distinct_host_configuration=true build:arm_linux_gnueabihf --crosstool_top=//tools/arm_compiler:toolchain build:arm_linux_gnueabihf --host_crosstool_top=@bazel_tools//tools/cpp:toolchain build:arm_linux_gnueabihf --cpu=armeabi-v7a @@ -34,7 +32,7 @@ build:arm_linux_gnueabihf --copt -Wno-sequence-point build:arm_linux_gnueabihf --copt -Wno-implicit-fallthrough # Usage example: bazel build --config aarch64_linux_gnu -build:aarch64_linux_gnu --config=cross_compile +build:aarch64_linux_gnu --distinct_host_configuration=true build:aarch64_linux_gnu --crosstool_top=//tools/aarch64_compiler:toolchain build:aarch64_linux_gnu --host_crosstool_top=@bazel_tools//tools/cpp:toolchain build:aarch64_linux_gnu --cpu=aarch64 diff --git a/tools/bazel_adb_run.py b/tools/bazel_adb_run.py index 6906015c4e4bb5d12a8856e14197ee1a6cac4d0e..5e9086046e37be9f863a1f50513eb659852678d2 100644 --- a/tools/bazel_adb_run.py +++ b/tools/bazel_adb_run.py @@ -52,13 +52,13 @@ def ops_benchmark_stdout_processor(stdout, dev, abi): metrics["%s.input_mb_per_sec" % parts[0]] = parts[3] metrics["%s.gmacc_per_sec" % parts[0]] = parts[4] - platform = dev[YAMLKeyword.target_socs] - model = dev[YAMLKeyword.models] - tags = { - "ro.board.platform": platform, - "ro.product.model": model, - "abi": abi - } + # platform = dev[YAMLKeyword.target_socs] + # model = dev[YAMLKeyword.device_name] + # tags = { + # "ro.board.platform": platform, + # "ro.product.model": model, + # "abi": abi + # } # sh_commands.falcon_push_metrics(server, # metrics, tags=tags, endpoint="mace_ops_benchmark") @@ -99,7 +99,7 @@ def parse_args(): parser.add_argument( "--stdout_processor", type=str, - default="stdout_processor", + default="unittest_stdout_processor", help="Stdout processing function, default: stdout_processor") parser.add_argument( "--enable_neon", diff --git a/tools/build-standalone-lib.sh b/tools/build-standalone-lib.sh index 24cba4cf06d3f342498093e885cf771ebcb48227..fcb5c288239d64781b7d803272c07a0245eeac4b 100755 --- a/tools/build-standalone-lib.sh +++ b/tools/build-standalone-lib.sh @@ -45,11 +45,11 @@ bazel build --config android --config optimization mace/libmace:libmace_dynamic cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/arm64-v8a/cpu_gpu/ echo "build shared lib for arm_linux_gnueabihf + cpu_gpu" -bazel build --config arm_linux_gnueabihf --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=true --define opencl=true +bazel build --config arm_linux_gnueabihf --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=true --define opencl=true --define quantize=true cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/arm_linux_gnueabihf/cpu_gpu/ echo "build shared lib for aarch64_linux_gnu + cpu_gpu" -bazel build --config aarch64_linux_gnu --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=true --define opencl=true +bazel build 
--config aarch64_linux_gnu --config optimization mace/libmace:libmace_dynamic --define neon=true --define openmp=true --define opencl=true --define quantize=true cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/aarch64_linux_gnu/cpu_gpu/ if [[ "$OSTYPE" != "darwin"* ]];then @@ -73,11 +73,11 @@ bazel build --config android --config optimization mace/libmace:libmace_static - cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/arm64-v8a/cpu_gpu/ echo "build static lib for arm_linux_gnueabihf + cpu_gpu" -bazel build --config arm_linux_gnueabihf --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=true --define opencl=true +bazel build --config arm_linux_gnueabihf --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=true --define opencl=true --define quantize=true cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/arm_linux_gnueabihf/cpu_gpu/ echo "build static lib for aarch64_linux_gnu + cpu_gpu" -bazel build --config aarch64_linux_gnu --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=true --define opencl=true +bazel build --config aarch64_linux_gnu --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define openmp=true --define opencl=true --define quantize=true cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/aarch64_linux_gnu/cpu_gpu/ if [[ "$OSTYPE" != "darwin"* ]];then diff --git a/tools/common.py b/tools/common.py index 2185b27476f3ea79c610a8c19e1b90684b2cdaac..2e197d8fb634be21a1e8c2320be0ac66949be165 100644 --- a/tools/common.py +++ b/tools/common.py @@ -240,7 +240,7 @@ def get_model_files(model_file_path, def get_opencl_binary_output_path(library_name, target_abi, device): target_soc = device.target_socs - device_model = device.models + device_name = device.device_name return '%s/%s/%s/%s/%s_%s.%s.%s.bin' % \ (BUILD_OUTPUT_DIR, library_name, @@ -248,13 +248,13 @@ def get_opencl_binary_output_path(library_name, target_abi, device): target_abi, library_name, OUTPUT_OPENCL_BINARY_FILE_NAME, - device_model, + device_name, target_soc) def get_opencl_parameter_output_path(library_name, target_abi, device): target_soc = device.target_socs - device_model = device.models + device_name = device.device_name return '%s/%s/%s/%s/%s_%s.%s.%s.bin' % \ (BUILD_OUTPUT_DIR, library_name, @@ -262,7 +262,7 @@ def get_opencl_parameter_output_path(library_name, target_abi, device): target_abi, library_name, OUTPUT_OPENCL_PARAMETER_FILE_NAME, - device_model, + device_name, target_soc) @@ -271,7 +271,7 @@ def get_build_model_dirs(library_name, target_abi, device, model_file_path): - models = device.models + device_name = device.device_name target_socs = device.target_socs model_path_digest = md5sum(model_file_path) model_output_base_dir = '{}/{}/{}/{}/{}'.format( @@ -287,7 +287,7 @@ def get_build_model_dirs(library_name, else: model_output_dir = '{}/{}_{}/{}'.format( model_output_base_dir, - models, + device_name, target_socs, target_abi ) diff --git a/tools/converter.py b/tools/converter.py index 7646692eec5ab2358459d322c5d7b5fe44af1eab..cc1377161f4c5014fefd02114d0fa6bc493aef9d 100644 --- a/tools/converter.py +++ b/tools/converter.py @@ -111,6 +111,13 @@ class DefaultValues(object): gpu_priority_hint = 3, +class ValidationThreshold(object): + cpu_threshold = 0.999, + gpu_threshold = 0.995, + hexagon_threshold = 0.930, + cpu_quantize_threshold = 0.980, + + CPP_KEYWORDS = [ 'alignas', 'alignof', 'and', 
'and_eq', 'asm', 'atomic_cancel', 'atomic_commit', 'atomic_noexcept', 'auto', 'bitand', 'bitor', @@ -435,10 +442,11 @@ def format_model_config(flags): 'similarity threshold must be a dict.') threshold_dict = { - DeviceType.CPU: 0.999, - DeviceType.GPU: 0.995, - DeviceType.HEXAGON: 0.930, - DeviceType.CPU + "_QUANTIZE": 0.980, + DeviceType.CPU: ValidationThreshold.cpu_threshold, + DeviceType.GPU: ValidationThreshold.gpu_threshold, + DeviceType.HEXAGON: ValidationThreshold.hexagon_threshold, + DeviceType.CPU + "_QUANTIZE": + ValidationThreshold.cpu_quantize_threshold, } for k, v in six.iteritems(validation_threshold): if k.upper() == 'DSP': @@ -838,39 +846,6 @@ def build_mace_run(configs, target_abi, toolchain, enable_openmp, mace_lib_type == MACELibType.dynamic) -def build_quantize_stat(configs): - library_name = configs[YAMLKeyword.library_name] - - build_tmp_binary_dir = get_build_binary_dir(library_name, ABIType.host) - if os.path.exists(build_tmp_binary_dir): - sh.rm("-rf", build_tmp_binary_dir) - os.makedirs(build_tmp_binary_dir) - - quantize_stat_target = QUANTIZE_STAT_TARGET - build_arg = "" - six.print_(configs[YAMLKeyword.model_graph_format]) - if configs[YAMLKeyword.model_graph_format] == ModelFormat.code: - mace_check(os.path.exists(ENGINE_CODEGEN_DIR), - ModuleName.RUN, - "You should convert model first.") - build_arg = "--per_file_copt=mace/tools/quantization/quantize_stat.cc@-DMODEL_GRAPH_FORMAT_CODE" # noqa - - sh_commands.bazel_build( - quantize_stat_target, - abi=ABIType.host, - toolchain=flags.toolchain, - enable_openmp=True, - symbol_hidden=True, - extra_args=build_arg - ) - - quantize_stat_filepath = build_tmp_binary_dir + "/quantize_stat" - if os.path.exists(quantize_stat_filepath): - sh.rm("-rf", quantize_stat_filepath) - sh.cp("-f", "bazel-bin/mace/tools/quantization/quantize_stat", - build_tmp_binary_dir) - - def build_example(configs, target_abi, toolchain, enable_openmp, mace_lib_type): library_name = configs[YAMLKeyword.library_name] @@ -951,10 +926,8 @@ def run_mace(flags): clear_build_dirs(configs[YAMLKeyword.library_name]) target_socs = configs[YAMLKeyword.target_socs] - if not target_socs or ALL_SOC_TAG in target_socs: - device_list = DeviceManager.list_devices(flags.device_yml) - else: - device_list = DeviceManager.list_devices(flags.device_yml) + device_list = DeviceManager.list_devices(flags.device_yml) + if target_socs and ALL_SOC_TAG not in target_socs: device_list = [dev for dev in device_list if dev[YAMLKeyword.target_socs].lower() in target_socs] for target_abi in configs[YAMLKeyword.target_abis]: @@ -1042,13 +1015,10 @@ def benchmark_model(flags): clear_build_dirs(configs[YAMLKeyword.library_name]) target_socs = configs[YAMLKeyword.target_socs] - if not target_socs or ALL_SOC_TAG in target_socs: - device_list = DeviceManager.list_devices(flags.device_yml) - # target_socs = sh_commands.adb_get_all_socs() - else: - device_list = DeviceManager.list_devices(flags.device_yml) + device_list = DeviceManager.list_devices(flags.device_yml) + if target_socs and ALL_SOC_TAG not in target_socs: device_list = [dev for dev in device_list - if dev[YAMLKeyword.target_socs] in target_socs] + if dev[YAMLKeyword.target_socs].lower() in target_socs] for target_abi in configs[YAMLKeyword.target_abis]: # build benchmark_model binary diff --git a/tools/device.py b/tools/device.py index d04cfa642203359147e4aa7797c6615358fabd0d..655d90d012e65a805c2655dc0fb413ae2ed9b8df 100644 --- a/tools/device.py +++ b/tools/device.py @@ -37,8 +37,8 @@ class DeviceWrapper: :type device_dict: 
Device :param device_dict: a key-value dict that holds the device information, which attribute has: - target_abis, target_socs, models, system, address - password, username + device_name, target_abis, target_socs, system, + address, username """ diff = set(device_dict.keys()) - set(YAMLKeyword.__dict__.keys()) if len(diff) > 0: @@ -111,6 +111,7 @@ class DeviceWrapper: def push(self, src_path, dst_path): mace_check(os.path.exists(src_path), "Device", '{} not found'.format(src_path)) + six.print_("Push %s to %s" % (src_path, dst_path)) if self.system == SystemType.android: sh_commands.adb_push(src_path, dst_path, self.address) elif self.system == SystemType.arm_linux: @@ -129,6 +130,7 @@ class DeviceWrapper: dst_file = "%s/%s" % (dst_path, file_name) if os.path.exists(dst_file): sh.rm('-f', dst_file) + six.print_("Pull %s to %s" % (src_path, dst_path)) if self.system == SystemType.android: sh_commands.adb_pull( src_file, dst_file, self.address) @@ -138,7 +140,6 @@ class DeviceWrapper: self.address, src_file), dst_file) - print("pull file ", src_path, dst_path) except sh.ErrorReturnCode_1 as e: six.print_("Pull Failed !", file=sys.stderr) raise e @@ -256,10 +257,13 @@ class DeviceWrapper: if model_graph_format == ModelFormat.file: mace_model_phone_path = "%s/%s.pb" % (self.data_dir, model_tag) - self.push(mace_model_path, - mace_model_phone_path) + self.push(mace_model_path, mace_model_phone_path) if link_dynamic: self.push(libmace_dynamic_library_path, self.data_dir) + if self.system == SystemType.android: + sh_commands.push_depended_so_libs( + libmace_dynamic_library_path, abi, self.data_dir, + self.address) self.push("%s/%s" % (target_dir, target_name), self.data_dir) stdout_buff = [] @@ -430,14 +434,11 @@ class DeviceWrapper: configs[YAMLKeyword.model_graph_format], configs[YAMLKeyword.model_data_format], target_abi) - if target_abi == ABIType.host: - device_model = ABIType.host - else: - device_model = self.models + if target_abi != ABIType.host: self.clear_data_dir() MaceLogger.header( StringFormatter.block( - 'Run model {} on {}'.format(model_name, device_model))) + 'Run model {} on {}'.format(model_name, self.device_name))) model_config = configs[YAMLKeyword.models][model_name] model_runtime = model_config[YAMLKeyword.runtime] @@ -631,7 +632,7 @@ class DeviceWrapper: data_str = '{model_name},{device_name},{soc},{abi},{device_type},' \ '{init},{warmup},{run_avg},{tuned}\n'.format( model_name=model_name, - device_name=self.models, + device_name=self.device_name, soc=self.target_socs, abi=target_abi, device_type=device_type, @@ -671,7 +672,7 @@ class DeviceWrapper: mace_model_path = '' if model_graph_format == ModelFormat.file: mace_model_path = '%s/%s.pb' % (mace_model_dir, model_tag) - if abi == 'host': + if abi == ABIType.host: libmace_dynamic_lib_dir_path = \ os.path.dirname(libmace_dynamic_library_path) p = subprocess.Popen( @@ -719,6 +720,10 @@ class DeviceWrapper: self.push(mace_model_path, mace_model_device_path) if link_dynamic: self.push(libmace_dynamic_library_path, self.data_dir) + if self.system == SystemType.android: + sh_commands.push_depended_so_libs( + libmace_dynamic_library_path, abi, self.data_dir, + self.address) self.rm('%s/%s' % (self.data_dir, benchmark_binary_name)) self.push('%s/%s' % (benchmark_binary_dir, benchmark_binary_name), self.data_dir) @@ -761,19 +766,11 @@ class DeviceWrapper: os.remove(tmp_cmd_file) if self.system == SystemType.android: - sh.adb( - '-s', - self.address, - 'shell', - 'sh', - cmd_file_path, - _fg=True - ) + sh.adb('-s', self.address, 
'shell', 'sh', cmd_file_path, + _fg=True) elif self.system == SystemType.arm_linux: sh.ssh('%s@%s' % (self.username, self.address), - 'sh', - cmd_file_path, - _fg=True) + 'sh', cmd_file_path, _fg=True) self.rm(cmd_file_path) six.print_('Benchmark done! \n') @@ -804,13 +801,10 @@ class DeviceWrapper: configs[YAMLKeyword.model_graph_format], configs[YAMLKeyword.model_data_format], target_abi) - if target_abi == ABIType.host: - device_name = ABIType.host - else: - device_name = self.models MaceLogger.header( StringFormatter.block( - 'Benchmark model %s on %s' % (model_name, device_name))) + 'Benchmark model %s on %s' % (model_name, + self.device_name))) model_config = configs[YAMLKeyword.models][model_name] model_runtime = model_config[YAMLKeyword.runtime] subgraphs = model_config[YAMLKeyword.subgraphs] @@ -885,7 +879,7 @@ class DeviceWrapper: print('Trying to lock device %s' % self.address) with self.lock(): print('Run on device: %s, %s, %s' % - (self.address, self.target_socs, self.models)) + (self.address, self.target_socs, self.device_name)) self.rm(self.data_dir) self.exec_command('mkdir -p %s' % self.data_dir) self.push(host_bin_full_path, device_bin_full_path) @@ -949,11 +943,11 @@ class DeviceManager: for adb in adb_list: prop = sh_commands.adb_getprop_by_serialno(adb[0]) android = { - YAMLKeyword.device_name: adb[1], + YAMLKeyword.device_name: + prop['ro.product.model'].replace(' ', ''), YAMLKeyword.target_abis: prop['ro.product.cpu.abilist'].split(','), YAMLKeyword.target_socs: prop['ro.board.platform'], - YAMLKeyword.models: prop['ro.product.model'].replace(' ', '_'), YAMLKeyword.system: SystemType.android, YAMLKeyword.address: adb[0], YAMLKeyword.username: '', @@ -968,9 +962,9 @@ class DeviceManager: devices = devices['devices'] device_list = [] for name, dev in six.iteritems(devices): - dev[YAMLKeyword.device_name] = name + dev[YAMLKeyword.device_name] = \ + dev[YAMLKeyword.models].replace(' ', '') dev[YAMLKeyword.system] = SystemType.arm_linux - dev[YAMLKeyword.models] = dev[YAMLKeyword.models].replace(' ', '_') device_list.append(dev) return device_list @@ -992,7 +986,6 @@ class DeviceManager: YAMLKeyword.target_abis: [ABIType.host], YAMLKeyword.target_socs: '', YAMLKeyword.system: SystemType.host, - YAMLKeyword.models: None, YAMLKeyword.address: None, } diff --git a/tools/sh_commands.py b/tools/sh_commands.py index 38f2b60a3dbd40ef8c37c37ae19bb9fb42b23602..da8e96054d5e4e7ea24b8e95dee58a511372c23d 100644 --- a/tools/sh_commands.py +++ b/tools/sh_commands.py @@ -20,7 +20,6 @@ import os import re import sh import struct -import subprocess import sys import time import platform @@ -28,10 +27,6 @@ import platform import six import common -from common import ModelFormat -from common import ABIType -from common import SystemType -from common import YAMLKeyword from common import abi_to_internal sys.path.insert(0, "mace/python/tools") @@ -179,99 +174,16 @@ def adb_get_all_socs(): def adb_push(src_path, dst_path, serialno): - six.print_("Push %s to %s" % (src_path, dst_path)) sh.adb("-s", serialno, "push", src_path, dst_path) def adb_pull(src_path, dst_path, serialno): - six.print_("Pull %s to %s" % (src_path, dst_path)) try: sh.adb("-s", serialno, "pull", src_path, dst_path) except Exception as e: six.print_("Error msg: %s" % e, file=sys.stderr) -def adb_run(abi, - serialno, - host_bin_path, - bin_name, - args="", - opencl_profiling=True, - vlog_level=0, - device_bin_path="/data/local/tmp/mace", - out_of_range_check=True, - address_sanitizer=False, - simpleperf=False): - 
host_bin_full_path = "%s/%s" % (host_bin_path, bin_name) - device_bin_full_path = "%s/%s" % (device_bin_path, bin_name) - props = adb_getprop_by_serialno(serialno) - six.print_( - "=====================================================================" - ) - six.print_("Trying to lock device %s" % serialno) - with device_lock(serialno): - six.print_("Run on device: %s, %s, %s" % - (serialno, props["ro.board.platform"], - props["ro.product.model"])) - sh.adb("-s", serialno, "shell", "rm -rf %s" % device_bin_path) - sh.adb("-s", serialno, "shell", "mkdir -p %s" % device_bin_path) - adb_push(host_bin_full_path, device_bin_full_path, serialno) - ld_preload = "" - if address_sanitizer: - adb_push(find_asan_rt_library(abi), device_bin_path, serialno) - ld_preload = "LD_PRELOAD=%s/%s" % (device_bin_path, - asan_rt_library_names(abi)), - - opencl_profiling = 1 if opencl_profiling else 0 - out_of_range_check = 1 if out_of_range_check else 0 - six.print_("Run %s" % device_bin_full_path) - - stdout_buff = [] - process_output = make_output_processor(stdout_buff) - - if simpleperf: - adb_push(find_simpleperf_library(abi), device_bin_path, serialno) - simpleperf_cmd = "%s/simpleperf" % device_bin_path - sh.adb( - "-s", - serialno, - "shell", - ld_preload, - "MACE_OUT_OF_RANGE_CHECK=%d" % out_of_range_check, - "MACE_OPENCL_PROFILING=%d" % opencl_profiling, - "MACE_CPP_MIN_VLOG_LEVEL=%d" % vlog_level, - simpleperf_cmd, - "stat", - "--group", - "raw-l1-dcache,raw-l1-dcache-refill", - "--group", - "raw-l2-dcache,raw-l2-dcache-refill", - "--group", - "raw-l1-dtlb,raw-l1-dtlb-refill", - "--group", - "raw-l2-dtlb,raw-l2-dtlb-refill", - device_bin_full_path, - args, - _tty_in=True, - _out=process_output, - _err_to_out=True) - else: - sh.adb( - "-s", - serialno, - "shell", - ld_preload, - "MACE_OUT_OF_RANGE_CHECK=%d" % out_of_range_check, - "MACE_OPENCL_PROFILING=%d" % opencl_profiling, - "MACE_CPP_MIN_VLOG_LEVEL=%d" % vlog_level, - device_bin_full_path, - args, - _tty_in=True, - _out=process_output, - _err_to_out=True) - return "".join(stdout_buff) - - ################################ # Toolchain ################################ @@ -433,15 +345,6 @@ def gen_mace_engine_factory_source(model_tags, six.print_("Generate mace engine creator source done!\n") -def pull_file_from_device(serial_num, file_path, file_name, output_dir): - if not os.path.exists(output_dir): - sh.mkdir("-p", output_dir) - output_path = "%s/%s" % (output_dir, file_path) - if os.path.exists(output_path): - sh.rm('-rf', output_path) - adb_pull(file_path + '/' + file_name, output_dir, serial_num) - - def merge_opencl_binaries(binaries_dirs, cl_compiled_program_file_name, output_file_path): @@ -690,19 +593,17 @@ def push_depended_so_libs(libmace_dynamic_library_path, abi, phone_data_dir, serialno): dep_so_libs = sh.bash(os.environ["ANDROID_NDK_HOME"] + "/ndk-depends", libmace_dynamic_library_path) + src_file = "" for dep in split_stdout(dep_so_libs): if dep == "libgnustl_shared.so": - adb_push( - "%s/sources/cxx-stl/gnu-libstdc++/4.9/libs/%s/libgnustl_shared.so" # noqa - % (os.environ["ANDROID_NDK_HOME"], abi), - phone_data_dir, - serialno) + src_file = "%s/sources/cxx-stl/gnu-libstdc++/4.9/libs/" \ + "%s/libgnustl_shared.so"\ + % (os.environ["ANDROID_NDK_HOME"], abi) elif dep == "libc++_shared.so": - adb_push( - "%s/sources/cxx-stl/llvm-libc++/libs/%s/libc++_shared.so" # noqa - % (os.environ["ANDROID_NDK_HOME"], abi), - phone_data_dir, - serialno) + src_file = "%s/sources/cxx-stl/llvm-libc++/libs/" \ + "%s/libc++_shared.so" % 
(os.environ["ANDROID_NDK_HOME"], abi) + print("push %s to %s" % (src_file, phone_data_dir)) + adb_push(src_file, phone_data_dir, serialno) def validate_model(abi, @@ -861,149 +762,6 @@ def packaging_lib(libmace_output_dir, project_name): ################################ # benchmark ################################ -def benchmark_model(abi, - serialno, - benchmark_binary_dir, - benchmark_binary_name, - vlog_level, - embed_model_data, - model_output_dir, - mace_model_dir, - input_nodes, - output_nodes, - input_shapes, - output_shapes, - model_tag, - device_type, - phone_data_dir, - model_graph_format, - opencl_binary_file, - opencl_parameter_file, - libmace_dynamic_library_path, - omp_num_threads=-1, - cpu_affinity_policy=1, - gpu_perf_hint=3, - gpu_priority_hint=3, - input_file_name="model_input", - link_dynamic=False): - six.print_("* Benchmark for %s" % model_tag) - - mace_model_path = "" - if model_graph_format == ModelFormat.file: - mace_model_path = "%s/%s.pb" % (mace_model_dir, model_tag) - if abi == "host": - libmace_dynamic_lib_dir_path = \ - os.path.dirname(libmace_dynamic_library_path) - p = subprocess.Popen( - [ - "env", - "LD_LIBRARY_PATH=%s" % libmace_dynamic_lib_dir_path, - "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level, - "%s/%s" % (benchmark_binary_dir, benchmark_binary_name), - "--model_name=%s" % model_tag, - "--input_node=%s" % ",".join(input_nodes), - "--output_node=%s" % ",".join(output_nodes), - "--input_shape=%s" % ":".join(input_shapes), - "--output_shape=%s" % ":".join(output_shapes), - "--input_file=%s/%s" % (model_output_dir, input_file_name), - "--model_data_file=%s/%s.data" % (mace_model_dir, model_tag), - "--device=%s" % device_type, - "--omp_num_threads=%s" % omp_num_threads, - "--cpu_affinity_policy=%s" % cpu_affinity_policy, - "--gpu_perf_hint=%s" % gpu_perf_hint, - "--gpu_priority_hint=%s" % gpu_priority_hint, - "--model_file=%s" % mace_model_path, - ]) - p.wait() - else: - sh.adb("-s", serialno, "shell", "mkdir", "-p", phone_data_dir) - internal_storage_dir = create_internal_storage_dir( - serialno, phone_data_dir) - - for input_name in input_nodes: - formatted_name = common.formatted_file_name(input_file_name, - input_name) - adb_push("%s/%s" % (model_output_dir, formatted_name), - phone_data_dir, serialno) - if not embed_model_data: - adb_push("%s/%s.data" % (mace_model_dir, model_tag), - phone_data_dir, serialno) - if device_type == common.DeviceType.GPU: - if os.path.exists(opencl_binary_file): - adb_push(opencl_binary_file, phone_data_dir, serialno) - if os.path.exists(opencl_parameter_file): - adb_push(opencl_parameter_file, phone_data_dir, serialno) - mace_model_phone_path = "" - if model_graph_format == ModelFormat.file: - mace_model_phone_path = "%s/%s.pb" % (phone_data_dir, model_tag) - adb_push(mace_model_path, - mace_model_phone_path, - serialno) - - if link_dynamic: - adb_push(libmace_dynamic_library_path, phone_data_dir, - serialno) - push_depended_so_libs(libmace_dynamic_library_path, abi, - phone_data_dir, serialno) - - adb_push("%s/%s" % (benchmark_binary_dir, benchmark_binary_name), - phone_data_dir, - serialno) - - adb_cmd = [ - "LD_LIBRARY_PATH=%s" % phone_data_dir, - "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level, - "MACE_RUN_PARAMETER_PATH=%s/mace_run.config" % - phone_data_dir, - "MACE_INTERNAL_STORAGE_PATH=%s" % internal_storage_dir, - "MACE_OPENCL_PROFILING=1", - "%s/%s" % (phone_data_dir, benchmark_binary_name), - "--model_name=%s" % model_tag, - "--input_node=%s" % ",".join(input_nodes), - "--output_node=%s" % ",".join(output_nodes), - 
"--input_shape=%s" % ":".join(input_shapes), - "--output_shape=%s" % ":".join(output_shapes), - "--input_file=%s/%s" % (phone_data_dir, input_file_name), - "--model_data_file=%s/%s.data" % (phone_data_dir, model_tag), - "--device=%s" % device_type, - "--omp_num_threads=%s" % omp_num_threads, - "--cpu_affinity_policy=%s" % cpu_affinity_policy, - "--gpu_perf_hint=%s" % gpu_perf_hint, - "--gpu_priority_hint=%s" % gpu_priority_hint, - "--model_file=%s" % mace_model_phone_path, - "--opencl_binary_file=%s/%s" % - (phone_data_dir, os.path.basename(opencl_binary_file)), - "--opencl_parameter_file=%s/%s" % - (phone_data_dir, os.path.basename(opencl_parameter_file)), - ] - adb_cmd = ' '.join(adb_cmd) - cmd_file_name = "%s-%s-%s" % ('cmd_file', model_tag, str(time.time())) - adb_cmd_file = "%s/%s" % (phone_data_dir, cmd_file_name) - tmp_cmd_file = "%s/%s" % ('/tmp', cmd_file_name) - with open(tmp_cmd_file, 'w') as cmd_file: - cmd_file.write(adb_cmd) - adb_push(tmp_cmd_file, adb_cmd_file, serialno) - os.remove(tmp_cmd_file) - - sh.adb( - "-s", - serialno, - "shell", - "sh", - adb_cmd_file, - _fg=True) - - sh.adb( - "-s", - serialno, - "shell", - "rm", - adb_cmd_file, - _fg=True) - - six.print_("Benchmark done!\n") - - def build_run_throughput_test(abi, serialno, vlog_level,