Unverified · Commit af6bdde0 · authored by herbakamil · committed by GitHub

Merge branch 'master' into one_hot

@@ -144,7 +144,7 @@ model_tests:
     - CONF_FILE=mace-models/mobilenet-v2/mobilenet-v2-host.yml
     - >
       python tools/converter.py convert --config=${CONF_FILE} --target_socs=$TARGET_SOCS --model_graph_format=file --model_data_format=file || exit 1;
-      python tools/converter.py run --config=${CONF_FILE} --target_socs=$TARGET_SOCS --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
+      python tools/converter.py run --config=${CONF_FILE} --target_socs=$TARGET_SOCS --round=1 --validate --model_graph_format=file --model_data_format=file --address_sanitizer || exit 1;
       python tools/converter.py run --config=${CONF_FILE} --target_socs=$TARGET_SOCS --example --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
       python tools/converter.py benchmark --config=${CONF_FILE} --target_socs=$TARGET_SOCS --round=5 --model_graph_format=file --model_data_format=file || exit 1;
       python tools/converter.py convert --config=${CONF_FILE} --target_socs=$TARGET_SOCS --model_graph_format=code --model_data_format=file || exit 1;
@@ -195,7 +195,8 @@ extra_tests:
         GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" git clone git@v9.git.n.xiaomi.com:deep-computing/generic-mobile-devices.git
         DEVICE_CONF_FILE=generic-mobile-devices/devices.yml
       fi
-    - python tools/bazel_adb_run.py --target="//mace/utils:tuner_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64 --target_socs=$TARGET_SOCS || exit 1;
+    - python tools/bazel_adb_run.py --target="//mace/utils:utils_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64 --target_socs=$TARGET_SOCS || exit 1;
+    - python tools/bazel_adb_run.py --target="//mace/port:port_test" --device_yml=${DEVICE_CONF_FILE} --run_target=True --stdout_processor=unittest_stdout_processor --target_abis=armeabi-v7a,arm64-v8a,arm64 --target_socs=$TARGET_SOCS || exit 1;
 so_size_check:
   stage: so_size_check
......
@@ -101,17 +101,20 @@ MACE also provides model visualization HTML generated in `builds` directory, gen
Debug engine using log
--------------------------
-Mace defines two sorts of logs: one is for users (LOG), the other is for developers (VLOG).
+MACE implements a logging mechanism similar to `glog <https://github.com/google/glog>`__.
+There are two types of logs: LOG for normal logging and VLOG for debugging.
-LOG includes four levels, i.e., ``INFO``, ``WARNING``, ``ERROR``, ``FATAL``;
-Environment variable ``MACE_CPP_MIN_LOG_LEVEL`` can be set to specify log level of users, e.g.,
-``set MACE_CPP_MIN_LOG_LEVEL=0`` will enable ``INFO`` log level, while ``set MACE_CPP_MIN_LOG_LEVEL=4`` will enable ``FATAL`` log level.
+LOG includes four levels, sorted by severity: ``INFO``, ``WARNING``, ``ERROR``, ``FATAL``.
+The logging severity threshold can be configured via an environment variable, e.g. ``MACE_CPP_MIN_LOG_LEVEL=WARNING`` sets it to ``WARNING``.
+Only log messages at or above the specified severity threshold are printed; the default threshold is ``INFO``.
+We do not support integer log severity values as `glog <https://github.com/google/glog>`__ does, because they are easily confused with VLOG levels.
+
+VLOG is verbose logging, emitted as ``LOG(INFO)``. VLOG has finer-grained integer verbose levels such as 0, 1, 2, 3.
+The threshold can be configured through an environment variable, e.g. ``MACE_CPP_MIN_VLOG_LEVEL=2`` sets it to ``2``.
+With VLOG, the lower the verbose level, the more likely a message is to be logged. For example, when the threshold is set
+to 2, both ``VLOG(1)`` and ``VLOG(2)`` messages are printed, but ``VLOG(3)`` and higher are not.
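The two thresholds described above can be summarized with a small illustrative sketch (Python for illustration only; the function names are hypothetical, not part of MACE — the semantics follow the paragraphs above):

```python
SEVERITIES = ["INFO", "WARNING", "ERROR", "FATAL"]

def should_log(severity, threshold="INFO"):
    # LOG(severity) is printed when severity is at or above the
    # MACE_CPP_MIN_LOG_LEVEL threshold; the default threshold is INFO.
    return SEVERITIES.index(severity) >= SEVERITIES.index(threshold)

def should_vlog(verbosity, vlog_threshold=0):
    # VLOG(n) is printed when n is at or below MACE_CPP_MIN_VLOG_LEVEL,
    # so a larger threshold yields more verbose output.
    return verbosity <= vlog_threshold
```

For instance, with ``MACE_CPP_MIN_LOG_LEVEL=WARNING`` an ``INFO`` message is suppressed, while with ``MACE_CPP_MIN_VLOG_LEVEL=2`` both ``VLOG(1)`` and ``VLOG(2)`` pass but ``VLOG(3)`` does not.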
-VLOG level is specified by numbers, e.g., 0, 1, 2. Environment variable ``MACE_CPP_MIN_VLOG_LEVEL`` can be set to specify vlog level.
-Logs with higher levels than which is specified will be printed. So simply specifying a very large level number will make all logs printed.
-By using Mace run tool, vlog level can be easily set by option, e.g.,
+By using the ``mace_run`` tool, the VLOG level can easily be set via an option, e.g.,
.. code:: sh
@@ -168,9 +171,3 @@ things may be a little bit complicated.
    # then you can use it as host gdb, e.g.,
    bt
@@ -41,7 +41,7 @@ For Bazel, install it following installation guide. For python dependencies,
.. code:: sh

-    pip install -U --user setup/requirements.txt
+    pip install -U --user -r setup/requirements.txt
@@ -83,7 +83,7 @@ For python dependencies,
.. code:: sh

-    pip install -U --user setup/optionals.txt
+    pip install -U --user -r setup/optionals.txt

.. note::
......
@@ -15,18 +15,18 @@ In most cases, the ``lite edition`` image can satisfy developer's basic needs.
.. code:: sh

-    # Pull lite edition docker image
+    # You can pull lite edition docker image from docker repo (recommended)
    docker pull registry.cn-hangzhou.aliyuncs.com/xiaomimace/mace-dev-lite
-    # Build lite edition docker image
+    # Or build lite edition docker image by yourself
    docker build -t registry.cn-hangzhou.aliyuncs.com/xiaomimace/mace-dev-lite ./docker/mace-dev-lite

- ``full edition`` docker image (which contains multiple NDK versions and other dev tools).

.. code:: sh

-    # Pull full edition docker image
+    # You can pull full edition docker image from docker repo (recommended)
    docker pull registry.cn-hangzhou.aliyuncs.com/xiaomimace/mace-dev
-    # Build full edition docker image
+    # Or build full edition docker image by yourself
    docker build -t registry.cn-hangzhou.aliyuncs.com/xiaomimace/mace-dev ./docker/mace-dev

.. note::
......
@@ -81,7 +81,7 @@ in one deployment file.
    * - backend
      - The onnx backend framework for validation, could be [tensorflow, caffe2, pytorch], default is tensorflow.
    * - runtime
-      - The running device, one of [cpu, gpu, dsp, cpu_gpu]. cpu_gpu contains CPU and GPU model definition so you can run the model on both CPU and GPU.
+      - The running device, one of [cpu, gpu, dsp, cpu+gpu]. cpu+gpu contains CPU and GPU model definition so you can run the model on both CPU and GPU.
    * - data_type
      - [optional] The data type used for specified runtime. [fp16_fp32, fp32_fp32] for GPU, default is fp16_fp32, [fp32] for CPU and [uint8] for DSP.
    * - input_data_types
@@ -421,11 +421,6 @@ the detailed information is in :doc:`benchmark`.
      - 3
      - ``run``/``benchmark``
      - 0:DEFAULT/1:LOW/2:NORMAL/3:HIGH
-    * - --gpu_perf_hint
-      - int
-      - 3
-      - ``run``/``benchmark``
-      - 0:DEFAULT/1:LOW/2:NORMAL/3:HIGH
    * - --gpu_priority_hint
      - int
      - 3
......
@@ -6,6 +6,22 @@ config_setting(
    visibility = ["//visibility:public"],
)

+config_setting(
+    name = "linux",
+    define_values = {
+        "linux": "true",
+    },
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
+    name = "darwin",
+    define_values = {
+        "darwin": "true",
+    },
+    visibility = ["//visibility:public"],
+)
+
config_setting(
    name = "android_armv7",
    values = {
@@ -62,6 +78,17 @@ config_setting(
    visibility = ["//visibility:public"],
)

+config_setting(
+    name = "hta_enabled",
+    define_values = {
+        "hta": "true",
+    },
+    values = {
+        "crosstool_top": "//external:android/crosstool",
+    },
+    visibility = ["//visibility:public"],
+)
+
config_setting(
    name = "openmp_enabled",
    define_values = {
......
@@ -21,9 +21,11 @@
#include <thread>  // NOLINT(build/c++11)

#include "gflags/gflags.h"
+#include "mace/port/env.h"
+#include "mace/port/file_system.h"
#include "mace/public/mace.h"
#include "mace/utils/logging.h"
-#include "mace/utils/utils.h"
+#include "mace/utils/math.h"
#include "mace/benchmark/statistics.h"
#ifdef MODEL_GRAPH_FORMAT_CODE
#include "mace/codegen/engine/mace_engine_factory.h"
@@ -31,24 +33,6 @@
namespace mace {
namespace benchmark {

-namespace str_util {
-
-std::vector<std::string> Split(const std::string &str, char delims) {
-  std::vector<std::string> result;
-  std::string tmp = str;
-  while (!tmp.empty()) {
-    size_t next_offset = tmp.find(delims);
-    result.push_back(tmp.substr(0, next_offset));
-    if (next_offset == std::string::npos) {
-      break;
-    } else {
-      tmp = tmp.substr(next_offset + 1);
-    }
-  }
-  return result;
-}
-
-}  // namespace str_util
-
void ParseShape(const std::string &str, std::vector<int64_t> *shape) {
  std::string tmp = str;
@@ -90,6 +74,18 @@ DeviceType ParseDeviceType(const std::string &device_str) {
  }
}

+DataFormat ParseDataFormat(const std::string &data_format_str) {
+  if (data_format_str == "NHWC") {
+    return DataFormat::NHWC;
+  } else if (data_format_str == "NCHW") {
+    return DataFormat::NCHW;
+  } else if (data_format_str == "OIHW") {
+    return DataFormat::OIHW;
+  } else {
+    return DataFormat::DF_NONE;
+  }
+}
+
bool RunInference(MaceEngine *engine,
                  const std::map<std::string, mace::MaceTensor> &input_infos,
                  std::map<std::string, mace::MaceTensor> *output_infos,
@@ -168,6 +164,12 @@ DEFINE_string(output_node, "output_node0,output_node1",
              "output nodes, separated by comma");
DEFINE_string(input_shape, "", "input shape, separated by colon and comma");
DEFINE_string(output_shape, "", "output shape, separated by colon and comma");
+DEFINE_string(input_data_format,
+              "NHWC",
+              "input data formats, NONE|NHWC|NCHW");
+DEFINE_string(output_data_format,
+              "NHWC",
+              "output data formats, NONE|NHWC|NCHW");
DEFINE_string(input_file, "", "input file name");
DEFINE_int32(max_num_runs, 100, "max number of runs");
DEFINE_double(max_seconds, 10.0, "max number of seconds to run");
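The ``input_shape``/``output_shape`` flags above use a colon-and-comma encoding: shapes are separated by colons and dimensions by commas. A hypothetical Python equivalent of the ``Split``/``ParseShape`` pair, for illustration only (not MACE code):

```python
def parse_shapes(flag_value):
    # "1,224,224,3:1,100" describes two shapes: [1,224,224,3] and [1,100].
    return [[int(dim) for dim in shape.split(",")]
            for shape in flag_value.split(":") if shape]
```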
@@ -213,14 +215,10 @@ int Main(int argc, char **argv) {
  std::unique_ptr<OpStat> statistician(new OpStat());

-  std::vector<std::string> input_names =
-      str_util::Split(FLAGS_input_node, ',');
-  std::vector<std::string> output_names =
-      str_util::Split(FLAGS_output_node, ',');
-  std::vector<std::string> input_shapes =
-      str_util::Split(FLAGS_input_shape, ':');
-  std::vector<std::string> output_shapes =
-      str_util::Split(FLAGS_output_shape, ':');
+  std::vector<std::string> input_names = Split(FLAGS_input_node, ',');
+  std::vector<std::string> output_names = Split(FLAGS_output_node, ',');
+  std::vector<std::string> input_shapes = Split(FLAGS_input_shape, ':');
+  std::vector<std::string> output_shapes = Split(FLAGS_output_shape, ':');
  const size_t input_count = input_shapes.size();
  const size_t output_count = output_shapes.size();
@@ -233,6 +231,19 @@ int Main(int argc, char **argv) {
    ParseShape(output_shapes[i], &output_shape_vec[i]);
  }

+  std::vector<std::string> raw_input_data_formats =
+      Split(FLAGS_input_data_format, ',');
+  std::vector<std::string> raw_output_data_formats =
+      Split(FLAGS_output_data_format, ',');
+  std::vector<DataFormat> input_data_formats(input_count);
+  std::vector<DataFormat> output_data_formats(output_count);
+  for (size_t i = 0; i < input_count; ++i) {
+    input_data_formats[i] = ParseDataFormat(raw_input_data_formats[i]);
+  }
+  for (size_t i = 0; i < output_count; ++i) {
+    output_data_formats[i] = ParseDataFormat(raw_output_data_formats[i]);
+  }
+
  mace::DeviceType device_type = ParseDeviceType(FLAGS_device);

  // configuration
@@ -273,41 +284,46 @@ int Main(int argc, char **argv) {
  std::shared_ptr<mace::MaceEngine> engine;
  MaceStatus create_engine_status;
  // Create Engine
-  std::vector<unsigned char> model_graph_data;
+  std::unique_ptr<mace::port::ReadOnlyMemoryRegion> model_graph_data;
  if (FLAGS_model_file != "") {
-    if (!mace::ReadBinaryFile(&model_graph_data, FLAGS_model_file)) {
+    auto fs = GetFileSystem();
+    auto status = fs->NewReadOnlyMemoryRegionFromFile(FLAGS_model_file.c_str(),
+                                                      &model_graph_data);
+    if (status != MaceStatus::MACE_SUCCESS) {
      LOG(FATAL) << "Failed to read file: " << FLAGS_model_file;
    }
  }

-  const unsigned char *model_weights_data = nullptr;
-  size_t model_weights_data_size = 0;
+  std::unique_ptr<mace::port::ReadOnlyMemoryRegion> model_weights_data;
  if (FLAGS_model_data_file != "") {
-    MemoryMap(FLAGS_model_data_file,
-              &model_weights_data,
-              &model_weights_data_size);
-    MACE_CHECK(model_weights_data != nullptr && model_weights_data_size != 0);
+    auto fs = GetFileSystem();
+    auto status = fs->NewReadOnlyMemoryRegionFromFile(
+        FLAGS_model_data_file.c_str(),
+        &model_weights_data);
+    if (status != MaceStatus::MACE_SUCCESS) {
+      LOG(FATAL) << "Failed to read file: " << FLAGS_model_data_file;
+    }
+    MACE_CHECK(model_weights_data->length() > 0);
  }

#ifdef MODEL_GRAPH_FORMAT_CODE
-  create_engine_status =
-      CreateMaceEngineFromCode(FLAGS_model_name,
-                               model_weights_data,
-                               model_weights_data_size,
-                               input_names,
-                               output_names,
-                               config,
-                               &engine);
+  create_engine_status = CreateMaceEngineFromCode(FLAGS_model_name,
+      reinterpret_cast<const unsigned char *>(model_weights_data->data()),
+      model_weights_data->length(),
+      input_names,
+      output_names,
+      config,
+      &engine);
#else
-  create_engine_status =
-      CreateMaceEngineFromProto(model_graph_data.data(),
-                                model_graph_data.size(),
-                                model_weights_data,
-                                model_weights_data_size,
-                                input_names,
-                                output_names,
-                                config,
-                                &engine);
+  create_engine_status = CreateMaceEngineFromProto(
+      reinterpret_cast<const unsigned char *>(model_graph_data->data()),
+      model_graph_data->length(),
+      reinterpret_cast<const unsigned char *>(model_weights_data->data()),
+      model_weights_data->length(),
+      input_names,
+      output_names,
+      config,
+      &engine);
#endif
  if (create_engine_status != MaceStatus::MACE_SUCCESS) {
    LOG(FATAL) << "Create engine error, please check the arguments";
@@ -333,7 +349,8 @@ int Main(int argc, char **argv) {
      LOG(INFO) << "Open input file failed";
      return -1;
    }
-    inputs[input_names[i]] = mace::MaceTensor(input_shape_vec[i], buffer_in);
+    inputs[input_names[i]] = mace::MaceTensor(input_shape_vec[i], buffer_in,
+                                              input_data_formats[i]);
  }

  for (size_t i = 0; i < output_count; ++i) {
@@ -344,7 +361,8 @@ int Main(int argc, char **argv) {
    auto buffer_out = std::shared_ptr<float>(new float[output_size],
                                             std::default_delete<float[]>());
    outputs[output_names[i]] = mace::MaceTensor(output_shape_vec[i],
-                                                buffer_out);
+                                                buffer_out,
+                                                output_data_formats[i]);
  }

  int64_t warmup_time_us = 0;
@@ -380,10 +398,6 @@ int Main(int argc, char **argv) {
  statistician->PrintStat();

-  if (model_weights_data != nullptr) {
-    MemoryUnMap(model_weights_data, model_weights_data_size);
-  }
-
  return 0;
}
......
@@ -23,8 +23,7 @@
 * --dsp_model_data_file=dsp_model_data.data \
 * --run_seconds=10
 */
-#include <malloc.h>
-#include <stdint.h>
+#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <iostream>
@@ -33,7 +32,7 @@
#include "gflags/gflags.h"
#include "mace/public/mace.h"
-#include "mace/utils/env_time.h"
+#include "mace/port/env.h"
#include "mace/utils/logging.h"
#include "mace/core/types.h"
......
@@ -10,11 +10,14 @@ licenses(["notice"])  # Apache 2.0
load(
    "//mace:mace.bzl",
    "if_android",
+    "if_android_armv7",
    "if_hexagon_enabled",
-    "if_not_hexagon_enabled",
-    "if_openmp_enabled",
+    "if_hta_enabled",
+    "if_hexagon_or_hta_enabled",
    "if_neon_enabled",
+    "if_not_hexagon_enabled",
    "if_opencl_enabled",
+    "if_openmp_enabled",
    "if_quantize_enabled",
)
@@ -32,17 +35,24 @@ cc_library(
        [
            "runtime/opencl/*.cc",
        ],
-    )) + if_hexagon_enabled(glob([
-        "runtime/hexagon/*.cc",
-    ])),
+    )) + if_hexagon_enabled([
+        "runtime/hexagon/hexagon_dsp_wrapper.cc",
+    ]) + if_hta_enabled([
+        "runtime/hexagon/hexagon_hta_wrapper.cc",
+    ]),
    hdrs = glob([
        "*.h",
        "runtime/cpu/*.h",
-    ]) + if_opencl_enabled(glob(
-        [
-            "runtime/opencl/*.h",
-        ],
-    )) + if_hexagon_enabled(glob(["runtime/hexagon/*.h"])),
+    ]) + if_opencl_enabled(glob([
+        "runtime/opencl/*.h",
+    ])) + if_hexagon_or_hta_enabled(glob([
+        "runtime/hexagon/hexagon_control_wrapper.h",
+        "runtime/hexagon/hexagon_device.h",
+    ])) + if_hexagon_enabled(glob([
+        "runtime/hexagon/*dsp*.h",
+    ])) + if_hta_enabled(glob([
+        "runtime/hexagon/*hta*.h",
+    ])),
    copts = [
        "-Werror",
        "-Wextra",
@@ -56,17 +66,20 @@ cc_library(
        "-DMACE_ENABLE_QUANTIZE",
    ]) + if_hexagon_enabled([
        "-DMACE_ENABLE_HEXAGON",
+    ]) + if_hta_enabled([
+        "-DMACE_ENABLE_HTA",
    ]) + if_neon_enabled([
        "-DMACE_ENABLE_NEON",
+    ]) + if_android_armv7([
+        "-mfpu=neon",
+        "-mfloat-abi=softfp",
    ]),
-    linkopts = ["-ldl"] + if_android([
-        "-pie",
-        "-lm",
-    ]),
+    linkopts = ["-ldl"],
    deps = [
        "//mace/codegen:generated_version",
        "//mace/proto:mace_cc",
        "//mace/utils",
+        "//mace/port",
        "@half//:half",
    ] + if_opencl_enabled([
        ":opencl_headers",
@@ -75,6 +88,8 @@ cc_library(
        "@gemmlowp",
    ]) + if_hexagon_enabled([
        "//third_party/nnlib:libhexagon",
+    ]) + if_hta_enabled([
+        "//third_party/hta",
    ]),
)
......
@@ -15,14 +15,13 @@
#ifndef MACE_CORE_ALLOCATOR_H_
#define MACE_CORE_ALLOCATOR_H_

-#include <stdlib.h>
-#include <string.h>
+#include <cstdlib>
#include <map>
#include <limits>
#include <vector>
#include <cstring>

-#include "mace/core/macros.h"
+#include "mace/utils/macros.h"
#include "mace/core/types.h"
#include "mace/core/runtime_failure_mock.h"
#include "mace/public/mace.h"
......
@@ -21,8 +21,9 @@
#include <functional>

#include "mace/core/allocator.h"
-#include "mace/core/macros.h"
#include "mace/core/types.h"
+#include "mace/utils/logging.h"
+#include "mace/utils/macros.h"

namespace mace {
namespace core {
@@ -434,16 +435,11 @@ class BufferSlice : public BufferBase {
  }

  void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
-    MACE_UNUSED(offset);
-    MACE_UNUSED(length);
-    MACE_UNUSED(pitch);
-    MACE_NOT_IMPLEMENTED;
-    return nullptr;
+    return buffer_->Map(offset_ + offset, length, pitch);
  }

  void UnMap(void *mapped_ptr) const {
-    MACE_UNUSED(mapped_ptr);
-    MACE_NOT_IMPLEMENTED;
+    buffer_->UnMap(mapped_ptr);
  }

  void Map(std::vector<size_t> *pitch) {
@@ -507,7 +503,7 @@ class ScratchBuffer: public Buffer {
  virtual ~ScratchBuffer() {}

  MaceStatus GrowSize(const index_t size) {
-    if (size > size_) {
+    if (offset_ + size > size_) {
      VLOG(1) << "Grow scratch size to: " << size;
      MACE_CHECK(offset_ == 0, "scratch is being used, cannot grow size");
      return Resize(size);
......
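The ``GrowSize`` change above fixes a sizing check: the old test compared the requested size against total capacity, ignoring bytes already handed out at ``offset_``. A minimal Python sketch of the corrected behavior (names assumed for illustration, not MACE's actual API):

```python
class ScratchBuffer:
    def __init__(self):
        self.size = 0    # total capacity
        self.offset = 0  # bytes already handed out

    def grow_size(self, size):
        # Grow only if the request does not fit in the remaining space.
        if self.offset + size > self.size:
            assert self.offset == 0, "scratch is being used, cannot grow size"
            self.size = size
```

With the old ``size > size_`` check, a request that fit the total capacity but not the remaining space would silently skip growing and later overflow.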
@@ -15,16 +15,17 @@
#include "mace/core/device.h"

#include "mace/core/buffer.h"
+#include "mace/utils/memory.h"

namespace mace {

CPUDevice::CPUDevice(const int num_threads,
                     const CPUAffinityPolicy policy,
                     const bool use_gemmlowp)
-    : cpu_runtime_(new CPURuntime(num_threads,
-                                  policy,
-                                  use_gemmlowp)),
-      scratch_buffer_(new ScratchBuffer(GetCPUAllocator())) {}
+    : cpu_runtime_(make_unique<CPURuntime>(num_threads,
+                                           policy,
+                                           use_gemmlowp)),
+      scratch_buffer_(make_unique<ScratchBuffer>(GetCPUAllocator())) {}

CPUDevice::~CPUDevice() = default;
......
@@ -20,11 +20,10 @@
#include <vector>

#include "mace/utils/logging.h"
+#include "mace/public/mace.h"

namespace mace {

-struct CallStats;
-
// Wait the call to finish and get the stats if param is not nullptr
struct StatsFuture {
  std::function<void(CallStats *)> wait_fn = [](CallStats *stats) {
......
@@ -13,18 +13,18 @@
// limitations under the License.

#include <fcntl.h>
-#include <limits.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

+#include <climits>
#include <algorithm>
#include <cstring>
#include <memory>
#include <utility>

#include "mace/core/kv_storage.h"
-#include "mace/core/macros.h"
+#include "mace/utils/macros.h"
#include "mace/utils/logging.h"

namespace mace {
......
...@@ -21,8 +21,9 @@ ...@@ -21,8 +21,9 @@
#include <unordered_set> #include <unordered_set>
#include "mace/core/arg_helper.h" #include "mace/core/arg_helper.h"
#include "mace/core/macros.h" #include "mace/utils/macros.h"
#include "mace/utils/logging.h" #include "mace/utils/logging.h"
#include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/opencl_util.h" #include "mace/core/runtime/opencl/opencl_util.h"
...@@ -61,12 +62,22 @@ void MemoryOptimizer::UpdateTensorRef(const mace::OperatorDef *op_def) { ...@@ -61,12 +62,22 @@ void MemoryOptimizer::UpdateTensorRef(const mace::OperatorDef *op_def) {
} }
MemoryBlock MemoryOptimizer::CreateMemoryBlock( MemoryBlock MemoryOptimizer::CreateMemoryBlock(
std::vector<int64_t> shape, const OperatorDef *op_def,
int output_idx,
DataType dt, DataType dt,
mace::MemoryType mem_type) { MemoryType mem_type) {
auto shape = std::vector<int64_t>(
op_def->output_shape(output_idx).dims().begin(),
op_def->output_shape(output_idx).dims().end());
MemoryBlock block; MemoryBlock block;
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
if (mem_type == MemoryType::GPU_IMAGE) { if (mem_type == MemoryType::GPU_IMAGE) {
OpenCLBufferType buffer_type = OpenCLBufferType::IN_OUT_CHANNEL;
if (op_def->type() == "BufferTransform") {
buffer_type = static_cast<OpenCLBufferType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op_def, "buffer_type", OpenCLBufferType::IN_OUT_CHANNEL));
}
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
if (shape.size() == 1) { if (shape.size() == 1) {
shape = {shape[0], 1, 1, 1}; shape = {shape[0], 1, 1, 1};
...@@ -75,9 +86,7 @@ MemoryBlock MemoryOptimizer::CreateMemoryBlock( ...@@ -75,9 +86,7 @@ MemoryBlock MemoryOptimizer::CreateMemoryBlock(
} else { } else {
MACE_CHECK(shape.size() == 4) << "GPU only support 1D/2D/4D input"; MACE_CHECK(shape.size() == 4) << "GPU only support 1D/2D/4D input";
} }
OpenCLUtil::CalImage2DShape(shape, OpenCLUtil::CalImage2DShape(shape, buffer_type, &image_shape);
OpenCLBufferType::IN_OUT_CHANNEL,
&image_shape);
block.set_x(image_shape[0]); block.set_x(image_shape[0]);
block.set_y(image_shape[1]); block.set_y(image_shape[1]);
return block; return block;
...@@ -95,7 +104,7 @@ MemoryBlock MemoryOptimizer::CreateMemoryBlock( ...@@ -95,7 +104,7 @@ MemoryBlock MemoryOptimizer::CreateMemoryBlock(
void MemoryOptimizer::Optimize( void MemoryOptimizer::Optimize(
const mace::OperatorDef *op_def, const mace::OperatorDef *op_def,
const std::unordered_map<std::string, MemoryType> &mem_types) { const std::unordered_map<std::string, MemoryType> *mem_types) {
MACE_LATENCY_LOGGER(2, "Optimize memory"); MACE_LATENCY_LOGGER(2, "Optimize memory");
if (op_def->output_size() != op_def->output_shape_size()) { if (op_def->output_size() != op_def->output_shape_size()) {
VLOG(1) << op_def->name() VLOG(1) << op_def->name()
...@@ -117,6 +126,8 @@ void MemoryOptimizer::Optimize( ...@@ -117,6 +126,8 @@ void MemoryOptimizer::Optimize(
op_def->output_type_size()); op_def->output_type_size());
DataType dt; DataType dt;
bool has_data_format = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op_def, "has_data_format", 0) != 0;
int output_size = op_def->output_size(); int output_size = op_def->output_size();
for (int i = 0; i < output_size; ++i) { for (int i = 0; i < output_size; ++i) {
if (i < op_def->output_type_size()) { if (i < op_def->output_type_size()) {
...@@ -127,22 +138,15 @@ void MemoryOptimizer::Optimize( ...@@ -127,22 +138,15 @@ void MemoryOptimizer::Optimize(
int best_mem_id = -1; int best_mem_id = -1;
MemoryType mem_type = MemoryType::CPU_BUFFER; MemoryType mem_type = MemoryType::CPU_BUFFER;
if (device == DeviceType::GPU) { if (device == DeviceType::GPU) {
mem_type = mem_types.at(op_def->output(i)); mem_type = mem_types->at(op_def->output(i));
} }
auto shape = std::vector<int64_t>( MemoryBlock op_mem_block = CreateMemoryBlock(op_def, i, dt, mem_type);
op_def->output_shape(i).dims().begin(),
op_def->output_shape(i).dims().end());
MemoryBlock op_mem_block = CreateMemoryBlock(shape, dt, mem_type);
MemoryBlock best_mem_block; MemoryBlock best_mem_block;
if (IsMemoryReuseOp(op_def->type())) { if (IsMemoryReuseOp(op_def->type())) {
if (tensor_mem_map_.count(op_def->input(0)) == 1) { if (tensor_mem_map_.count(op_def->input(0)) == 1) {
best_mem_id = tensor_mem_map_[op_def->input(0)].first; best_mem_id = tensor_mem_map_.at(op_def->input(0)).mem_id;
} }
} else { } else {
auto shape = std::vector<int64_t>(
op_def->output_shape(i).dims().begin(),
op_def->output_shape(i).dims().end());
int64_t op_mem_size = op_mem_block.x() * op_mem_block.y(); int64_t op_mem_size = op_mem_block.x() * op_mem_block.y();
int64_t best_added_mem_size = LLONG_MAX; int64_t best_added_mem_size = LLONG_MAX;
int64_t best_wasted_mem_size = LLONG_MAX; int64_t best_wasted_mem_size = LLONG_MAX;
@@ -206,7 +210,8 @@ void MemoryOptimizer::Optimize(
     } else {
       mem_ref_count_[best_mem_id] = 1;
     }
-    tensor_mem_map_[op_def->output(i)] = std::make_pair(best_mem_id, dt);
+    tensor_mem_map_.emplace(op_def->output(i),
+                            TensorMemInfo(best_mem_id, dt, has_data_format));
   }
 }
@@ -218,7 +223,7 @@ void MemoryOptimizer::Optimize(
     tensor_ref_count_[input_name] -= 1;
     if (tensor_ref_count_.at(input_name) == 0 &&
         tensor_mem_map_.count(input_name) == 1) {
-      int mem_id = tensor_mem_map_.at(input_name).first;
+      int mem_id = tensor_mem_map_.at(input_name).mem_id;
       mem_ref_count_[mem_id] -= 1;
       if (mem_ref_count_.at(mem_id) == 0) {
         idle_blocks_.insert(mem_id);
@@ -238,7 +243,7 @@ const std::vector<MemoryBlock>& MemoryOptimizer::mem_blocks() const {
   return mem_blocks_;
 }
-const std::unordered_map<std::string, std::pair<int, DataType>>&
+const std::unordered_map<std::string, MemoryOptimizer::TensorMemInfo>&
 MemoryOptimizer::tensor_mem_map() const {
   return tensor_mem_map_;
 }
...
@@ -77,31 +77,44 @@ class MemoryBlock {
 };

 class MemoryOptimizer {
+ public:
+  struct TensorMemInfo {
+    int mem_id;
+    DataType data_type;
+    bool has_data_format;
+
+    TensorMemInfo(int mem_id, DataType data_type, bool has_data_format) :
+        mem_id(mem_id), data_type(data_type), has_data_format(has_data_format)
+    {}
+  };
+
  public:
   static bool IsMemoryReuseOp(const std::string &op_type);
   void UpdateTensorRef(const std::string &tensor_name);
   void UpdateTensorRef(const OperatorDef *op_def);
-  void Optimize(const OperatorDef *op_def,
-                const std::unordered_map<std::string, MemoryType> &mem_types);
+  void Optimize(
+      const OperatorDef *op_def,
+      const std::unordered_map<std::string, MemoryType> *mem_types = nullptr);

   const std::vector<MemoryBlock> &mem_blocks() const;

-  const std::unordered_map<std::string,
-                           std::pair<int, DataType>> &tensor_mem_map() const;
+  const std::unordered_map<std::string, TensorMemInfo> &tensor_mem_map() const;

   std::string DebugInfo() const;

 private:
-  MemoryBlock CreateMemoryBlock(std::vector<int64_t> shape,
-                                DataType dt,
-                                MemoryType mem_type);
+  MemoryBlock CreateMemoryBlock(
+      const OperatorDef *op_def,
+      int output_idx,
+      DataType dt,
+      MemoryType mem_type);

 private:
   std::unordered_map<std::string, int> tensor_ref_count_;
   std::vector<MemoryBlock> mem_blocks_;
   // tensor name : <mem_id, data_type>
   // Buffer memory does not distinguish data types, so store the data type.
-  std::unordered_map<std::string, std::pair<int, DataType>> tensor_mem_map_;
+  std::unordered_map<std::string, TensorMemInfo> tensor_mem_map_;
   std::unordered_map<int, int> mem_ref_count_;
   std::set<int> idle_blocks_;
 };
...
@@ -19,14 +19,17 @@
 #include <utility>

 #include "mace/core/future.h"
-#include "mace/core/macros.h"
 #include "mace/core/memory_optimizer.h"
 #include "mace/core/net.h"
 #include "mace/core/op_context.h"
 #include "mace/public/mace.h"
-#include "mace/utils/memory_logging.h"
+#include "mace/port/env.h"
+#include "mace/utils/conf_util.h"
+#include "mace/utils/logging.h"
+#include "mace/utils/macros.h"
+#include "mace/utils/math.h"
+#include "mace/utils/memory.h"
 #include "mace/utils/timer.h"
-#include "mace/utils/utils.h"

 #ifdef MACE_ENABLE_OPENCL
 #include "mace/core/runtime/opencl/opencl_util.h"
@@ -38,12 +41,15 @@ namespace {
 struct InternalOutputInfo {
   InternalOutputInfo(const MemoryType mem_type,
                      const DataType dtype,
+                     const DataFormat data_format,
                      const std::vector<index_t> &shape,
                      int op_idx)
-      : mem_type(mem_type), dtype(dtype), shape(shape), op_idx(op_idx) {}
+      : mem_type(mem_type), dtype(dtype), data_format(data_format),
+        shape(shape), op_idx(op_idx) {}

   MemoryType mem_type;  // transformed memory type
   DataType dtype;
+  DataFormat data_format;
   std::vector<index_t> shape;  // tensor shape
   int op_idx;  // the operation that generates the tensor
 };
@@ -70,12 +76,12 @@ std::unique_ptr<Operation> SerialNet::CreateOperation(
     const OpRegistryBase *op_registry,
     OpConstructContext *construct_context,
     std::shared_ptr<OperatorDef> op_def,
-    DataFormat data_format_flag,
+    bool has_data_format,
     bool is_quantize_model) {
   // Create the Operation
   DeviceType target_device_type = target_device_->device_type();
   DeviceType device_type = DeviceType::CPU;
-  construct_context->set_device(cpu_device_);
+  construct_context->set_device(cpu_device_.get());
   construct_context->set_operator_def(op_def);
   construct_context->set_output_mem_type(MemoryType::CPU_BUFFER);
   // Get available devices
@@ -100,8 +106,7 @@ std::unique_ptr<Operation> SerialNet::CreateOperation(
   if (!is_quantize_model && device_type == DeviceType::CPU &&
       op_def->output_shape_size() == op_def->output_size()) {
     for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) {
-      if (data_format_flag == NHWC &&
-          op_def->output_shape(out_idx).dims_size() == 4) {
+      if (has_data_format && op_def->output_shape(out_idx).dims_size() == 4) {
         // NHWC -> NCHW
         std::vector<index_t> output_shape =
             TransposeShape<index_t, index_t>(
@@ -115,9 +120,8 @@ std::unique_ptr<Operation> SerialNet::CreateOperation(
       }
     }
   }
-  std::unique_ptr<Operation> op(
-      op_registry->CreateOperation(construct_context, device_type));
-  return std::move(op);
+  return op_registry->CreateOperation(construct_context, device_type);
 }

 SerialNet::SerialNet(const OpRegistryBase *op_registry,
@@ -129,17 +133,11 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
       ws_(ws),
       target_device_(target_device),
       cpu_device_(
-          new CPUDevice(target_device->cpu_runtime()->num_threads(),
-                        target_device->cpu_runtime()->policy(),
-                        target_device->cpu_runtime()->use_gemmlowp())) {
+          make_unique<CPUDevice>(
+              target_device->cpu_runtime()->num_threads(),
+              target_device->cpu_runtime()->policy(),
+              target_device->cpu_runtime()->use_gemmlowp())) {
   MACE_LATENCY_LOGGER(1, "Constructing SerialNet");
-  // output tensor : related information
-  std::unordered_map<std::string, InternalOutputInfo> output_map;
-  // used for memory optimization
-  std::unordered_map<std::string, MemoryType> output_mem_map;
-  std::unordered_set<std::string> transformed_set;
-  // add input information
-  MemoryType target_mem_type;
   // quantize model flag
   bool is_quantize_model = IsQuantizedModel(*net_def);
   // Tensor Shape map
@@ -149,20 +147,18 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
       continue;
     }
     for (int i = 0; i < op.output_size(); ++i) {
-      tensor_shape_map[op.output(i)] =
-          std::move(std::vector<index_t>(op.output_shape(i).dims().begin(),
-                                         op.output_shape(i).dims().end()));
+      tensor_shape_map[op.output(i)] = std::vector<index_t>(
+          op.output_shape(i).dims().begin(),
+          op.output_shape(i).dims().end());
     }
   }
   for (auto &tensor : net_def->tensors()) {
-    tensor_shape_map[tensor.name()] =
-        std::move(std::vector<index_t>(tensor.dims().begin(),
-                                       tensor.dims().end()));
+    tensor_shape_map[tensor.name()] =
+        std::vector<index_t>(tensor.dims().begin(), tensor.dims().end());
   }

-  DataFormat data_format_flag = NHWC;
+  bool has_data_format = false;
   if (target_device_->device_type() == DeviceType::CPU) {
-    target_mem_type = MemoryType::CPU_BUFFER;
     for (auto &input_info : net_def->input_info()) {
       std::vector<index_t> input_shape =
           std::vector<index_t>(input_info.dims().begin(),
@@ -170,38 +166,45 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
       // update tensor shape map
       tensor_shape_map[input_info.name()] = input_shape;
       // Only could be NONE or NHWC
-      auto input_data_format = static_cast<DataFormat>(
+      DataFormat input_data_format = static_cast<DataFormat>(
           input_info.data_format());
-      if (!is_quantize_model && input_data_format == NHWC &&
+      has_data_format = has_data_format ||
+          (input_data_format != DataFormat::DF_NONE);
+      if (!is_quantize_model && input_data_format == DataFormat::NHWC &&
           input_info.dims_size() == 4) {
         // NHWC -> NCHW
         input_shape =
             TransposeShape<index_t, index_t>(input_shape, {0, 3, 1, 2});
-      } else if (input_data_format == DataFormat::DF_NONE) {
-        data_format_flag = DataFormat::DF_NONE;
       }
-      output_map.emplace(input_info.name(), InternalOutputInfo(
-          target_mem_type, DataType::DT_FLOAT, input_shape, -1));
     }
   }
 #ifdef MACE_ENABLE_OPENCL
-  else {  // GPU  NOLINT[readability/braces]
+  // output tensor : related information
+  std::unordered_map<std::string, InternalOutputInfo> output_map;
+  // used for memory optimization
+  std::unordered_map<std::string, MemoryType> output_mem_map;
+  std::unordered_set<std::string> transformed_set;
+  // add input information
+  MemoryType target_mem_type;
+  // default data format of output tensor
+  DataFormat default_output_df = DataFormat::DF_NONE;
+  if (target_device_->device_type() == DeviceType::GPU) {
     target_mem_type = MemoryType::GPU_BUFFER;
     for (auto &input_info : net_def->input_info()) {
-      auto input_data_format = static_cast<DataFormat>(
+      DataFormat input_data_format = static_cast<DataFormat>(
           input_info.data_format());
-      if (input_data_format == DataFormat::DF_NONE) {
-        data_format_flag = DataFormat::DF_NONE;
-      }
+      has_data_format = input_data_format != DataFormat::DF_NONE;
       std::vector<index_t> input_shape =
           std::vector<index_t>(input_info.dims().begin(),
                                input_info.dims().end());
       // update tensor shape map
       tensor_shape_map[input_info.name()] = input_shape;
       output_map.emplace(input_info.name(), InternalOutputInfo(
-          target_mem_type, DataType::DT_FLOAT, input_shape, -1));
+          target_mem_type, DataType::DT_FLOAT, input_data_format,
+          input_shape, -1));
     }
+    default_output_df =
+        has_data_format ? DataFormat::NHWC : DataFormat::DF_NONE;
   }
 #endif  // MACE_ENABLE_OPENCL
@@ -212,7 +215,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
     auto op = CreateOperation(op_registry,
                               &construct_context,
                               op_def,
-                              data_format_flag,
+                              has_data_format,
                               is_quantize_model);
 #ifdef MACE_ENABLE_OPENCL
     // Add input transform operation if necessary
@@ -246,11 +249,13 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
                   << output_info.mem_type << " to "
                   << wanted_in_mem_type
                   << ", from Data Type " << output_info.dtype << " to "
-                  << wanted_in_dt;
+                  << wanted_in_dt << ". with data format "
+                  << output_info.data_format;
           std::string input_name = op_def->input(i);
           op_def->set_input(i, t_input_name);
           auto input_shape = output_info.shape;
           if (output_info.mem_type == MemoryType::CPU_BUFFER &&
+              output_info.data_format == DataFormat::NCHW &&
               input_shape.size() == 4) {
             // NCHW -> NHWC
             input_shape =
@@ -258,14 +263,15 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
                                                  {0, 2, 3, 1});
           }
           auto transform_op_def = OpenCLUtil::CreateTransformOpDef(
-              input_name, input_shape, t_input_name,
-              wanted_in_dt, wanted_in_mem_type, data_format_flag);
+              input_name, input_shape, t_input_name, wanted_in_dt,
+              construct_context.GetInputOpenCLBufferType(i),
+              wanted_in_mem_type, has_data_format);
           OpConstructContext t_construct_context(ws_);
           auto transform_op = CreateOperation(
               op_registry,
               &t_construct_context,
               transform_op_def,
-              data_format_flag);
+              has_data_format);
           operators_.emplace_back(std::move(transform_op));
           transformed_set.insert(t_input_name);
           output_mem_map[t_input_name] = wanted_in_mem_type;
@@ -299,6 +305,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
                        InternalOutputInfo(
                            out_mem_type,
                            dt,
+                           default_output_df,
                            op_def->output_shape().empty() ?
                            std::vector<index_t>() :
                            std::vector<index_t>(
@@ -340,20 +347,21 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
           output_mem_map[output_info.name()] = target_mem_type;
         }
       }
-      auto output_data_format =
+      bool output_has_data_format =
           static_cast<DataFormat>(output_info.data_format());
       auto transform_op_def = OpenCLUtil::CreateTransformOpDef(
           t_output_name,
           internal_output_info.shape,
           output_info.name(),
           output_info.data_type(),
+          OpenCLBufferType::IN_OUT_CHANNEL,
           target_mem_type,
-          data_format_flag);
+          output_has_data_format);
       auto transform_op = CreateOperation(
           op_registry,
           &construct_context,
           transform_op_def,
-          output_data_format);
+          output_has_data_format);
       operators_.emplace_back(std::move(transform_op));
       // where to do graph reference count.
       mem_optimizer->UpdateTensorRef(transform_op_def.get());
@@ -370,7 +378,11 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
   for (auto &op : operators_) {
     VLOG(2) << "Operator " << op->debug_def().name() << "<" << op->device_type()
             << ", " << op->debug_def().type() << ">";
-    mem_optimizer->Optimize(op->operator_def().get(), output_mem_map);
+#ifdef MACE_ENABLE_OPENCL
+    mem_optimizer->Optimize(op->operator_def().get(), &output_mem_map);
+#else
+    mem_optimizer->Optimize(op->operator_def().get());
+#endif  // MACE_ENABLE_OPENCL
   }
   VLOG(1) << mem_optimizer->DebugInfo();
 }
@@ -384,7 +396,7 @@ MaceStatus SerialNet::Init() {
     if (device_type == target_device_->device_type()) {
       init_context.set_device(target_device_);
     } else {
-      init_context.set_device(cpu_device_);
+      init_context.set_device(cpu_device_.get());
     }
     // Initialize the operation
     MACE_RETURN_IF_ERROR(op->Init(&init_context));
@@ -395,7 +407,7 @@ MaceStatus SerialNet::Init() {
 MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
   MACE_MEMORY_LOGGING_GUARD();
   MACE_LATENCY_LOGGER(1, "Running net");
-  OpContext context(ws_, cpu_device_);
+  OpContext context(ws_, cpu_device_.get());
   for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
     auto &op = *iter;
     DeviceType device_type = op->device_type();
@@ -408,7 +420,7 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
     if (device_type == target_device_->device_type()) {
       context.set_device(target_device_);
     } else {
-      context.set_device(cpu_device_.get());
+      context.set_device(cpu_device_.get());
     }

     CallStats call_stats;
@@ -452,7 +464,7 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
         bool transpose_a = op->GetOptionalArg<bool>("transpose_a", false);
         kernels = op->Input(0)->shape();
         if (transpose_a) {
-          std::swap(kernels[kernels.size()-2], kernels[kernels.size()-1]);
+          std::swap(kernels[kernels.size() - 2], kernels[kernels.size() - 1]);
         }
       } else if (type.compare("FullyConnected") == 0) {
         kernels = op->Input(1)->shape();
@@ -472,7 +484,7 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
     VLOG(3) << "Operator " << op->debug_def().name()
             << " has shape: " << MakeString(op->Output(0)->shape());

-    if (EnvEnabled("MACE_LOG_TENSOR_RANGE")) {
+    if (EnvConfEnabled("MACE_LOG_TENSOR_RANGE")) {
       for (int i = 0; i < op->OutputSize(); ++i) {
         if (op->debug_def().quantize_info_size() == 0) {
           int data_type = op->GetOptionalArg("T", static_cast<int>(DT_FLOAT));
@@ -498,16 +510,16 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
           Tensor::MappingGuard guard(op->Output(i));
           auto *output_data = op->Output(i)->data<float>();
           for (index_t j = 0; j < op->Output(i)->size(); ++j) {
             int index = static_cast<int>((output_data[j] - min_v) / bin_v);
             if (index < 0)
               index = 0;
-            else if (index > bin_size-1)
-              index = bin_size-1;
+            else if (index > bin_size - 1)
+              index = bin_size - 1;
             bin_distribution[index]++;
           }
           LOG(INFO) << "Tensor range @@" << op->debug_def().output(i)
-                    << "@@" << min_v << "," << max_v<< "@@"
+                    << "@@" << min_v << "," << max_v << "@@"
                     << MakeString(bin_distribution);
         }
       }
     }
...
@@ -59,14 +59,14 @@ class SerialNet : public NetBase {
       const OpRegistryBase *op_registry,
       OpConstructContext *construct_context,
       std::shared_ptr<OperatorDef> op_def,
-      DataFormat input_format,
+      bool has_data_format,
       bool is_quantize_model = false);

  protected:
   Workspace *ws_;
   Device *target_device_;
   // CPU is base device.
-  Device *cpu_device_;
+  std::unique_ptr<Device> cpu_device_;
   std::vector<std::unique_ptr<Operation> > operators_;

   MACE_DISABLE_COPY_AND_ASSIGN(SerialNet);
...
@@ -86,6 +86,27 @@ DataType OpConstructContext::GetInputDataType(size_t idx) const {
   return input_data_types_[idx];
 }

+#ifdef MACE_ENABLE_OPENCL
+void OpConstructContext::SetInputOpenCLBufferType(
+    size_t idx, OpenCLBufferType buffer_type) {
+  if (input_opencl_buffer_types_.empty()) {
+    // the default inputs' memory types are same as output memory type.
+    input_opencl_buffer_types_.resize(operator_def_->input_size(),
+                                      OpenCLBufferType::IN_OUT_CHANNEL);
+  }
+  MACE_CHECK(idx < input_opencl_buffer_types_.size());
+  input_opencl_buffer_types_[idx] = buffer_type;
+}
+
+OpenCLBufferType OpConstructContext::GetInputOpenCLBufferType(
+    size_t idx) const {
+  if (input_opencl_buffer_types_.empty()) {
+    return OpenCLBufferType::IN_OUT_CHANNEL;
+  }
+  MACE_CHECK(idx < input_opencl_buffer_types_.size());
+  return input_opencl_buffer_types_[idx];
+}
+#endif  // MACE_ENABLE_OPENCL
+
 OpInitContext::OpInitContext(Workspace *ws, Device *device)
     : ws_(ws), device_(device) {}
...
@@ -26,6 +26,9 @@
 #include "mace/core/tensor.h"
 #include "mace/core/workspace.h"
 #include "mace/proto/mace.pb.h"
+#ifdef MACE_ENABLE_OPENCL
+#include "mace/core/runtime/opencl/opencl_util.h"
+#endif  // MACE_ENABLE_OPENCL

 namespace mace {
@@ -72,6 +75,11 @@ class OpConstructContext {
   DataType GetInputDataType(size_t idx) const;

+#ifdef MACE_ENABLE_OPENCL
+  void SetInputOpenCLBufferType(size_t idx, OpenCLBufferType buffer_type);
+  OpenCLBufferType GetInputOpenCLBufferType(size_t idx) const;
+#endif  // MACE_ENABLE_OPENCL
+
  private:
   std::shared_ptr<OperatorDef> operator_def_;
   Workspace *ws_;
@@ -81,6 +89,9 @@ class OpConstructContext {
   std::vector<MemoryType> input_mem_types_;
   std::vector<DataType> input_data_types_;
   MemoryType output_mem_type_;  // there is only one output memory type now.
+#ifdef MACE_ENABLE_OPENCL
+  std::vector<OpenCLBufferType> input_opencl_buffer_types_;
+#endif  // MACE_ENABLE_OPENCL
 };

 // memory_optimizer, device
...
@@ -18,9 +18,6 @@
 #include <omp.h>
 #endif

-#include <unistd.h>
-#include <sys/syscall.h>
-#include <sys/types.h>
 #include <algorithm>
 #include <cerrno>
 #include <cstring>
@@ -29,8 +26,9 @@
 #include <utility>
 #include <vector>

-#include "mace/core/macros.h"
+#include "mace/port/env.h"
 #include "mace/public/mace.h"
+#include "mace/utils/macros.h"
 #include "mace/utils/logging.h"

 namespace mace {
@@ -42,101 +40,36 @@ struct CPUFreq {
   float freq;
 };

-namespace {
-
-int GetCPUCount() {
-  int cpu_count = 0;
-  std::string cpu_sys_conf = "/proc/cpuinfo";
-  std::ifstream f(cpu_sys_conf);
-  if (!f.is_open()) {
-    LOG(ERROR) << "failed to open " << cpu_sys_conf;
-    return -1;
-  }
-  std::string line;
-  const std::string processor_key = "processor";
-  while (std::getline(f, line)) {
-    if (line.size() >= processor_key.size()
-        && line.compare(0, processor_key.size(), processor_key) == 0) {
-      ++cpu_count;
-    }
-  }
-  if (f.bad()) {
-    LOG(ERROR) << "failed to read " << cpu_sys_conf;
-  }
-  if (!f.eof()) {
-    LOG(ERROR) << "failed to read end of " << cpu_sys_conf;
-  }
-  f.close();
-  VLOG(2) << "CPU cores: " << cpu_count;
-  return cpu_count;
-}
-
-int GetCPUMaxFreq(std::vector<float> *max_freqs) {
-  int cpu_count = GetCPUCount();
-  for (int cpu_id = 0; cpu_id < cpu_count; ++cpu_id) {
-    std::string cpuinfo_max_freq_sys_conf = MakeString(
-        "/sys/devices/system/cpu/cpu",
-        cpu_id,
-        "/cpufreq/cpuinfo_max_freq");
-    std::ifstream f(cpuinfo_max_freq_sys_conf);
-    if (!f.is_open()) {
-      LOG(ERROR) << "failed to open " << cpuinfo_max_freq_sys_conf;
-      return -1;
-    }
-    std::string line;
-    if (std::getline(f, line)) {
-      float freq = strtof(line.c_str(), nullptr);
-      max_freqs->push_back(freq);
-    }
-    if (f.bad()) {
-      LOG(ERROR) << "failed to read " << cpuinfo_max_freq_sys_conf;
-    }
-    f.close();
-  }
-  for (float freq : *max_freqs) {
-    VLOG(2) << "CPU freq: " << freq;
-  }
-  return 0;
-}
-
-MaceStatus SetThreadAffinity(cpu_set_t mask) {
-#if defined(__ANDROID__)
-  pid_t pid = gettid();
-#else
-  pid_t pid = syscall(SYS_gettid);
-#endif
-  int err = sched_setaffinity(pid, sizeof(mask), &mask);
-  if (err) {
-    LOG(WARNING) << "set affinity error: " << strerror(errno);
-    return MaceStatus(MaceStatus::MACE_INVALID_ARGS,
-                      "set affinity error: " + std::string(strerror(errno)));
-  } else {
-    return MaceStatus::MACE_SUCCESS;
-  }
-}
+enum SchedulePolicy {
+  SCHED_STATIC,
+  SCHED_GUIDED,
+};

+namespace {
 MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
-                                           const std::vector<size_t> &cpu_ids) {
+                                           const std::vector<size_t> &cpu_ids,
+                                           SchedulePolicy schedule_policy) {
   MaceOpenMPThreadCount = omp_num_threads;

 #ifdef MACE_ENABLE_OPENMP
   VLOG(1) << "Set OpenMP threads number: " << omp_num_threads
           << ", CPU core IDs: " << MakeString(cpu_ids);
-  omp_set_schedule(omp_sched_guided, 1);
+  if (schedule_policy == SCHED_GUIDED) {
+    omp_set_schedule(omp_sched_guided, 1);
+  } else if (schedule_policy == SCHED_STATIC) {
+    omp_set_schedule(omp_sched_static, 0);
+  } else {
+    LOG(WARNING) << "Unknown schedule policy: " << schedule_policy;
+  }
   omp_set_num_threads(omp_num_threads);
 #else
   MACE_UNUSED(omp_num_threads);
+  MACE_UNUSED(schedule_policy);
   LOG(WARNING) << "Set OpenMP threads number failed: OpenMP not enabled.";
 #endif

-  // compute mask
-  cpu_set_t mask;
-  CPU_ZERO(&mask);
-  for (auto cpu_id : cpu_ids) {
-    CPU_SET(cpu_id, &mask);
-  }
-
 #ifdef MACE_ENABLE_OPENMP
   std::vector<MaceStatus> status(omp_num_threads,
                                  MaceStatus::MACE_INVALID_ARGS);
@@ -144,7 +77,7 @@ MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
   for (int i = 0; i < omp_num_threads; ++i) {
     VLOG(1) << "Set affinity for OpenMP thread " << omp_get_thread_num()
             << "/" << omp_get_num_threads();
-    status[i] = SetThreadAffinity(mask);
+    status[i] = SchedSetAffinity(cpu_ids);
   }
   for (int i = 0; i < omp_num_threads; ++i) {
     if (status[i] != MaceStatus::MACE_SUCCESS)
@@ -152,8 +85,8 @@ MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
   }
   return MaceStatus::MACE_SUCCESS;
 #else
-  MaceStatus status = SetThreadAffinity(mask);
-  VLOG(1) << "Set affinity without OpenMP: " << mask.__bits[0];
+  MaceStatus status = SchedSetAffinity(cpu_ids);
+  VLOG(1) << "Set affinity without OpenMP: " << MakeString(cpu_ids);
   return status;
 #endif
 }
@@ -166,8 +99,9 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
     void *gemm_context) {
   // get cpu frequency info
   std::vector<float> cpu_max_freqs;
-  if (GetCPUMaxFreq(&cpu_max_freqs) == -1 || cpu_max_freqs.size() == 0) {
-    return MaceStatus::MACE_INVALID_ARGS;
+  MACE_RETURN_IF_ERROR(GetCPUMaxFreq(&cpu_max_freqs));
+  if (cpu_max_freqs.empty()) {
+    return MaceStatus::MACE_RUNTIME_ERROR;
   }

   std::vector<CPUFreq> cpu_freq(cpu_max_freqs.size());
@@ -228,6 +162,7 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
   } else {
     cores_to_use = num_threads_hint;
   }
+  MACE_CHECK(cores_to_use > 0, "number of cores to use should > 0");
   VLOG(2) << "Use " << num_threads_hint << " threads";
   std::vector<size_t> cpu_ids(cores_to_use);
@@ -236,6 +171,10 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
             << cpu_freq[i].freq;
     cpu_ids[i] = cpu_freq[i].core_id;
   }
+  SchedulePolicy sched_policy = SCHED_GUIDED;
+  if (std::abs(cpu_freq[0].freq - cpu_freq[cores_to_use - 1].freq) < 1e-6) {
+    sched_policy = SCHED_STATIC;
+  }
 #ifdef MACE_ENABLE_QUANTIZE
   if (gemm_context) {
@@ -244,7 +183,9 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
   }
 #endif  // MACE_ENABLE_QUANTIZE
-  return SetOpenMPThreadsAndAffinityCPUs(num_threads_hint, cpu_ids);
+  return SetOpenMPThreadsAndAffinityCPUs(num_threads_hint,
+                                         cpu_ids,
+                                         sched_policy);
 }
 }  // namespace mace
...
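The new scheduling-policy selection above picks static scheduling when the first and last selected cores report the same max frequency (the frequency list is sorted, so equal endpoints mean a homogeneous core set). A minimal standalone sketch of that decision, with illustrative names rather than MACE's actual types:

```cpp
#include <cassert>
#include <cmath>
#include <vector>

// Stand-ins for MACE's SchedulePolicy values (illustrative only).
enum SchedulePolicy { SCHED_STATIC, SCHED_GUIDED };

// freqs holds the max frequency of each selected core, sorted; comparing
// the first and last entries tells us whether the set is homogeneous.
SchedulePolicy ChoosePolicy(const std::vector<float> &freqs) {
  if (freqs.size() < 2 ||
      std::abs(freqs.front() - freqs.back()) < 1e-6f) {
    return SCHED_STATIC;  // equal-speed cores: split work evenly
  }
  return SCHED_GUIDED;    // mixed big/little cores: let OpenMP balance
}
```

Guided scheduling costs a little more per chunk but avoids the fast cores idling while slow cores finish their static share.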
@@ -22,7 +22,7 @@
 #include "public/gemmlowp.h"
 #endif  // MACE_ENABLE_QUANTIZE
-#include "mace/core/macros.h"
+#include "mace/utils/macros.h"
 #include "mace/public/mace.h"
 #include "mace/utils/logging.h"
@@ -52,13 +52,13 @@ class CPURuntime {
 #ifdef MACE_ENABLE_QUANTIZE
   ~CPURuntime() {
-    if (!gemm_context_) {
+    if (gemm_context_ != nullptr) {
       delete static_cast<gemmlowp::GemmContext*>(gemm_context_);
     }
   }
   gemmlowp::GemmContext *GetGemmlowpContext() {
-    if (!gemm_context_) {
+    if (gemm_context_ == nullptr) {
       gemm_context_ = new gemmlowp::GemmContext();
     }
     return static_cast<gemmlowp::GemmContext*>(gemm_context_);
...
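The destructor fix above inverts a null check: the old code deleted the gemmlowp context only when it had *not* been allocated, leaking it otherwise. A small self-contained sketch of the corrected lazy-init/teardown pattern, with generic names standing in for MACE's:

```cpp
#include <cassert>

// Stand-in for the heavyweight object (gemmlowp::GemmContext in MACE).
struct Context { int value = 42; };

class Runtime {
 public:
  ~Runtime() {
    if (context_ != nullptr) {   // delete only when allocated;
      delete context_;           // the pre-fix code had this inverted
    }
  }
  Context *GetContext() {
    if (context_ == nullptr) {
      context_ = new Context();  // allocated lazily on first request
    }
    return context_;
  }

 private:
  Context *context_ = nullptr;
};
```

Repeated calls return the same instance, so the heavy allocation happens at most once per runtime.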
@@ -15,49 +15,68 @@
 #ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_CONTROL_WRAPPER_H_
 #define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_CONTROL_WRAPPER_H_
+#include <memory>
+#include <utility>
 #include <vector>
-#include "mace/core/runtime/hexagon/quantize.h"
 #include "mace/core/tensor.h"
 #include "mace/public/mace.h"
-#include "third_party/nnlib/hexagon_nn.h"
 namespace mace {
+struct InOutInfo {
+  InOutInfo(const std::vector<index_t> &shape,
+            const DataType data_type,
+            const float scale,
+            const int32_t zero_point,
+            std::unique_ptr<Tensor> tensor_u8)
+      : shape(shape),
+        data_type(data_type),
+        scale(scale),
+        zero_point(zero_point),
+        tensor_u8(std::move(tensor_u8)) {}
+
+  std::vector<index_t> shape;
+  DataType data_type;
+  float scale;
+  int32_t zero_point;
+  std::unique_ptr<Tensor> tensor_u8;
+};
+
 class HexagonControlWrapper {
  public:
-  HexagonControlWrapper() {}
-  int GetVersion();
-  bool Config();
-  bool Init();
-  bool Finalize();
-  bool SetupGraph(const NetDef &net_def, const unsigned char *model_data);
-  bool ExecuteGraph(const Tensor &input_tensor, Tensor *output_tensor);
-  bool ExecuteGraphNew(const std::vector<Tensor *> &input_tensors,
-                       std::vector<Tensor *> *output_tensors);
-  bool TeardownGraph();
-  void PrintLog();
-  void PrintGraph();
-  void GetPerfInfo();
-  void ResetPerfInfo();
-  void SetDebugLevel(int level);
+  HexagonControlWrapper() = default;
+  virtual ~HexagonControlWrapper() = default;
+
+  virtual int GetVersion() = 0;
+  virtual bool Config() = 0;
+  virtual bool Init() = 0;
+  virtual bool Finalize() = 0;
+  virtual bool SetupGraph(const NetDef &net_def,
+                          const unsigned char *model_data) = 0;
+  virtual bool ExecuteGraph(const Tensor &input_tensor,
+                            Tensor *output_tensor) = 0;
+  virtual bool ExecuteGraphNew(const std::vector<Tensor *> &input_tensors,
+                               std::vector<Tensor *> *output_tensors) = 0;
+  virtual bool TeardownGraph() = 0;
+  virtual void PrintLog() = 0;
+  virtual void PrintGraph() = 0;
+  virtual void GetPerfInfo() = 0;
+  virtual void ResetPerfInfo() = 0;
+  virtual void SetDebugLevel(int level) = 0;

- private:
-  static constexpr int NODE_ID_OFFSET = 10000;
-  static constexpr int NUM_METADATA = 4;
+ protected:
+  static constexpr int kNodeIdOffset = 10000;
+  static constexpr int kNumMetaData = 4;

-  inline uint32_t node_id(uint32_t nodeid) { return NODE_ID_OFFSET + nodeid; }
+  inline uint32_t node_id(uint32_t nodeid) { return kNodeIdOffset + nodeid; }

   int nn_id_;
-  Quantizer quantizer_;

-  std::vector<std::vector<index_t>> input_shapes_;
-  std::vector<std::vector<index_t>> output_shapes_;
-  std::vector<DataType> input_data_types_;
-  std::vector<DataType> output_data_types_;
-  uint32_t num_inputs_;
-  uint32_t num_outputs_;
+  std::vector<InOutInfo> input_info_;
+  std::vector<InOutInfo> output_info_;
+  int num_inputs_;
+  int num_outputs_;

   MACE_DISABLE_COPY_AND_ASSIGN(HexagonControlWrapper);
 };
...
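The new InOutInfo struct carries a per-tensor scale and zero_point alongside the shape and data type. Assuming the usual affine uint8 scheme these fields describe (q = round(x / scale) + zero_point, clamped to [0, 255]), a hedged sketch of how such metadata is applied:

```cpp
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>

// Affine uint8 quantization using the scale/zero_point pair that
// InOutInfo carries per input/output tensor (illustrative helper,
// not MACE's actual quantize routine).
uint8_t Quantize(float x, float scale, int32_t zero_point) {
  int32_t q = static_cast<int32_t>(std::lround(x / scale)) + zero_point;
  return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

// The matching dequantize direction: x = (q - zero_point) * scale.
float Dequantize(uint8_t q, float scale, int32_t zero_point) {
  return (static_cast<int32_t>(q) - zero_point) * scale;
}
```

The `tensor_u8` member gives each input/output a scratch uint8 tensor to hold the quantized payload without reallocating on every run.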
@@ -15,18 +15,55 @@
 #ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DEVICE_H_
 #define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DEVICE_H_
+#include <memory>
+#include <utility>
+
 #include "mace/core/device.h"
+#include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
+#ifdef MACE_ENABLE_HEXAGON
+#include "mace/core/runtime/hexagon/hexagon_dsp_wrapper.h"
+#endif
+#ifdef MACE_ENABLE_HTA
+#include "mace/core/runtime/hexagon/hexagon_hta_wrapper.h"
+#endif
 namespace mace {
 class HexagonDevice : public CPUDevice {
  public:
-  HexagonDevice() : CPUDevice(0, AFFINITY_NONE, false) {}
+  explicit HexagonDevice(DeviceType device_type)
+      : CPUDevice(0, AFFINITY_NONE, false),
+        device_type_(device_type) {}

   DeviceType device_type() const override {
-    return DeviceType::HEXAGON;
+    return device_type_;
   };
+
+ private:
+  DeviceType device_type_;
 };
+
+std::unique_ptr<HexagonControlWrapper> CreateHexagonControlWrapper(
+    DeviceType device_type) {
+  std::unique_ptr<HexagonControlWrapper> hexagon_controller;
+  switch (device_type) {
+#ifdef MACE_ENABLE_HEXAGON
+    case HEXAGON:
+      hexagon_controller = make_unique<HexagonDSPWrapper>();
+      break;
+#endif
+#ifdef MACE_ENABLE_HTA
+    case HTA:
+      hexagon_controller = make_unique<HexagonHTAWrapper>();
+      break;
+#endif
+    default:
+      LOG(FATAL) << "Not supported Hexagon device type: " << device_type;
+  }
+  return hexagon_controller;
+}
 }  // namespace mace
 #endif  // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DEVICE_H_
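CreateHexagonControlWrapper above dispatches on the device type and returns an owning pointer to the matching concrete wrapper, with each case compiled in only when its backend is enabled. A simplified, self-contained version of the same factory shape (class and function names here are illustrative, not MACE's):

```cpp
#include <cassert>
#include <memory>
#include <string>

// Abstract interface with two concrete backends, mirroring
// HexagonControlWrapper / HexagonDSPWrapper / HexagonHTAWrapper.
struct Wrapper {
  virtual ~Wrapper() = default;
  virtual std::string Name() const = 0;
};
struct DspWrapper : Wrapper {
  std::string Name() const override { return "DSP"; }
};
struct HtaWrapper : Wrapper {
  std::string Name() const override { return "HTA"; }
};

enum DeviceType { HEXAGON, HTA };

// Factory: map a device-type tag to an owning pointer of the right subtype.
std::unique_ptr<Wrapper> CreateWrapper(DeviceType type) {
  switch (type) {
    case HEXAGON: return std::make_unique<DspWrapper>();
    case HTA:     return std::make_unique<HtaWrapper>();
  }
  return nullptr;  // unreachable for valid tags
}
```

Returning `std::unique_ptr` makes ownership explicit at the call site; callers hold the base-class pointer and never name the concrete backend.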
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_NN_OPS_H_
-#define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_NN_OPS_H_
+#ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DSP_OPS_H_
+#define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DSP_OPS_H_
 #include <string>
 #include <unordered_map>
@@ -57,4 +57,4 @@ class OpMap {
 };
 }  // namespace mace
-#endif  // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_NN_OPS_H_
+#endif  // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DSP_OPS_H_
@@ -12,26 +12,21 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include <sys/time.h>
 #include <algorithm>
 #include <iomanip>
+#include <memory>
 #include <thread>  // NOLINT(build/c++11)
 #include <vector>
 #include <unordered_map>
 #include <string>
 #include <utility>
-#include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
-#include "mace/core/runtime/hexagon/hexagon_nn_ops.h"
+#include "mace/core/runtime/hexagon/hexagon_dsp_wrapper.h"
+#include "mace/core/runtime/hexagon/hexagon_dsp_ops.h"
 #include "mace/core/types.h"
+#include "mace/port/env.h"
+#include "mace/utils/memory.h"
+#include "third_party/nnlib/hexagon_nn.h"

-namespace {
-inline int64_t NowMicros() {
-  struct timeval tv;
-  gettimeofday(&tv, nullptr);
-  return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
-}
-}
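The local gettimeofday-based NowMicros removed above is superseded by the timing helper pulled in from mace/port/env.h. An equivalent portable sketch using `<chrono>` instead of POSIX `gettimeofday`:

```cpp
#include <chrono>
#include <cstdint>

// Microseconds from a monotonic clock; unlike gettimeofday this is
// immune to wall-clock adjustments, which matters for benchmarking.
int64_t NowMicros() {
  return std::chrono::duration_cast<std::chrono::microseconds>(
             std::chrono::steady_clock::now().time_since_epoch())
      .count();
}
```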
namespace mace {
@@ -92,33 +87,33 @@ std::string FloatToString(const FloatType v, const int32_t precision) {
   }
 }  // namespace

-int HexagonControlWrapper::GetVersion() {
+int HexagonDSPWrapper::GetVersion() {
   int version;
   MACE_CHECK(hexagon_nn_version(&version) == 0, "get version error");
   return version;
 }

-bool HexagonControlWrapper::Config() {
+bool HexagonDSPWrapper::Config() {
   LOG(INFO) << "Hexagon config";
   MACE_CHECK(hexagon_nn_set_powersave_level(0) == 0, "hexagon power error");
   MACE_CHECK(hexagon_nn_config() == 0, "hexagon config error");
   return true;
 }

-bool HexagonControlWrapper::Init() {
+bool HexagonDSPWrapper::Init() {
   LOG(INFO) << "Hexagon init";
   MACE_CHECK(hexagon_nn_init(&nn_id_) == 0, "hexagon_nn_init failed");
   ResetPerfInfo();
   return true;
 }

-bool HexagonControlWrapper::Finalize() {
+bool HexagonDSPWrapper::Finalize() {
   LOG(INFO) << "Hexagon finalize";
   return hexagon_nn_set_powersave_level(1) == 0;
 }

-bool HexagonControlWrapper::SetupGraph(const NetDef &net_def,
-                                       unsigned const char *model_data) {
+bool HexagonDSPWrapper::SetupGraph(const NetDef &net_def,
+                                   unsigned const char *model_data) {
   LOG(INFO) << "Hexagon setup graph";
   int64_t t0 = NowMicros();
@@ -236,29 +231,35 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def,
   cached_outputs.clear();

   // input info
-  num_inputs_ = 0;
-  for (const InputInfo &input_info : net_def.input_info()) {
+  num_inputs_ = net_def.input_info_size();
+  input_info_.reserve(num_inputs_);
+  for (const InputOutputInfo &input_info : net_def.input_info()) {
     std::vector<index_t> input_shape(input_info.dims().begin(),
                                      input_info.dims().end());
     while (input_shape.size() < 4) {
       input_shape.insert(input_shape.begin(), 1);
     }
-    input_shapes_.push_back(input_shape);
-    input_data_types_.push_back(input_info.data_type());
-    num_inputs_ += 1;
+    input_info_.emplace_back(input_shape,
+                             input_info.data_type(),
+                             input_info.scale(),
+                             input_info.zero_point(),
+                             make_unique<Tensor>());
   }

   // output info
-  num_outputs_ = 0;
-  for (const OutputInfo &output_info : net_def.output_info()) {
+  num_outputs_ = net_def.output_info_size();
+  output_info_.reserve(num_outputs_);
+  for (const InputOutputInfo &output_info : net_def.output_info()) {
     std::vector<index_t> output_shape(output_info.dims().begin(),
                                       output_info.dims().end());
     while (output_shape.size() < 4) {
       output_shape.insert(output_shape.begin(), 1);
     }
-    output_shapes_.push_back(output_shape);
-    output_data_types_.push_back(output_info.data_type());
-    num_outputs_ += 1;
+    output_info_.emplace_back(output_shape,
+                              output_info.data_type(),
+                              output_info.scale(),
+                              output_info.zero_point(),
+                              make_unique<Tensor>());
     VLOG(1) << "OutputInfo: "
             << "\n\t shape: " << output_shape[0] << " " << output_shape[1]
             << " " << output_shape[2] << " " << output_shape[3]
@@ -276,14 +277,14 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def,
   return true;
 }
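SetupGraph pads every input and output shape to rank 4 by prepending 1s before handing it to the DSP, which expects NHWC rank-4 tensors. That padding step in isolation:

```cpp
#include <cstdint>
#include <vector>

// Promote a lower-rank shape to rank 4 by prepending size-1 dimensions,
// e.g. {224, 224, 3} -> {1, 224, 224, 3}. Mirrors the while-loops in
// SetupGraph above (standalone sketch, int64_t standing in for index_t).
std::vector<int64_t> PadToRank4(std::vector<int64_t> shape) {
  while (shape.size() < 4) {
    shape.insert(shape.begin(), 1);
  }
  return shape;
}
```

Prepending (rather than appending) keeps the channel dimension in place and only adds broadcast-size batch/height dimensions.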
-bool HexagonControlWrapper::TeardownGraph() {
+bool HexagonDSPWrapper::TeardownGraph() {
   LOG(INFO) << "Hexagon teardown graph";
   return hexagon_nn_teardown(nn_id_) == 0;
 }

 #define MACE_PRINT_BUFSIZE (2 * 1024 * 1024)

-void HexagonControlWrapper::PrintLog() {
+void HexagonDSPWrapper::PrintLog() {
   char *buf;
   if ((buf = new char[MACE_PRINT_BUFSIZE]) == NULL) return;
   MACE_CHECK(hexagon_nn_getlog(nn_id_, reinterpret_cast<unsigned char *>(buf),
@@ -293,7 +294,7 @@ void HexagonControlWrapper::PrintLog() {
   delete[] buf;
 }

-void HexagonControlWrapper::PrintGraph() {
+void HexagonDSPWrapper::PrintGraph() {
   LOG(INFO) << "Print Graph";
   char *buf;
   if ((buf = new char[MACE_PRINT_BUFSIZE]) == NULL) return;
@@ -304,13 +305,13 @@ void HexagonControlWrapper::PrintGraph() {
   delete[] buf;
 }

-void HexagonControlWrapper::SetDebugLevel(int level) {
+void HexagonDSPWrapper::SetDebugLevel(int level) {
   LOG(INFO) << "Set debug level: " << level;
   MACE_CHECK(hexagon_nn_set_debug_level(nn_id_, level) == 0,
              "set debug level error");
 }

-void HexagonControlWrapper::GetPerfInfo() {
+void HexagonDSPWrapper::GetPerfInfo() {
   LOG(INFO) << "Get perf info";
   std::vector<hexagon_nn_perfinfo> perf_info(MACE_MAX_NODE);
   unsigned int n_items = 0;
@@ -385,20 +386,20 @@ void HexagonControlWrapper::GetPerfInfo() {
   LOG(INFO) << "total duration: " << std::fixed << total_duration;
 }

-void HexagonControlWrapper::ResetPerfInfo() {
+void HexagonDSPWrapper::ResetPerfInfo() {
   LOG(INFO) << "Reset perf info";
   MACE_CHECK(hexagon_nn_reset_perfinfo(nn_id_, NN_GRAPH_PERFEVENT_UTIME) == 0,
              "reset perf error");
 }

-bool HexagonControlWrapper::ExecuteGraph(const Tensor &input_tensor,
-                                         Tensor *output_tensor) {
+bool HexagonDSPWrapper::ExecuteGraph(const Tensor &input_tensor,
+                                     Tensor *output_tensor) {
   VLOG(2) << "Execute graph: " << nn_id_;
   // single input and single output
   MACE_ASSERT(num_inputs_ == 1, "Wrong inputs num");
   MACE_ASSERT(num_outputs_ == 1, "Wrong outputs num");

-  output_tensor->SetDtype(output_data_types_[0]);
-  output_tensor->Resize(output_shapes_[0]);
+  output_tensor->SetDtype(output_info_[0].data_type);
+  output_tensor->Resize(output_info_[0].shape);
   std::vector<uint32_t> output_shape(4);
   uint32_t output_bytes;
   int res = hexagon_nn_execute(
@@ -418,10 +419,11 @@ bool HexagonControlWrapper::ExecuteGraph(const Tensor &input_tensor,
       &output_bytes);
   MACE_CHECK(res == 0, "execute error");

-  MACE_ASSERT(output_shape.size() == output_shapes_[0].size(),
+  MACE_ASSERT(output_shape.size() == output_info_[0].shape.size(),
               "wrong output shape inferred");
   for (size_t i = 0; i < output_shape.size(); ++i) {
-    MACE_ASSERT(static_cast<index_t>(output_shape[i]) == output_shapes_[0][i],
+    MACE_ASSERT(static_cast<index_t>(output_shape[i])
+                    == output_info_[0].shape[i],
                 "wrong output shape inferred");
   }
   MACE_ASSERT(output_bytes == output_tensor->raw_size(),
@@ -429,7 +431,7 @@ bool HexagonControlWrapper::ExecuteGraph(const Tensor &input_tensor,
   return res == 0;
 }
-bool HexagonControlWrapper::ExecuteGraphNew(
+bool HexagonDSPWrapper::ExecuteGraphNew(
     const std::vector<Tensor *> &input_tensors,
     std::vector<Tensor *> *output_tensors) {
   VLOG(2) << "Execute graph new: " << nn_id_;
@@ -438,14 +440,15 @@ bool HexagonControlWrapper::ExecuteGraphNew(
   MACE_ASSERT(num_inputs_ == num_inputs, "Wrong inputs num");
   MACE_ASSERT(num_outputs_ == num_outputs, "Wrong outputs num");

-  std::vector<hexagon_nn_tensordef> inputs(num_inputs * NUM_METADATA);
-  std::vector<hexagon_nn_tensordef> outputs(num_outputs * NUM_METADATA);
+  std::vector<hexagon_nn_tensordef> inputs(num_inputs * kNumMetaData);
+  std::vector<hexagon_nn_tensordef> outputs(num_outputs * kNumMetaData);
   std::vector<InputOutputMetadata> input_metadata(num_inputs);
   std::vector<InputOutputMetadata> output_metadata(num_outputs);

+  // transform mace input to hexagon input
   for (size_t i = 0; i < num_inputs; ++i) {
     std::vector<index_t> input_shape = input_tensors[i]->shape();
-    size_t index = i * NUM_METADATA;
+    size_t index = i * kNumMetaData;
     inputs[index].batches = static_cast<uint32_t>(input_shape[0]);
     inputs[index].height = static_cast<uint32_t>(input_shape[1]);
     inputs[index].width = static_cast<uint32_t>(input_shape[2]);
@@ -453,8 +456,8 @@ bool HexagonControlWrapper::ExecuteGraphNew(
     inputs[index].data = const_cast<unsigned char *>(
         reinterpret_cast<const unsigned char *>(input_tensors[i]->raw_data()));
     inputs[index].dataLen = static_cast<int>(input_tensors[i]->raw_size());
-    inputs[index].data_valid_len = static_cast<uint32_t>(
-        input_tensors[i]->raw_size());
+    inputs[index].data_valid_len =
+        static_cast<uint32_t>(input_tensors[i]->raw_size());
     inputs[index].unused = 0;
     input_metadata[i].Init(.0f, .0f, 1);
     AddInputMetadata(input_metadata[i].min_val, &inputs[index + 1]);
@@ -462,38 +465,44 @@ bool HexagonControlWrapper::ExecuteGraphNew(
     AddInputMetadata(input_metadata[i].needs_quantization, &inputs[index + 3]);
   }
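In ExecuteGraphNew each logical tensor occupies kNumMetaData (4) consecutive `hexagon_nn_tensordef` slots: the payload, then min, max, and a needs-quantization flag. The index arithmetic used in the loops, spelled out as standalone helpers (names are illustrative):

```cpp
#include <cstddef>

// Each logical input/output expands into 4 consecutive tensordef slots:
//   [data, min_val, max_val, needs_quantization]
constexpr size_t kNumMetaData = 4;

size_t DataSlot(size_t tensor_index) { return tensor_index * kNumMetaData; }
size_t MinSlot(size_t tensor_index)  { return DataSlot(tensor_index) + 1; }
size_t MaxSlot(size_t tensor_index)  { return DataSlot(tensor_index) + 2; }
size_t FlagSlot(size_t tensor_index) { return DataSlot(tensor_index) + 3; }
```

This is why the flat vectors are sized `num_inputs * kNumMetaData` and the per-tensor loops step via `index = i * kNumMetaData`.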
// transform mace output to hexagon output
for (size_t i = 0; i < num_outputs; ++i) { for (size_t i = 0; i < num_outputs; ++i) {
size_t index = i * NUM_METADATA; size_t index = i * kNumMetaData;
(*output_tensors)[i]->SetDtype(output_data_types_[i]); (*output_tensors)[i]->SetDtype(output_info_[i].data_type);
(*output_tensors)[i]->Resize(output_shapes_[i]); (*output_tensors)[i]->Resize(output_info_[i].shape);
outputs[index].data = reinterpret_cast<unsigned char *>( outputs[index].data = reinterpret_cast<unsigned char *>(
(*output_tensors)[i]->raw_mutable_data()); (*output_tensors)[i]->raw_mutable_data());
outputs[index].dataLen = static_cast<int>((*output_tensors)[i]->raw_size()); outputs[index].dataLen = static_cast<int>((*output_tensors)[i]->raw_size());
output_metadata[i].Init(.0f, .0f, 1); output_metadata[i].Init(.0f, .0f, 1);
AddOutputMetadata(output_metadata[i].min_val, &outputs[index + 1]); AddOutputMetadata(output_metadata[i].min_val, &outputs[index + 1]);
AddOutputMetadata(output_metadata[i].max_val, &outputs[index + 2]); AddOutputMetadata(output_metadata[i].max_val, &outputs[index + 2]);
AddOutputMetadata(output_metadata[i].needs_quantization, AddOutputMetadata(output_metadata[i].needs_quantization,
&outputs[index + 3]); &outputs[index + 3]);
} }
// Execute graph
int res = hexagon_nn_execute_new(nn_id_, int res = hexagon_nn_execute_new(nn_id_,
inputs.data(), inputs.data(),
num_inputs * NUM_METADATA, num_inputs * kNumMetaData,
outputs.data(), outputs.data(),
num_outputs * NUM_METADATA); num_outputs * kNumMetaData);
// handle hexagon output
for (size_t i = 0; i < num_outputs; ++i) { for (size_t i = 0; i < num_outputs; ++i) {
size_t index = i * NUM_METADATA; size_t index = i * kNumMetaData;
std::vector<uint32_t> output_shape{ std::vector<uint32_t> output_shape{
outputs[index].batches, outputs[index].height, outputs[index].width, outputs[index].batches, outputs[index].height, outputs[index].width,
outputs[index].depth}; outputs[index].depth};
MACE_ASSERT(output_shape.size() == output_shapes_[i].size(), MACE_ASSERT(output_shape.size() == output_info_[i].shape.size(),
"wrong output shape inferred"); "wrong output shape inferred");
for (size_t j = 0; j < output_shape.size(); ++j) { for (size_t j = 0; j < output_shape.size(); ++j) {
MACE_ASSERT(static_cast<index_t>(output_shape[j]) MACE_ASSERT(static_cast<index_t>(output_shape[j])
== output_shapes_[i][j], == output_info_[i].shape[j],
"wrong output shape inferred"); "wrong output shape inferred");
} }
MACE_ASSERT(static_cast<index_t>(outputs[index].data_valid_len) MACE_ASSERT(static_cast<index_t>(outputs[index].data_valid_len)
== (*output_tensors)[i]->raw_size(), == (*output_tensors)[i]->raw_size(),
"wrong output bytes inferred."); "wrong output bytes inferred.");
......
@@ -12,44 +12,40 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#ifndef MACE_CORE_RUNTIME_HEXAGON_QUANTIZE_H_
-#define MACE_CORE_RUNTIME_HEXAGON_QUANTIZE_H_
+#ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DSP_WRAPPER_H_
+#define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DSP_WRAPPER_H_

+#include <vector>
+
+#include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
 #include "mace/core/tensor.h"
-#include "mace/public/mace.h"

 namespace mace {

-class Quantizer {
+class HexagonDSPWrapper : public HexagonControlWrapper {
  public:
-  Quantizer() {}
-  ~Quantizer() {}
-
-  void Quantize(const Tensor &in_tensor,
-                Tensor *out_tensor,
-                float *min_out,
-                float *max_out);
-  void Quantize(const Tensor &in_tensor,
-                const float min_in,
-                const float max_in,
-                Tensor *out_tensor,
-                float *min_out,
-                float *max_out);
-  void DeQuantize(const Tensor &in_tensor,
-                  const float min_in,
-                  const float max_in,
-                  Tensor *out_tensor);
-
- private:
-  void QuantizeAdjustRange(float min_in,
-                           float max_in,
-                           float *min_out,
-                           float *max_out,
-                           float *stepsize,
-                           float *recip_stepsize);
-
-  MACE_DISABLE_COPY_AND_ASSIGN(Quantizer);
+  HexagonDSPWrapper() = default;
+
+  int GetVersion() override;
+  bool Config() override;
+  bool Init() override;
+  bool Finalize() override;
+  bool SetupGraph(const NetDef &net_def,
+                  const unsigned char *model_data) override;
+  bool ExecuteGraph(const Tensor &input_tensor,
+                    Tensor *output_tensor) override;
+  bool ExecuteGraphNew(const std::vector<Tensor *> &input_tensors,
+                       std::vector<Tensor *> *output_tensors) override;
+  bool TeardownGraph() override;
+  void PrintLog() override;
+  void PrintGraph() override;
+  void GetPerfInfo() override;
+  void ResetPerfInfo() override;
+  void SetDebugLevel(int level) override;
+
+  MACE_DISABLE_COPY_AND_ASSIGN(HexagonDSPWrapper);
 };

 }  // namespace mace

-#endif  // MACE_CORE_RUNTIME_HEXAGON_QUANTIZE_H_
+#endif  // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DSP_WRAPPER_H_
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_OPS_H_
#define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_OPS_H_
#include <string>
#include <unordered_map>
#include "mace/utils/logging.h"
#include "third_party/hta/hta_hexagon_nn_ops.h"
namespace mace {
class OpMap {
public:
void Init() {
#define HTA_DEF_OP(NAME) op_map_[#NAME] = HTA_OP_##NAME;
#include "third_party/hta/hta_ops.h"
#undef HTA_DEF_OP
}
hta_op_type GetOpId(const std::string &op_type) {
if (op_map_.find(op_type) != end(op_map_)) {
return op_map_[op_type];
} else {
LOG(ERROR) << "HTA unsupported op type: " << op_type;
return HTA_NN_OPS_MAX;
}
}
private:
std::unordered_map<std::string, hta_op_type> op_map_;
};
} // namespace mace
#endif // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_OPS_H_
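OpMap::GetOpId above resolves an op-type string to an HTA op id, falling back to HTA_NN_OPS_MAX for unknown types; note it uses `find()` first so a miss never inserts a new entry the way `operator[]` would. The lookup pattern in isolation (the sentinel value here is a stand-in):

```cpp
#include <string>
#include <unordered_map>

// Sentinel returned for unknown op names (stand-in for HTA_NN_OPS_MAX).
const int kOpsMax = -1;

// Lookup-with-default: find() avoids mutating the map on a miss,
// unlike op_map[op_type], which would default-insert a new entry.
int GetOpId(const std::unordered_map<std::string, int> &op_map,
            const std::string &op_type) {
  auto it = op_map.find(op_type);
  return it != op_map.end() ? it->second : kOpsMax;
}
```

Taking the map by const reference also makes the no-mutation guarantee explicit, which MACE's member version (calling `op_map_[op_type]` after the `find`) does not.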
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/runtime/hexagon/hexagon_hta_wrapper.h"
#include <algorithm>
#include <iomanip>
#include <memory>
#include <string>
#include <vector>
#include <unordered_map>
#include <utility>
#include "mace/core/runtime/hexagon/hexagon_hta_ops.h"
#include "mace/core/types.h"
#include "mace/utils/memory.h"
#include "mace/utils/quantize.h"
#include "third_party/hta/hta_hexagon_api.h"
namespace mace {
int HexagonHTAWrapper::GetVersion() {
int version;
MACE_CHECK(hexagon_hta_nn_version(&version) == 0, "get version error");
return version;
}
bool HexagonHTAWrapper::Config() {
LOG(INFO) << "HTA config";
MACE_CHECK(hexagon_hta_nn_config() == 0, "hexagon config error");
return true;
}
bool HexagonHTAWrapper::Init() {
LOG(INFO) << "Hexagon init";
MACE_CHECK(hexagon_hta_nn_init(&nn_id_) == 0, "hexagon_nn_init failed");
ResetPerfInfo();
return true;
}
bool HexagonHTAWrapper::Finalize() {
LOG(INFO) << "Hexagon finalize";
return true;
}
bool HexagonHTAWrapper::SetupGraph(const NetDef &net_def,
unsigned const char *model_data) {
LOG(INFO) << "Hexagon setup graph";
int64_t t0 = NowMicros();
// const node
for (const ConstTensor &const_tensor : net_def.tensors()) {
std::vector<int> tensor_shape(const_tensor.dims().begin(),
const_tensor.dims().end());
while (tensor_shape.size() < 4) {
tensor_shape.insert(tensor_shape.begin(), 1);
}
hexagon_nn_const_node const_node;
const_node.node_id = node_id(const_tensor.node_id());
const_node.tensor.batches = tensor_shape[0];
const_node.tensor.height = tensor_shape[1];
const_node.tensor.width = tensor_shape[2];
const_node.tensor.depth = tensor_shape[3];
if (const_tensor.data_type() == DataType::DT_INT32 &&
const_tensor.data_size() == 0) {
const_node.tensor.data = NULL;
const_node.tensor.dataLen = 0;
} else {
const_node.tensor.data =
const_cast<unsigned char *>(model_data + const_tensor.offset());
const_node.tensor.dataLen = const_tensor.data_size() *
GetEnumTypeSize(const_tensor.data_type());
}
hexagon_hta_nn_append_const_node(nn_id_,
const_node.node_id,
const_node.tensor.batches,
const_node.tensor.height,
const_node.tensor.width,
const_node.tensor.depth,
const_node.tensor.data,
const_node.tensor.dataLen);
}
// op node
OpMap op_map;
op_map.Init();
std::vector<std::vector<hexagon_hta_nn_input>> cached_inputs;
std::vector<std::vector<hexagon_hta_nn_output>> cached_outputs;
std::vector<hexagon_hta_nn_input> inputs;
std::vector<hexagon_hta_nn_output> outputs;
for (const OperatorDef &op : net_def.op()) {
hta_op_type op_id = op_map.GetOpId(op.type());
inputs.resize(op.node_input().size());
for (int i = 0; i < op.node_input().size(); ++i) {
inputs[i].src_id = node_id(op.node_input()[i].node_id());
inputs[i].output_idx = op.node_input()[i].output_port();
}
outputs.resize(op.output_shape().size());
for (int i = 0; i < op.output_shape().size(); ++i) {
outputs[i].rank = op.output_shape()[i].dims().size();
for (size_t j = 0; j < outputs[i].rank; ++j) {
outputs[i].max_sizes[j] = op.output_shape()[i].dims()[j];
}
if (outputs[i].rank == 0) {
outputs[i].rank = 1;
outputs[i].max_sizes[0] = 1;
}
outputs[i].max_sizes[outputs[i].rank] = 0;
outputs[i].elementsize = GetEnumTypeSize(
static_cast<DataType>(op.output_type()[i]));
outputs[i].zero_offset = 0;
outputs[i].stepsize = 0;
}
cached_inputs.push_back(inputs);
cached_outputs.push_back(outputs);
auto padding_type = static_cast<hta_padding_type>(op.padding());
hexagon_nn_op_node op_node;
op_node.node_id = node_id(op.node_id());
op_node.operation = op_id;
op_node.padding = padding_type;
op_node.inputs = cached_inputs.back().data();
op_node.inputsLen = inputs.size();
op_node.outputs = cached_outputs.back().data();
op_node.outputsLen = outputs.size();
hexagon_hta_nn_append_node(nn_id_,
op_node.node_id,
op_node.operation,
op_node.padding,
op_node.inputs,
op_node.inputsLen,
op_node.outputs,
op_node.outputsLen);
}
// input info
num_inputs_ = net_def.input_info_size();
input_info_.reserve(num_inputs_);
for (const InputOutputInfo &input_info : net_def.input_info()) {
std::vector<index_t> input_shape(input_info.dims().begin(),
input_info.dims().end());
while (input_shape.size() < 4) {
input_shape.insert(input_shape.begin(), 1);
}
input_info_.emplace_back(input_shape,
input_info.data_type(),
input_info.scale(),
input_info.zero_point(),
make_unique<Tensor>());
}
// output info
num_outputs_ = net_def.output_info_size();
output_info_.reserve(num_outputs_);
for (const InputOutputInfo &output_info : net_def.output_info()) {
std::vector<index_t> output_shape(output_info.dims().begin(),
output_info.dims().end());
while (output_shape.size() < 4) {
output_shape.insert(output_shape.begin(), 1);
}
output_info_.emplace_back(output_shape,
output_info.data_type(),
output_info.scale(),
output_info.zero_point(),
make_unique<Tensor>());
VLOG(1) << "OutputInfo: "
<< "\n\t shape: " << output_shape[0] << " " << output_shape[1]
<< " " << output_shape[2] << " " << output_shape[3]
<< "\n\t type: " << output_info.data_type();
}
int64_t t1 = NowMicros();
  MACE_CHECK(hexagon_hta_nn_prepare(nn_id_) == 0,
             "hexagon_hta_nn_prepare failed");
int64_t t2 = NowMicros();
VLOG(1) << "Setup time: " << t1 - t0 << " " << t2 - t1;
return true;
}
bool HexagonHTAWrapper::TeardownGraph() {
LOG(INFO) << "Hexagon teardown graph";
return hexagon_hta_nn_teardown(nn_id_) == 0;
}
void HexagonHTAWrapper::PrintLog() {
LOG(INFO) << "Print Log";
}
void HexagonHTAWrapper::PrintGraph() {
LOG(INFO) << "Print Graph";
}
void HexagonHTAWrapper::SetDebugLevel(int level) {
LOG(INFO) << "Set debug level: " << level;
MACE_CHECK(hexagon_hta_nn_set_debug_level(nn_id_, level) == 0,
"set debug level error");
}
void HexagonHTAWrapper::GetPerfInfo() {
LOG(INFO) << "Get perf info";
}
void HexagonHTAWrapper::ResetPerfInfo() {
LOG(INFO) << "Reset perf info";
}
bool HexagonHTAWrapper::ExecuteGraph(const Tensor &input_tensor,
Tensor *output_tensor) {
MACE_UNUSED(input_tensor);
MACE_UNUSED(output_tensor);
MACE_NOT_IMPLEMENTED;
return false;
}
bool HexagonHTAWrapper::ExecuteGraphNew(
const std::vector<Tensor *> &input_tensors,
std::vector<Tensor *> *output_tensors) {
VLOG(2) << "Execute graph new: " << nn_id_;
uint32_t num_inputs = static_cast<uint32_t>(input_tensors.size());
uint32_t num_outputs = static_cast<uint32_t>(output_tensors->size());
MACE_ASSERT(num_inputs_ == num_inputs, "Wrong inputs num");
MACE_ASSERT(num_outputs_ == num_outputs, "Wrong outputs num");
std::vector<hexagon_hta_nn_tensordef> inputs(num_inputs);
std::vector<hexagon_hta_nn_tensordef> outputs(num_outputs);
for (size_t i = 0; i < num_inputs; ++i) {
std::vector<index_t> input_shape = input_tensors[i]->shape();
inputs[i].batches = static_cast<uint32_t>(input_shape[0]);
inputs[i].height = static_cast<uint32_t>(input_shape[1]);
inputs[i].width = static_cast<uint32_t>(input_shape[2]);
inputs[i].depth = static_cast<uint32_t>(input_shape[3]);
input_info_[i].tensor_u8->SetDtype(DT_UINT8);
input_info_[i].tensor_u8->Resize(input_shape);
const float *input_data = input_tensors[i]->data<float>();
uint8_t *input_data_u8 = input_info_[i].tensor_u8->mutable_data<uint8_t>();
QuantizeWithScaleAndZeropoint(input_data,
input_tensors[i]->size(),
input_info_[i].scale,
input_info_[i].zero_point,
input_data_u8);
inputs[i].data = const_cast<unsigned char *>(
reinterpret_cast<const unsigned char *>(
input_info_[i].tensor_u8->raw_data()));
inputs[i].dataLen = static_cast<int>(input_info_[i].tensor_u8->raw_size());
inputs[i].data_valid_len = static_cast<uint32_t>(
input_info_[i].tensor_u8->raw_size());
inputs[i].unused = 0;
}
for (size_t i = 0; i < num_outputs; ++i) {
(*output_tensors)[i]->SetDtype(output_info_[i].data_type);
(*output_tensors)[i]->Resize(output_info_[i].shape);
output_info_[i].tensor_u8->SetDtype(DT_UINT8);
output_info_[i].tensor_u8->Resize(output_info_[i].shape);
outputs[i].data = reinterpret_cast<unsigned char *>(
output_info_[i].tensor_u8->raw_mutable_data());
outputs[i].dataLen =
static_cast<int>(output_info_[i].tensor_u8->raw_size());
}
int res = hexagon_hta_nn_execute_new(nn_id_,
inputs.data(),
num_inputs,
outputs.data(),
num_outputs);
for (size_t i = 0; i < num_outputs; ++i) {
std::vector<uint32_t> output_shape{
outputs[i].batches, outputs[i].height, outputs[i].width,
outputs[i].depth};
MACE_ASSERT(output_shape.size() == output_info_[i].shape.size(),
"wrong output shape inferred");
for (size_t j = 0; j < output_shape.size(); ++j) {
MACE_ASSERT(static_cast<index_t>(output_shape[j])
== output_info_[i].shape[j],
"wrong output shape inferred");
}
MACE_ASSERT(static_cast<index_t>(outputs[i].data_valid_len)
== (*output_tensors)[i]->raw_size(),
"wrong output bytes inferred.");
const uint8_t *output_data_u8 = output_info_[i].tensor_u8->data<uint8_t>();
float *output_data = (*output_tensors)[i]->mutable_data<float>();
Dequantize(output_data_u8,
output_info_[i].tensor_u8->size(),
output_info_[i].scale,
output_info_[i].zero_point,
output_data);
}
return res == 0;
}
} // namespace mace
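`ExecuteGraphNew` above quantizes each float input to uint8 with a precomputed scale/zero point before handing it to the DSP, and dequantizes the uint8 outputs on the way back. A minimal standalone sketch of that pair of conversions follows; the names `QuantizeBuffer`/`DequantizeBuffer` are illustrative, not MACE's actual `QuantizeWithScaleAndZeropoint`/`Dequantize` signatures:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Affine quantization as used for HTA I/O: q = round(x / scale) + zero_point,
// saturated to the uint8 range.
std::vector<uint8_t> QuantizeBuffer(const std::vector<float> &in,
                                    float scale, int32_t zero_point) {
  std::vector<uint8_t> out(in.size());
  for (size_t i = 0; i < in.size(); ++i) {
    float q = std::round(in[i] / scale) + static_cast<float>(zero_point);
    out[i] = static_cast<uint8_t>(std::max(0.0f, std::min(255.0f, q)));
  }
  return out;
}

// Inverse mapping: x = scale * (q - zero_point).
std::vector<float> DequantizeBuffer(const std::vector<uint8_t> &in,
                                    float scale, int32_t zero_point) {
  std::vector<float> out(in.size());
  for (size_t i = 0; i < in.size(); ++i) {
    out[i] = scale * (static_cast<int32_t>(in[i]) - zero_point);
  }
  return out;
}
```

The round trip is lossy only up to half a quantization step, which is why the wrapper can validate float outputs against a float reference within a tolerance.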
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_WRAPPER_H_
#define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_WRAPPER_H_
#include <vector>
#include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
#include "mace/core/tensor.h"
#include "mace/public/mace.h"
namespace mace {
class HexagonHTAWrapper : public HexagonControlWrapper {
public:
HexagonHTAWrapper() = default;
int GetVersion() override;
bool Config() override;
bool Init() override;
bool Finalize() override;
bool SetupGraph(const NetDef &net_def,
const unsigned char *model_data) override;
bool ExecuteGraph(const Tensor &input_tensor,
Tensor *output_tensor) override;
bool ExecuteGraphNew(const std::vector<Tensor *> &input_tensors,
std::vector<Tensor *> *output_tensors) override;
bool TeardownGraph() override;
void PrintLog() override;
void PrintGraph() override;
void GetPerfInfo() override;
void ResetPerfInfo() override;
void SetDebugLevel(int level) override;
MACE_DISABLE_COPY_AND_ASSIGN(HexagonHTAWrapper);
};
} // namespace mace
#endif // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_WRAPPER_H_
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <cmath>

#include "mace/core/runtime/hexagon/quantize.h"
namespace mace {
void Quantizer::Quantize(const Tensor &in_tensor,
Tensor *out_tensor,
float *min_out,
float *max_out) {
if (in_tensor.size() == 0) return;
const float *in_data = in_tensor.data<float>();
float min_in = in_data[0];
float max_in = in_data[0];
for (index_t i = 0; i < in_tensor.size(); ++i) {
min_in = std::min(min_in, in_data[i]);
max_in = std::max(max_in, in_data[i]);
}
Quantize(in_tensor, min_in, max_in, out_tensor, min_out, max_out);
}
void Quantizer::Quantize(const Tensor &in_tensor,
const float min_in,
const float max_in,
Tensor *out_tensor,
float *min_out,
float *max_out) {
float stepsize;
float recip_stepsize;
QuantizeAdjustRange(min_in, max_in, min_out, max_out, &stepsize,
&recip_stepsize);
const float *in = in_tensor.data<float>();
uint8_t *out = out_tensor->mutable_data<uint8_t>();
  for (index_t i = 0; i < in_tensor.size(); ++i) {
    const float inval = in[i];
    // Keep the intermediate value in float: casting to uint8_t before the
    // clamp below would wrap out-of-range values instead of saturating them.
    float ival = (inval - *min_out) * recip_stepsize + 0.5f;
    if (ival < 0) ival = 0;
    if (ival > 255) ival = 255;
    out[i] = static_cast<uint8_t>(ival);
  }
}
void Quantizer::QuantizeAdjustRange(float min_in,
float max_in,
float *min_out,
float *max_out,
float *stepsize_out,
float *recip_stepsize_out) {
float minval = std::min(0.0f, min_in);
float maxval = std::max(0.0f, max_in);
float range = std::max(0.0001f, maxval - minval);
float recip_stepsize = 255.0f / range;
// make z(q0) integer
if (minval < 0.0f) {
float z = -minval * recip_stepsize;
float zi = floorf(z);
float zf = z - zi;
if (zf > 0.0001f && zf < 0.9999f) {
if (zi > 0.0f && (zi >= 254.0f || (zf - 1.0f) * minval > zf * maxval)) {
range = -255.0f * minval / zi;
maxval = minval + range;
} else {
range = 255.0f * maxval / (254.0f - zi);
minval = maxval - range;
}
recip_stepsize = 255.0f / range;
}
}
*min_out = minval;
*max_out = maxval;
*stepsize_out = range / 255.0f;
*recip_stepsize_out = recip_stepsize;
}
void Quantizer::DeQuantize(const Tensor &in_tensor,
const float min_in,
const float max_in,
Tensor *out_tensor) {
float range = std::max(0.0001f, max_in - min_in);
float stepsize = range / 255.0f;
const uint8_t *in = in_tensor.data<uint8_t>();
float *out = out_tensor->mutable_data<float>();
  for (index_t i = 0; i < out_tensor->size(); ++i) {
out[i] = (in[i] * stepsize) + min_in;
}
}
} // namespace mace
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
#include <vector> #include <vector>
#include <utility> #include <utility>
#include "mace/core/macros.h" #include "mace/utils/macros.h"
#include "mace/core/kv_storage.h" #include "mace/core/kv_storage.h"
#include "mace/core/runtime/opencl/opencl_extension.h" #include "mace/core/runtime/opencl/opencl_extension.h"
#include "mace/utils/tuner.h" #include "mace/utils/tuner.h"
...@@ -273,7 +273,7 @@ OpenCLRuntime::OpenCLRuntime( ...@@ -273,7 +273,7 @@ OpenCLRuntime::OpenCLRuntime(
gpu_type_(UNKNOWN) { gpu_type_(UNKNOWN) {
std::vector<cl::Platform> all_platforms; std::vector<cl::Platform> all_platforms;
cl::Platform::get(&all_platforms); cl::Platform::get(&all_platforms);
if (all_platforms.size() == 0) { if (all_platforms.empty()) {
LOG(ERROR) << "No OpenCL platforms found"; LOG(ERROR) << "No OpenCL platforms found";
return; return;
} }
...@@ -289,7 +289,7 @@ OpenCLRuntime::OpenCLRuntime( ...@@ -289,7 +289,7 @@ OpenCLRuntime::OpenCLRuntime(
// get default device (CPUs, GPUs) of the default platform // get default device (CPUs, GPUs) of the default platform
std::vector<cl::Device> all_devices; std::vector<cl::Device> all_devices;
default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices); default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
if (all_devices.size() == 0) { if (all_devices.empty()) {
LOG(ERROR) << "No OpenCL devices found"; LOG(ERROR) << "No OpenCL devices found";
return; return;
} }
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include <utility> #include <utility>
#include "mace/utils/logging.h" #include "mace/utils/logging.h"
#include "mace/utils/math.h"
namespace mace { namespace mace {
...@@ -151,8 +152,9 @@ std::shared_ptr<OperatorDef> OpenCLUtil::CreateTransformOpDef( ...@@ -151,8 +152,9 @@ std::shared_ptr<OperatorDef> OpenCLUtil::CreateTransformOpDef(
const std::vector<mace::index_t> &input_shape, const std::vector<mace::index_t> &input_shape,
const std::string &output_name, const std::string &output_name,
const mace::DataType dt, const mace::DataType dt,
const OpenCLBufferType buffer_type,
const mace::MemoryType mem_type, const mace::MemoryType mem_type,
const DataFormat data_format) { bool has_data_format) {
std::unique_ptr<OperatorDef> op(new OperatorDef); std::unique_ptr<OperatorDef> op(new OperatorDef);
std::string op_name = "mace_node_" + output_name; std::string op_name = "mace_node_" + output_name;
op->set_name(op_name); op->set_name(op_name);
...@@ -161,7 +163,7 @@ std::shared_ptr<OperatorDef> OpenCLUtil::CreateTransformOpDef( ...@@ -161,7 +163,7 @@ std::shared_ptr<OperatorDef> OpenCLUtil::CreateTransformOpDef(
op->add_output(output_name); op->add_output(output_name);
Argument *arg = op->add_arg(); Argument *arg = op->add_arg();
arg->set_name("buffer_type"); arg->set_name("buffer_type");
arg->set_i(static_cast<int32_t>(OpenCLBufferType::IN_OUT_CHANNEL)); arg->set_i(static_cast<int32_t>(buffer_type));
arg = op->add_arg(); arg = op->add_arg();
arg->set_name("mem_type"); arg->set_name("mem_type");
arg->set_i(static_cast<int32_t>(mem_type)); arg->set_i(static_cast<int32_t>(mem_type));
...@@ -169,8 +171,8 @@ std::shared_ptr<OperatorDef> OpenCLUtil::CreateTransformOpDef( ...@@ -169,8 +171,8 @@ std::shared_ptr<OperatorDef> OpenCLUtil::CreateTransformOpDef(
arg->set_name("T"); arg->set_name("T");
arg->set_i(static_cast<int32_t>(dt)); arg->set_i(static_cast<int32_t>(dt));
arg = op->add_arg(); arg = op->add_arg();
arg->set_name("data_format"); arg->set_name("has_data_format");
arg->set_i(data_format); arg->set_i(has_data_format);
if (!input_shape.empty()) { if (!input_shape.empty()) {
OutputShape *shape = op->add_output_shape(); OutputShape *shape = op->add_output_shape();
for (auto value : input_shape) { for (auto value : input_shape) {
......
...@@ -48,8 +48,9 @@ class OpenCLUtil { ...@@ -48,8 +48,9 @@ class OpenCLUtil {
const std::vector<mace::index_t> &input_shape, const std::vector<mace::index_t> &input_shape,
const std::string &output_name, const std::string &output_name,
const mace::DataType dt, const mace::DataType dt,
const OpenCLBufferType buffer_type,
const MemoryType mem_type, const MemoryType mem_type,
const DataFormat data_format); bool has_data_format);
}; };
} // namespace mace } // namespace mace
......
...@@ -97,8 +97,6 @@ inline std::ostream &operator<<(std::ostream &os, unsigned char c) { ...@@ -97,8 +97,6 @@ inline std::ostream &operator<<(std::ostream &os, unsigned char c) {
} }
} // namespace numerical_chars } // namespace numerical_chars
enum FilterDataFormat { HWOI = 100, OIHW = 101, HWIO = 102, OHWI = 103 };
class Tensor { class Tensor {
public: public:
Tensor(Allocator *alloc, DataType type, Tensor(Allocator *alloc, DataType type,
...@@ -304,10 +302,14 @@ class Tensor { ...@@ -304,10 +302,14 @@ class Tensor {
if (buffer_ != nullptr) { if (buffer_ != nullptr) {
MACE_CHECK(!has_opencl_image(), MACE_CHECK(!has_opencl_image(),
name_, ": Cannot resize image, use ResizeImage."); name_, ": Cannot resize image, use ResizeImage.");
if (raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE > buffer_->size()) { const index_t apply_size = raw_size()
+ ((buffer_ != &buffer_slice_) ? MACE_EXTRA_BUFFER_PAD_SIZE : 0);
if (apply_size > buffer_->size()) {
LOG(WARNING) << name_ << ": Resize buffer from size " << buffer_->size() LOG(WARNING) << name_ << ": Resize buffer from size " << buffer_->size()
<< " to " << raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE; << " to " << apply_size;
return buffer_->Resize(raw_size() + MACE_EXTRA_BUFFER_PAD_SIZE); MACE_CHECK(buffer_ != &buffer_slice_,
": Cannot resize tensor with buffer slice");
return buffer_->Resize(apply_size);
} }
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} else { } else {
......
...@@ -20,7 +20,7 @@ ...@@ -20,7 +20,7 @@
#include <vector> #include <vector>
#include "mace/core/testing/test_benchmark.h" #include "mace/core/testing/test_benchmark.h"
#include "mace/utils/env_time.h" #include "mace/port/env.h"
#include "mace/utils/logging.h" #include "mace/utils/logging.h"
namespace mace { namespace mace {
......
...@@ -68,7 +68,7 @@ const Tensor *Workspace::GetTensor(const std::string &name) const { ...@@ -68,7 +68,7 @@ const Tensor *Workspace::GetTensor(const std::string &name) const {
if (tensor_map_.count(name)) { if (tensor_map_.count(name)) {
return tensor_map_.at(name).get(); return tensor_map_.at(name).get();
} else { } else {
LOG(WARNING) << "Tensor " << name << " does not exist."; VLOG(1) << "Tensor " << name << " does not exist.";
} }
return nullptr; return nullptr;
} }
...@@ -264,31 +264,35 @@ MaceStatus Workspace::PreallocateOutputTensor( ...@@ -264,31 +264,35 @@ MaceStatus Workspace::PreallocateOutputTensor(
bool is_quantize_model = IsQuantizedModel(net_def); bool is_quantize_model = IsQuantizedModel(net_def);
for (auto &tensor_mem : mem_optimizer->tensor_mem_map()) { for (auto &tensor_mem : mem_optimizer->tensor_mem_map()) {
std::unique_ptr<Tensor> tensor std::unique_ptr<Tensor> tensor
(new Tensor(preallocated_allocator_.GetBuffer(tensor_mem.second.first), (new Tensor(preallocated_allocator_.GetBuffer(tensor_mem.second.mem_id),
tensor_mem.second.second, tensor_mem.second.data_type,
false, tensor_mem.first)); false, tensor_mem.first));
if (mem_blocks[tensor_mem.second.first].mem_type() if (tensor_mem.second.has_data_format) {
== MemoryType::GPU_IMAGE) { if (mem_blocks[tensor_mem.second.mem_id].mem_type()
VLOG(1) << "Tensor: " << tensor_mem.first == MemoryType::GPU_IMAGE) {
<< " Mem: " << tensor_mem.second.first VLOG(1) << "Tensor: " << tensor_mem.first
<< " Data type: " << tensor->dtype() << " Mem: " << tensor_mem.second.mem_id
<< " Image shape: " << " Data type: " << tensor->dtype()
<< tensor->UnderlyingBuffer()->shape()[0] << " Image shape: "
<< ", " << tensor->UnderlyingBuffer()->shape()[0]
<< tensor->UnderlyingBuffer()->shape()[1]; << ", "
tensor->set_data_format(DataFormat::NHWC); << tensor->UnderlyingBuffer()->shape()[1];
} else {
VLOG(1) << "Tensor: " << tensor_mem.first
<< " Mem: " << tensor_mem.second.first
<< " Data type: " << tensor->dtype()
<< ", Buffer size: " << tensor->UnderlyingBuffer()->size();
if (mem_blocks[tensor_mem.second.first].mem_type()
== MemoryType::GPU_BUFFER ||
is_quantize_model) {
tensor->set_data_format(DataFormat::NHWC); tensor->set_data_format(DataFormat::NHWC);
} else { } else {
tensor->set_data_format(DataFormat::NCHW); VLOG(1) << "Tensor: " << tensor_mem.first
<< " Mem: " << tensor_mem.second.mem_id
<< " Data type: " << tensor->dtype()
<< ", Buffer size: " << tensor->UnderlyingBuffer()->size();
if (mem_blocks[tensor_mem.second.mem_id].mem_type()
== MemoryType::GPU_BUFFER ||
is_quantize_model) {
tensor->set_data_format(DataFormat::NHWC);
} else {
tensor->set_data_format(DataFormat::NCHW);
}
} }
} else {
tensor->set_data_format(DataFormat::DF_NONE);
} }
tensor_map_[tensor_mem.first] = std::move(tensor); tensor_map_[tensor_mem.first] = std::move(tensor);
} }
......
...@@ -5,7 +5,7 @@ How to build ...@@ -5,7 +5,7 @@ How to build
--------------- ---------------
```sh ```sh
cd mace/exampls/android cd mace/examples/android
./build.sh dynamic ./build.sh dynamic
# if libmace.a is needed, update `macelibrary/CMakeLists.txt` and run with `./build.sh static`. # if libmace.a is needed, update `macelibrary/CMakeLists.txt` and run with `./build.sh static`.
``` ```
......
...@@ -3,6 +3,7 @@ load( ...@@ -3,6 +3,7 @@ load(
"//mace:mace.bzl", "//mace:mace.bzl",
"if_android", "if_android",
"if_hexagon_enabled", "if_hexagon_enabled",
"if_hta_enabled",
"if_opencl_enabled", "if_opencl_enabled",
"if_openmp_enabled", "if_openmp_enabled",
) )
...@@ -33,8 +34,11 @@ cc_binary( ...@@ -33,8 +34,11 @@ cc_binary(
"//mace/codegen:generated_libmace", "//mace/codegen:generated_libmace",
"//mace/codegen:generated_opencl_binary", "//mace/codegen:generated_opencl_binary",
"//mace/codegen:generated_opencl_parameter", "//mace/codegen:generated_opencl_parameter",
"//mace/utils:utils_hdrs",
] + if_hexagon_enabled([ ] + if_hexagon_enabled([
"//third_party/nnlib:libhexagon", "//third_party/nnlib:libhexagon",
]) + if_hta_enabled([
"//third_party/hta",
]), ]),
) )
...@@ -63,5 +67,6 @@ cc_binary( ...@@ -63,5 +67,6 @@ cc_binary(
"//mace/codegen:generated_mace_engine_factory", "//mace/codegen:generated_mace_engine_factory",
"//mace/codegen:generated_opencl_binary", "//mace/codegen:generated_opencl_binary",
"//mace/codegen:generated_opencl_parameter", "//mace/codegen:generated_opencl_parameter",
"//mace/utils:utils_hdrs",
], ],
) )
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include <dirent.h> #include <dirent.h>
#include <errno.h>
#include <fcntl.h> #include <fcntl.h>
#include <malloc.h> #include <malloc.h>
#include <sys/mman.h> #include <sys/mman.h>
...@@ -26,7 +27,11 @@ ...@@ -26,7 +27,11 @@
#include <numeric> #include <numeric>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "mace/port/env.h"
#include "mace/port/file_system.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/utils/logging.h"
#include "mace/utils/string_util.h"
// if convert model to code. // if convert model to code.
#ifdef MODEL_GRAPH_FORMAT_CODE #ifdef MODEL_GRAPH_FORMAT_CODE
#include "mace/codegen/engine/mace_engine_factory.h" #include "mace/codegen/engine/mace_engine_factory.h"
...@@ -45,97 +50,6 @@ size_t OpenCLParameterSize(); ...@@ -45,97 +50,6 @@ size_t OpenCLParameterSize();
namespace mace { namespace mace {
namespace examples { namespace examples {
namespace str_util {
std::vector<std::string> Split(const std::string &str, char delims) {
std::vector<std::string> result;
std::string tmp = str;
while (!tmp.empty()) {
size_t next_offset = tmp.find(delims);
result.push_back(tmp.substr(0, next_offset));
if (next_offset == std::string::npos) {
break;
} else {
tmp = tmp.substr(next_offset + 1);
}
}
return result;
}
} // namespace str_util
namespace {
bool ReadBinaryFile(std::vector<unsigned char> *data,
const std::string &filename) {
std::ifstream ifs(filename, std::ios::in | std::ios::binary);
if (!ifs.is_open()) {
return false;
}
ifs.seekg(0, ifs.end);
size_t length = ifs.tellg();
ifs.seekg(0, ifs.beg);
data->reserve(length);
data->insert(data->begin(), std::istreambuf_iterator<char>(ifs),
std::istreambuf_iterator<char>());
if (ifs.fail()) {
return false;
}
ifs.close();
return true;
}
bool MemoryMap(const std::string &file,
const unsigned char **data,
size_t *size) {
bool ret = true;
int fd = open(file.c_str(), O_RDONLY);
if (fd < 0) {
std::cerr << "Failed to open file " << file
<< ", error code: " << strerror(errno) << std::endl;
ret = false;
}
struct stat st;
fstat(fd, &st);
*size = static_cast<size_t>(st.st_size);
*data = static_cast<const unsigned char *>(
mmap(nullptr, *size, PROT_READ, MAP_PRIVATE, fd, 0));
if (*data == static_cast<const unsigned char *>(MAP_FAILED)) {
std::cerr << "Failed to map file " << file
<< ", error code: " << strerror(errno) << std::endl;
ret = false;
}
if (close(fd) < 0) {
std::cerr << "Failed to close file " << file
<< ", error code: " << strerror(errno) << std::endl;
ret = false;
}
return ret;
}
bool MemoryUnMap(const unsigned char *data,
const size_t &size) {
bool ret = true;
if (data == nullptr || size == 0) {
std::cerr << "data is null or size is 0" << std::endl;
ret = false;
}
if (munmap(const_cast<unsigned char *>(data), size) < 0) {
std::cerr << "Failed to unmap file, error code: "
<< strerror(errno) << std::endl;
ret = false;
}
return ret;
}
} // namespace
void ParseShape(const std::string &str, std::vector<int64_t> *shape) { void ParseShape(const std::string &str, std::vector<int64_t> *shape) {
std::string tmp = str; std::string tmp = str;
while (!tmp.empty()) { while (!tmp.empty()) {
...@@ -165,11 +79,24 @@ DeviceType ParseDeviceType(const std::string &device_str) { ...@@ -165,11 +79,24 @@ DeviceType ParseDeviceType(const std::string &device_str) {
return DeviceType::GPU; return DeviceType::GPU;
} else if (device_str.compare("HEXAGON") == 0) { } else if (device_str.compare("HEXAGON") == 0) {
return DeviceType::HEXAGON; return DeviceType::HEXAGON;
} else if (device_str.compare("HTA") == 0) {
return DeviceType::HTA;
} else { } else {
return DeviceType::CPU; return DeviceType::CPU;
} }
} }
DataFormat ParseDataFormat(const std::string &data_format_str) {
if (data_format_str == "NHWC") {
return DataFormat::NHWC;
} else if (data_format_str == "NCHW") {
return DataFormat::NCHW;
} else if (data_format_str == "OIHW") {
return DataFormat::OIHW;
} else {
return DataFormat::DF_NONE;
}
}
DEFINE_string(model_name, DEFINE_string(model_name,
"", "",
...@@ -186,6 +113,12 @@ DEFINE_string(output_node, ...@@ -186,6 +113,12 @@ DEFINE_string(output_node,
DEFINE_string(output_shape, DEFINE_string(output_shape,
"1,224,224,2:1,1,1,10", "1,224,224,2:1,1,1,10",
"output shapes, separated by colon and comma"); "output shapes, separated by colon and comma");
DEFINE_string(input_data_format,
"NHWC",
"input data formats, NONE|NHWC|NCHW");
DEFINE_string(output_data_format,
"NHWC",
"output data formats, NONE|NHWC|NCHW");
DEFINE_string(input_file, DEFINE_string(input_file,
"", "",
"input file name | input file prefix for multiple inputs."); "input file name | input file prefix for multiple inputs.");
...@@ -222,8 +155,10 @@ DEFINE_int32(cpu_affinity_policy, 1, ...@@ -222,8 +155,10 @@ DEFINE_int32(cpu_affinity_policy, 1,
bool RunModel(const std::vector<std::string> &input_names, bool RunModel(const std::vector<std::string> &input_names,
const std::vector<std::vector<int64_t>> &input_shapes, const std::vector<std::vector<int64_t>> &input_shapes,
const std::vector<DataFormat> &input_data_formats,
const std::vector<std::string> &output_names, const std::vector<std::string> &output_names,
const std::vector<std::vector<int64_t>> &output_shapes) { const std::vector<std::vector<int64_t>> &output_shapes,
const std::vector<DataFormat> &output_data_formats) {
// load model // load model
DeviceType device_type = ParseDeviceType(FLAGS_device); DeviceType device_type = ParseDeviceType(FLAGS_device);
// configuration // configuration
...@@ -266,16 +201,26 @@ bool RunModel(const std::vector<std::string> &input_names, ...@@ -266,16 +201,26 @@ bool RunModel(const std::vector<std::string> &input_names,
std::shared_ptr<mace::MaceEngine> engine; std::shared_ptr<mace::MaceEngine> engine;
MaceStatus create_engine_status; MaceStatus create_engine_status;
std::vector<unsigned char> model_graph_data; std::unique_ptr<mace::port::ReadOnlyMemoryRegion> model_graph_data;
if (!ReadBinaryFile(&model_graph_data, FLAGS_model_file)) { if (FLAGS_model_file != "") {
std::cerr << "Failed to read file: " << FLAGS_model_file << std::endl; auto fs = GetFileSystem();
auto status = fs->NewReadOnlyMemoryRegionFromFile(FLAGS_model_file.c_str(),
&model_graph_data);
if (status != MaceStatus::MACE_SUCCESS) {
LOG(FATAL) << "Failed to read file: " << FLAGS_model_file;
}
} }
const unsigned char *model_weights_data = nullptr;
size_t model_weights_data_size = 0; std::unique_ptr<mace::port::ReadOnlyMemoryRegion> model_weights_data;
if (!MemoryMap(FLAGS_model_data_file, if (FLAGS_model_data_file != "") {
&model_weights_data, auto fs = GetFileSystem();
&model_weights_data_size)) { auto status = fs->NewReadOnlyMemoryRegionFromFile(
std::cerr << "Failed to read file: " << FLAGS_model_data_file << std::endl; FLAGS_model_data_file.c_str(),
&model_weights_data);
if (status != MaceStatus::MACE_SUCCESS) {
LOG(FATAL) << "Failed to read file: " << FLAGS_model_data_file;
}
MACE_CHECK(model_weights_data->length() > 0);
} }
// Only choose one of the two type based on the `model_graph_format` // Only choose one of the two type based on the `model_graph_format`
...@@ -283,24 +228,24 @@ bool RunModel(const std::vector<std::string> &input_names, ...@@ -283,24 +228,24 @@ bool RunModel(const std::vector<std::string> &input_names,
#ifdef MODEL_GRAPH_FORMAT_CODE #ifdef MODEL_GRAPH_FORMAT_CODE
// if model_data_format == code, just pass an empty string("") // if model_data_format == code, just pass an empty string("")
// to model_data_file parameter. // to model_data_file parameter.
create_engine_status = create_engine_status = CreateMaceEngineFromCode(
CreateMaceEngineFromCode(FLAGS_model_name, FLAGS_model_name,
model_weights_data, reinterpret_cast<const unsigned char *>(model_weights_data->data()),
model_weights_data_size, model_weights_data->length(),
input_names, input_names,
output_names, output_names,
config, config,
&engine); &engine);
#else #else
create_engine_status = create_engine_status = CreateMaceEngineFromProto(
CreateMaceEngineFromProto(model_graph_data.data(), reinterpret_cast<const unsigned char *>(model_graph_data->data()),
model_graph_data.size(), model_graph_data->length(),
model_weights_data, reinterpret_cast<const unsigned char *>(model_weights_data->data()),
model_weights_data_size, model_weights_data->length(),
input_names, input_names,
output_names, output_names,
config, config,
&engine); &engine);
#endif #endif
if (create_engine_status != MaceStatus::MACE_SUCCESS) { if (create_engine_status != MaceStatus::MACE_SUCCESS) {
...@@ -324,7 +269,8 @@ bool RunModel(const std::vector<std::string> &input_names, ...@@ -324,7 +269,8 @@ bool RunModel(const std::vector<std::string> &input_names,
inputs_size[input_names[i]] = input_size; inputs_size[input_names[i]] = input_size;
auto buffer_in = std::shared_ptr<float>(new float[input_size], auto buffer_in = std::shared_ptr<float>(new float[input_size],
std::default_delete<float[]>()); std::default_delete<float[]>());
inputs[input_names[i]] = mace::MaceTensor(input_shapes[i], buffer_in); inputs[input_names[i]] = mace::MaceTensor(input_shapes[i], buffer_in,
input_data_formats[i]);
} }
for (size_t i = 0; i < output_count; ++i) { for (size_t i = 0; i < output_count; ++i) {
...@@ -333,7 +279,8 @@ bool RunModel(const std::vector<std::string> &input_names, ...@@ -333,7 +279,8 @@ bool RunModel(const std::vector<std::string> &input_names,
std::multiplies<int64_t>()); std::multiplies<int64_t>());
auto buffer_out = std::shared_ptr<float>(new float[output_size], auto buffer_out = std::shared_ptr<float>(new float[output_size],
std::default_delete<float[]>()); std::default_delete<float[]>());
outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out); outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out,
output_data_formats[i]);
} }
if (!FLAGS_input_dir.empty()) { if (!FLAGS_input_dir.empty()) {
@@ -430,10 +377,6 @@ bool RunModel(const std::vector<std::string> &input_names,
     }
   }
-  if (model_weights_data != nullptr) {
-    MemoryUnMap(model_weights_data, model_weights_data_size);
-  }
   std::cout << "Finished" << std::endl;
   return true;
@@ -466,13 +409,10 @@ int Main(int argc, char **argv) {
             << FLAGS_cpu_affinity_policy
             << std::endl;
-  std::vector<std::string> input_names = str_util::Split(FLAGS_input_node, ',');
-  std::vector<std::string> output_names =
-      str_util::Split(FLAGS_output_node, ',');
-  std::vector<std::string> input_shapes =
-      str_util::Split(FLAGS_input_shape, ':');
-  std::vector<std::string> output_shapes =
-      str_util::Split(FLAGS_output_shape, ':');
+  std::vector<std::string> input_names = Split(FLAGS_input_node, ',');
+  std::vector<std::string> output_names = Split(FLAGS_output_node, ',');
+  std::vector<std::string> input_shapes = Split(FLAGS_input_shape, ':');
+  std::vector<std::string> output_shapes = Split(FLAGS_output_shape, ':');
   const size_t input_count = input_shapes.size();
   const size_t output_count = output_shapes.size();
@@ -485,11 +425,25 @@ int Main(int argc, char **argv) {
     ParseShape(output_shapes[i], &output_shape_vec[i]);
   }
+  std::vector<std::string> raw_input_data_formats =
+      Split(FLAGS_input_data_format, ',');
+  std::vector<std::string> raw_output_data_formats =
+      Split(FLAGS_output_data_format, ',');
+  std::vector<DataFormat> input_data_formats(input_count);
+  std::vector<DataFormat> output_data_formats(output_count);
+  for (size_t i = 0; i < input_count; ++i) {
+    input_data_formats[i] = ParseDataFormat(raw_input_data_formats[i]);
+  }
+  for (size_t i = 0; i < output_count; ++i) {
+    output_data_formats[i] = ParseDataFormat(raw_output_data_formats[i]);
+  }
   bool ret = false;
   for (int i = 0; i < FLAGS_restart_round; ++i) {
     std::cout << "restart round " << i << std::endl;
     ret =
-        RunModel(input_names, input_shape_vec, output_names, output_shape_vec);
+        RunModel(input_names, input_shape_vec, input_data_formats,
+                 output_names, output_shape_vec, output_data_formats);
   }
   if (ret) {
     return 0;
......
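The reworked `Main()` above splits comma-delimited node/format lists and colon-delimited shape lists, then pairs each input with a parsed data format. A rough Python sketch of that flag parsing (the helper names `split` and `parse_shape` are illustrative, not MACE APIs):

```python
# Sketch of mace_run's flag parsing: shapes are ':'-separated groups of
# ','-separated dims; data formats are ','-separated and must line up
# index-by-index with the shapes.

def split(s, sep):
    """Mimic mace::Split: an empty string yields an empty list."""
    return s.split(sep) if s else []

def parse_shape(s):
    return [int(d) for d in s.split(",")]

input_shapes = split("1,224,224,3:1,192,192,3", ":")
input_formats = split("NHWC,NHWC", ",")

shape_vec = [parse_shape(s) for s in input_shapes]
assert len(shape_vec) == len(input_formats)
print(shape_vec[0])  # [1, 224, 224, 3]
```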
@@ -10,13 +10,14 @@ licenses(["notice"])  # Apache 2.0
 load(
     "//mace:mace.bzl",
     "if_android",
+    "if_linux",
+    "if_darwin",
     "if_neon_enabled",
-    "if_neon_enabled_str",
     "if_openmp_enabled",
     "if_android_armv7",
     "if_hexagon_enabled",
+    "if_hta_enabled",
     "if_opencl_enabled",
-    "if_opencl_enabled_str",
     "if_quantize_enabled",
 )
@@ -40,6 +41,8 @@ cc_library(
         "-DMACE_ENABLE_QUANTIZE",
     ]) + if_hexagon_enabled([
         "-DMACE_ENABLE_HEXAGON",
+    ]) + if_hta_enabled([
+        "-DMACE_ENABLE_HTA",
     ]),
     deps = [
         "//mace/ops",
@@ -77,6 +80,7 @@ cc_library(
     visibility = ["//visibility:public"],
 )
+# For details, see https://github.com/bazelbuild/bazel/issues/5200
 genrule(
     name = "libmace_static",
     srcs = [
@@ -87,10 +91,19 @@ genrule(
         "//mace/ops:internal_ops",
         "//mace/ops",
         "//mace/libmace",
+        "//mace/port:port_base",
+        "//mace/port/posix:port_posix",
+        "//mace/public",
         "//mace/utils",
         "//mace/proto:mace_cc",
         "@com_google_protobuf//:protobuf_lite",
-    ] + if_opencl_enabled([
+    ] + if_android([
+        "//mace/port/android:port_android",
+    ]) + if_linux([
+        "//mace/port/linux:port_linux",
+    ]) + if_darwin([
+        "//mace/port/darwin:port_darwin",
+    ]) + if_opencl_enabled([
         "//mace/ops:opencl_kernels",
         "//mace/codegen:generated_opencl",
     ]) + if_neon_enabled([
@@ -103,20 +116,44 @@ genrule(
         "$(locations //mace/core:core) " +
         "$(locations //mace/ops:common) " +
         "$(locations //mace/ops:ref_kernels) " +
-        if_neon_enabled_str("$(locations //mace/ops:arm_neon_kernels) ") +
-        if_opencl_enabled_str("$(locations //mace/ops:opencl_kernels) ") +
+        if_neon_enabled(
+            "$(locations //mace/ops:arm_neon_kernels) ",
+            default_value = "",
+        ) +
+        if_opencl_enabled(
+            "$(locations //mace/ops:opencl_kernels) ",
+            default_value = "",
+        ) +
         "$(locations //mace/ops:internal_ops) " +
         "$(locations //mace/ops:ops) " +
         "$(locations //mace/libmace:libmace) " +
+        "$(locations //mace/port:port_base) " +
+        "$(locations //mace/port/posix:port_posix) " +
+        if_android(
+            "$(locations //mace/port/android:port_android) ",
+            default_value = "",
+        ) +
+        if_linux(
+            "$(locations //mace/port/linux:port_linux) ",
+            default_value = "",
+        ) +
+        if_darwin(
+            "$(locations //mace/port/darwin:port_darwin) ",
+            default_value = "",
+        ) +
+        "$(locations //mace/public:public) " +
         "$(locations //mace/utils:utils) " +
         "$(locations //mace/proto:mace_cc) " +
         "$(locations @com_google_protobuf//:protobuf_lite) " +
-        if_opencl_enabled_str("$(locations //mace/codegen:generated_opencl) ") +
+        if_opencl_enabled(
+            "$(locations //mace/codegen:generated_opencl) ",
+            default_value = "",
+        ) +
         "$@ " +
         "$$tmp_mri_file);" +
         "$(AR) -M <$$tmp_mri_file;" +
-        "rm -rf $$tmp_mri_file;" +
-        "$(STRIP) -x $@;",
+        "rm -rf $$tmp_mri_file;",
+    # "$(STRIP) -x $@;",  # FIXME this will crash
     tools = ["//mace/python/tools:archive_static_lib"],
     visibility = ["//visibility:public"],
 )
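The `libmace_static` genrule above pipes an MRI script into `ar -M` so that the per-target archives are merged into a single `libmace.a`. A small Python sketch of what such an MRI script looks like (the library file names here are hypothetical):

```python
# Sketch (not MACE's actual archiver script) of the "ar -M" MRI approach:
# the script creates the output archive, pulls every member archive in
# with "addlib", then saves the result.

def make_mri_script(output, libs):
    lines = [f"create {output}"]
    lines += [f"addlib {lib}" for lib in libs]
    lines += ["save", "end"]
    return "\n".join(lines) + "\n"

script = make_mri_script("libmace.a", ["libcore.a", "libops.a"])
print(script)
```

The script would then be fed to the archiver as `ar -M < script.mri`.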
@@ -142,14 +142,15 @@ void BMNet::SetUp() {
   // Add input and output information
   for (size_t i = 0; i < input_names_.size(); ++i) {
-    InputInfo *info = net_.add_input_info();
+    InputOutputInfo *info = net_.add_input_info();
+    info->set_data_format(DataFormat::NHWC);
     info->set_name(input_names_[i]);
     for (auto d : input_shapes_[i]) {
       info->add_dims(static_cast<int>(d));
     }
   }
   for (auto output_name : output_names_) {
-    OutputInfo *info = net_.add_output_info();
+    InputOutputInfo *info = net_.add_output_info();
     info->set_name(output_name);
   }
   // allocate weight data
@@ -243,8 +244,8 @@ void BMNet::AddConv(const std::string &conv_type,
   op_def->add_output(output_name);
   AddIntsArg(op_def, "strides", strides);
   AddIntArg(op_def, "padding", padding_type);
+  AddIntArg(op_def, "has_data_format", 1);
   AddIntArg(op_def, "T", DT_HALF);
-  AddIntArg(op_def, "data_format", 1);
   if (has_relu6) {
     AddStringArg(op_def, "activation", "RELUX");
     AddFloatArg(op_def, "max_limit", 6);
@@ -270,7 +271,7 @@ void BMNet::AddEltwise(const std::string &op_name,
   op_def->add_output(output);
   AddIntArg(op_def, "type", type);
   AddIntArg(op_def, "T", DT_HALF);
-  AddIntArg(op_def, "data_format", 1);
+  AddIntArg(op_def, "has_data_format", 1);
   OutputShape *shape = op_def->add_output_shape();
   for (auto dim : output_shape) {
     shape->add_dims(dim);
......
@@ -21,17 +21,21 @@
 #include "mace/core/net.h"
 #include "mace/ops/ops_registry.h"
 #include "mace/ops/common/transpose.h"
+#include "mace/utils/math.h"
+#include "mace/utils/memory.h"
+#include "mace/utils/stl_util.h"
 #include "mace/public/mace.h"
+#include "mace/port/env.h"
+#include "mace/port/file_system.h"
 #ifdef MACE_ENABLE_OPENCL
 #include "mace/core/runtime/opencl/gpu_device.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #endif  // MACE_ENABLE_OPENCL
-#ifdef MACE_ENABLE_HEXAGON
-#include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
+#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
 #include "mace/core/runtime/hexagon/hexagon_device.h"
-#endif  // MACE_ENABLE_HEXAGON
+#endif
 namespace mace {
 namespace {
@@ -289,7 +293,10 @@ MaceTensor::MaceTensor(const std::vector<int64_t> &shape,
                        std::shared_ptr<float> data,
                        const DataFormat format) {
   MACE_CHECK_NOTNULL(data.get());
-  impl_ = std::unique_ptr<MaceTensor::Impl>(new MaceTensor::Impl());
+  MACE_CHECK(format == DataFormat::NHWC || format == DataFormat::NCHW
+                 || format == OIHW,
+             "MACE only support NHWC, NCHW and OIHW formats of input now.");
+  impl_ = make_unique<MaceTensor::Impl>();
   impl_->shape = shape;
   impl_->data = data;
   impl_->format = format;
@@ -298,11 +305,11 @@ MaceTensor::MaceTensor(const std::vector<int64_t> &shape,
 }
 MaceTensor::MaceTensor() {
-  impl_ = std::unique_ptr<MaceTensor::Impl>(new MaceTensor::Impl());
+  impl_ = make_unique<MaceTensor::Impl>();
 }
 MaceTensor::MaceTensor(const MaceTensor &other) {
-  impl_ = std::unique_ptr<MaceTensor::Impl>(new MaceTensor::Impl());
+  impl_ = make_unique<MaceTensor::Impl>();
   impl_->shape = other.shape();
   impl_->data = other.data();
   impl_->format = other.data_format();
@@ -310,7 +317,7 @@ MaceTensor::MaceTensor(const MaceTensor &other) {
 }
 MaceTensor::MaceTensor(const MaceTensor &&other) {
-  impl_ = std::unique_ptr<MaceTensor::Impl>(new MaceTensor::Impl());
+  impl_ = make_unique<MaceTensor::Impl>();
   impl_->shape = other.shape();
   impl_->data = other.data();
   impl_->format = other.data_format();
@@ -375,33 +382,31 @@ class MaceEngine::Impl {
               std::pair<const std::string, MaceTensor> *output);
  private:
-  const unsigned char *model_data_;
-  size_t model_data_size_;
+  std::unique_ptr<port::ReadOnlyMemoryRegion> model_data_;
   std::unique_ptr<OpRegistryBase> op_registry_;
   DeviceType device_type_;
   std::unique_ptr<Device> device_;
   std::unique_ptr<Workspace> ws_;
   std::unique_ptr<NetBase> net_;
   bool is_quantized_model_;
-#ifdef MACE_ENABLE_HEXAGON
+#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
   std::unique_ptr<HexagonControlWrapper> hexagon_controller_;
 #endif
-  std::map<std::string, mace::InputInfo> input_info_map_;
-  std::map<std::string, mace::OutputInfo> output_info_map_;
+  std::map<std::string, mace::InputOutputInfo> input_info_map_;
+  std::map<std::string, mace::InputOutputInfo> output_info_map_;
   MACE_DISABLE_COPY_AND_ASSIGN(Impl);
 };
 MaceEngine::Impl::Impl(const MaceEngineConfig &config)
     : model_data_(nullptr),
-      model_data_size_(0),
       op_registry_(new OpRegistry),
       device_type_(config.impl_->device_type()),
       device_(nullptr),
       ws_(new Workspace()),
       net_(nullptr),
       is_quantized_model_(false)
-#ifdef MACE_ENABLE_HEXAGON
+#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
       , hexagon_controller_(nullptr)
 #endif
 {
@@ -424,9 +429,9 @@ MaceEngine::Impl::Impl(const MaceEngineConfig &config)
         config.impl_->use_gemmlowp()));
   }
 #endif
-#ifdef MACE_ENABLE_HEXAGON
-  if (device_type_ == DeviceType::HEXAGON) {
-    device_.reset(new HexagonDevice());
+#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
+  if (device_type_ == DeviceType::HEXAGON || device_type_ == DeviceType::HTA) {
+    device_.reset(new HexagonDevice(device_type_));
   }
 #endif
   MACE_CHECK_NOTNULL(device_);
@@ -468,6 +473,9 @@ MaceStatus MaceEngine::Impl::Init(
       shape[i] = input_info_map_[input_name].dims(i);
     }
     input_tensor->Resize(shape);
+    // Set to the default data format
+    input_tensor->set_data_format(static_cast<DataFormat>(
+        input_info_map_[input_name].data_format()));
   }
   for (auto output_name : output_nodes) {
     if (output_info_map_.find(output_name) == output_info_map_.end()) {
@@ -475,15 +483,17 @@ MaceStatus MaceEngine::Impl::Init(
                  << "' does not belong to model's outputs "
                  << MakeString(MapKeys(output_info_map_));
     }
+#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
     ws_->CreateTensor(output_name, device_->allocator(), DT_FLOAT);
+#endif
   }
-#ifdef MACE_ENABLE_HEXAGON
-  if (device_type_ == HEXAGON) {
-    hexagon_controller_.reset(new HexagonControlWrapper());
+#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
+  if (device_type_ == HEXAGON || device_type_ == HTA) {
+    hexagon_controller_ = CreateHexagonControlWrapper(device_type_);
     MACE_CHECK(hexagon_controller_->Config(), "hexagon config error");
     MACE_CHECK(hexagon_controller_->Init(), "hexagon init error");
     hexagon_controller_->SetDebugLevel(
-        static_cast<int>(mace::logging::LogMessage::MinVLogLevel()));
+        static_cast<int>(mace::port::MinVLogLevelFromEnv()));
     MACE_CHECK(hexagon_controller_->SetupGraph(*net_def, model_data),
                "hexagon setup graph error");
     if (VLOG_IS_ON(2)) {
@@ -511,7 +521,7 @@ MaceStatus MaceEngine::Impl::Init(
       ws_->RemoveAndReloadBuffer(*net_def, model_data, device_->allocator());
     }
     MACE_RETURN_IF_ERROR(net_->Init());
-#ifdef MACE_ENABLE_HEXAGON
+#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
   }
 #endif
@@ -525,25 +535,25 @@ MaceStatus MaceEngine::Impl::Init(
     const std::string &model_data_file) {
   LOG(INFO) << "Loading Model Data";
-  MemoryMap(model_data_file, &model_data_, &model_data_size_);
-  MACE_RETURN_IF_ERROR(Init(net_def, input_nodes, output_nodes, model_data_));
+  auto fs = GetFileSystem();
+  MACE_RETURN_IF_ERROR(fs->NewReadOnlyMemoryRegionFromFile(
+      model_data_file.c_str(), &model_data_));
+  MACE_RETURN_IF_ERROR(Init(net_def, input_nodes, output_nodes,
+      reinterpret_cast<const unsigned char *>(model_data_->data())));
   if (device_type_ == DeviceType::GPU || device_type_ == DeviceType::HEXAGON ||
+      device_type_ == DeviceType::HTA ||
       (device_type_ == DeviceType::CPU && ws_->diffused_buffer())) {
-    MemoryUnMap(model_data_, model_data_size_);
-    model_data_ = nullptr;
+    model_data_.reset();
   }
   return MaceStatus::MACE_SUCCESS;
 }
 MaceEngine::Impl::~Impl() {
   LOG(INFO) << "Destroying MaceEngine";
-  if (model_data_ != nullptr) {
-    MemoryUnMap(model_data_, model_data_size_);
-  }
-#ifdef MACE_ENABLE_HEXAGON
-  if (device_type_ == HEXAGON) {
+#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
+  if (device_type_ == HEXAGON || device_type_ == HTA) {
     if (VLOG_IS_ON(2)) {
       hexagon_controller_->GetPerfInfo();
       hexagon_controller_->PrintLog();
@@ -557,47 +567,51 @@ MaceEngine::Impl::~Impl() {
 MaceStatus MaceEngine::Impl::TransposeInput(
     const std::pair<const std::string, MaceTensor> &input,
     Tensor *input_tensor) {
-  if (device_->device_type() == DeviceType::CPU &&
-      input.second.shape().size() == 4 &&
-      input.second.data_format() == NHWC &&
-      !is_quantized_model_) {
-    VLOG(1) << "Transform input " << input.first << " from NHWC to NCHW";
-    input_tensor->set_data_format(DataFormat::NCHW);
-    std::vector<int> dst_dims = {0, 3, 1, 2};
-    std::vector<index_t> output_shape =
-        TransposeShape<int64_t, index_t>(input.second.shape(), dst_dims);
-    MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape));
-    Tensor::MappingGuard input_guard(input_tensor);
-    float *input_data = input_tensor->mutable_data<float>();
-    return ops::Transpose(input.second.data().get(),
-                          input.second.shape(),
-                          dst_dims,
-                          input_data);
-  } else if (
-      (is_quantized_model_ || device_->device_type() == DeviceType::GPU) &&
-      input.second.shape().size() == 4 &&
-      input.second.data_format() == DataFormat::NCHW) {
-    VLOG(1) << "Transform input " << input.first << " from NCHW to NHWC";
-    std::vector<int> dst_dims = {0, 2, 3, 1};
-    input_tensor->set_data_format(DataFormat::NHWC);
-    std::vector<index_t> output_shape =
-        TransposeShape<int64_t, index_t>(input.second.shape(), dst_dims);
-    MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape));
-    Tensor::MappingGuard input_guard(input_tensor);
-    float *input_data = input_tensor->mutable_data<float>();
-    return ops::Transpose(input.second.data().get(),
-                          input.second.shape(),
-                          dst_dims,
-                          input_data);
-  } else {
-    input_tensor->set_data_format(input.second.data_format());
-    MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape()));
-    Tensor::MappingGuard input_guard(input_tensor);
-    float *input_data = input_tensor->mutable_data<float>();
-    memcpy(input_data, input.second.data().get(),
-           input_tensor->size() * sizeof(float));
-    return MaceStatus::MACE_SUCCESS;
+  bool has_data_format = input_tensor->data_format() != DataFormat::DF_NONE;
+  DataFormat data_format = DataFormat::DF_NONE;
+  if (has_data_format) {
+    if (device_->device_type() == DeviceType::CPU &&
+        input.second.shape().size() == 4 &&
+        input.second.data_format() == NHWC &&
+        !is_quantized_model_) {
+      VLOG(1) << "Transform input " << input.first << " from NHWC to NCHW";
+      input_tensor->set_data_format(DataFormat::NCHW);
+      std::vector<int> dst_dims = {0, 3, 1, 2};
+      std::vector<index_t> output_shape =
+          TransposeShape<int64_t, index_t>(input.second.shape(), dst_dims);
+      MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape));
+      Tensor::MappingGuard input_guard(input_tensor);
+      float *input_data = input_tensor->mutable_data<float>();
+      return ops::Transpose(input.second.data().get(),
+                            input.second.shape(),
+                            dst_dims,
+                            input_data);
+    } else if (
+        (is_quantized_model_ || device_->device_type() == DeviceType::GPU) &&
+        input.second.shape().size() == 4 &&
+        input.second.data_format() == DataFormat::NCHW) {
+      VLOG(1) << "Transform input " << input.first << " from NCHW to NHWC";
+      std::vector<int> dst_dims = {0, 2, 3, 1};
+      input_tensor->set_data_format(DataFormat::NHWC);
+      std::vector<index_t> output_shape =
+          TransposeShape<int64_t, index_t>(input.second.shape(), dst_dims);
+      MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape));
+      Tensor::MappingGuard input_guard(input_tensor);
+      float *input_data = input_tensor->mutable_data<float>();
+      return ops::Transpose(input.second.data().get(),
+                            input.second.shape(),
+                            dst_dims,
+                            input_data);
+    }
+    data_format = input.second.data_format();
   }
+  input_tensor->set_data_format(data_format);
+  MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape()));
+  Tensor::MappingGuard input_guard(input_tensor);
+  float *input_data = input_tensor->mutable_data<float>();
+  memcpy(input_data, input.second.data().get(),
+         input_tensor->size() * sizeof(float));
+  return MaceStatus::MACE_SUCCESS;
 }
 MaceStatus MaceEngine::Impl::TransposeOutput(
@@ -605,38 +619,28 @@ MaceStatus MaceEngine::Impl::TransposeOutput(
     std::pair<const std::string, mace::MaceTensor> *output) {
   // save output
   if (output_tensor != nullptr && output->second.data() != nullptr) {
-    if (device_->device_type() == DeviceType::CPU &&
-        output->second.shape().size() == 4 &&
-        output->second.data_format() != output_tensor->data_format()) {
-      MACE_CHECK(output_tensor->data_format() == NCHW);
-      VLOG(1) << "Transform output " << output->first << " from NCHW to NHWC";
-      std::vector<int> dst_dims = {0, 2, 3, 1};
-      std::vector<index_t> shape =
-          TransposeShape<index_t, index_t>(output_tensor->shape(),
-                                           dst_dims);
-      int64_t output_size = std::accumulate(shape.begin(), shape.end(), 1,
-                                            std::multiplies<int64_t>());
-      MACE_CHECK(output_size <= output->second.impl_->buffer_size)
-          << "Output size exceeds buffer size: shape"
-          << MakeString<int64_t>(shape) << " vs buffer size "
-          << output->second.impl_->buffer_size;
-      output->second.impl_->shape = shape;
-      Tensor::MappingGuard output_guard(output_tensor);
-      const float *output_data = output_tensor->data<float>();
-      return ops::Transpose(output_data,
-                            output_tensor->shape(),
-                            dst_dims,
-                            output->second.data().get());
-    } else if (device_->device_type() == DeviceType::GPU &&
-               output->second.shape().size() == 4 &&
-               output->second.data_format() != output_tensor->data_format()) {
+    if (output_tensor->data_format() != DataFormat::DF_NONE &&
+        output->second.data_format() != DataFormat::DF_NONE &&
+        output->second.shape().size() == 4 &&
+        output->second.data_format() != output_tensor->data_format()) {
       VLOG(1) << "Transform output " << output->first << " from "
               << output_tensor->data_format() << " to "
              << output->second.data_format();
-      std::vector<int> dst_dims = {0, 3, 1, 2};
-      if (output_tensor->data_format() == NCHW) {
+      std::vector<int> dst_dims;
+      if (output_tensor->data_format() == NCHW &&
+          output->second.data_format() == NHWC) {
         dst_dims = {0, 2, 3, 1};
+      } else if (output_tensor->data_format() == NHWC &&
+                 output->second.data_format() == NCHW) {
+        dst_dims = {0, 3, 1, 2};
+      } else {
+        LOG(FATAL) << "Not supported output data format: "
+                   << output->second.data_format() << " vs "
+                   << output_tensor->data_format();
      }
       std::vector<index_t> shape =
           TransposeShape<index_t, index_t>(output_tensor->shape(),
                                            dst_dims);
@@ -698,15 +702,15 @@ MaceStatus MaceEngine::Impl::Run(
     Tensor *output_tensor = ws_->GetTensor(output.first);
     output_tensors.push_back(output_tensor);
   }
-#ifdef MACE_ENABLE_HEXAGON
-  if (device_type_ == HEXAGON) {
+#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
+  if (device_type_ == HEXAGON || device_type_ == HTA) {
     MACE_CHECK(input_tensors.size() == 1 && output_tensors.size() == 1,
                "HEXAGON not support multiple inputs and outputs yet.");
     hexagon_controller_->ExecuteGraphNew(input_tensors, &output_tensors);
   } else {
 #endif
   MACE_RETURN_IF_ERROR(net_->Run(run_metadata));
-#ifdef MACE_ENABLE_HEXAGON
+#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
   }
 #endif
@@ -725,7 +729,7 @@ MaceStatus MaceEngine::Impl::Run(
 }
 MaceEngine::MaceEngine(const MaceEngineConfig &config):
-    impl_(new MaceEngine::Impl(config)) {}
+    impl_(make_unique<MaceEngine::Impl>(config)) {}
 MaceEngine::~MaceEngine() = default;
......
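The `dst_dims` permutations used in `TransposeInput`/`TransposeOutput` above ({0, 3, 1, 2} for NHWC to NCHW, {0, 2, 3, 1} for the reverse) can be sketched standalone in Python. This uses nested lists and explicit strides instead of MACE's flat buffers and `ops::Transpose`, purely as an illustration:

```python
def transpose_shape(shape, dst_dims):
    # Output dim j takes its extent from source dim dst_dims[j].
    return [shape[d] for d in dst_dims]

def transpose(data, shape, dst_dims):
    # Row-major strides of the source layout.
    strides = [1] * len(shape)
    for i in range(len(shape) - 2, -1, -1):
        strides[i] = strides[i + 1] * shape[i + 1]
    out_shape = transpose_shape(shape, dst_dims)
    out = []

    def walk(idx):
        if len(idx) == len(out_shape):
            # idx is in output order; map each source dim back to it.
            src = sum(idx[dst_dims.index(d)] * strides[d]
                      for d in range(len(shape)))
            out.append(data[src])
            return
        for i in range(out_shape[len(idx)]):
            walk(idx + [i])

    walk([])
    return out

# A 1x2x2x2 NHWC buffer transposed to NCHW with dst_dims {0, 3, 1, 2}:
nchw = transpose(list(range(8)), [1, 2, 2, 2], [0, 3, 1, 2])
print(nchw)  # [0, 2, 4, 6, 1, 3, 5, 7]
```

Applying the inverse permutation {0, 2, 3, 1} to the result recovers the original NHWC order, mirroring what `TransposeOutput` does for a CPU model whose caller asked for NHWC output.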
@@ -7,19 +7,20 @@ mace {
     *CreateMaceEngineFromProto*;
     *GetBigLittleCoreIDs*;
     *MaceVersion*;
+    *GetCapability*;
     # api for static library of models
-    *mace*logging*LogMessage*;
+    *mace*port**;
     *mace*MaceStatus*;
     *mace*NetDef*;
     *mace*MemoryType*;
     *mace*DataType*;
-    *mace*InputInfo*;
-    *mace*OutputInfo*;
+    *mace*InputOutputInfo*;
     *mace*OutputShape*;
     *mace*OperatorDef*;
     *mace*ConstTensor*;
     *mace*Argument*;
+    *mace*Split*;
     *mace*MemoryBlock*;
     *google*protobuf*;
......
 # -*- Python -*-
-def if_android(a):
+def if_android(a, default_value = []):
     return select({
         "//mace:android": a,
-        "//conditions:default": [],
+        "//conditions:default": default_value,
     })
-def if_not_android(a):
+def if_linux(a, default_value = []):
     return select({
-        "//mace:android": [],
-        "//conditions:default": a,
+        "//mace:linux": a,
+        "//conditions:default": default_value,
+    })
+
+def if_darwin(a, default_value = []):
+    return select({
+        "//mace:darwin": a,
+        "//conditions:default": default_value,
     })
 def if_android_armv7(a):
@@ -36,16 +42,10 @@ def if_arm_linux_armhf(a):
         "//conditions:default": []
     })
-def if_neon_enabled(a):
-    return select({
-        "//mace:neon_enabled": a,
-        "//conditions:default": [],
-    })
-
-def if_neon_enabled_str(a):
+def if_neon_enabled(a, default_value = []):
     return select({
         "//mace:neon_enabled": a,
-        "//conditions:default": "",
+        "//conditions:default": default_value,
     })
 def if_hexagon_enabled(a):
@@ -60,22 +60,29 @@ def if_not_hexagon_enabled(a):
         "//conditions:default": a,
     })
-def if_openmp_enabled(a):
+def if_hta_enabled(a):
     return select({
-        "//mace:openmp_enabled": a,
+        "//mace:hta_enabled": a,
         "//conditions:default": [],
     })
-def if_opencl_enabled(a):
+def if_hexagon_or_hta_enabled(a):
     return select({
-        "//mace:opencl_enabled": a,
+        "//mace:hexagon_enabled": a,
+        "//mace:hta_enabled": a,
+        "//conditions:default": [],
+    })
+
+def if_openmp_enabled(a):
+    return select({
+        "//mace:openmp_enabled": a,
         "//conditions:default": [],
     })
-def if_opencl_enabled_str(a):
+def if_opencl_enabled(a, default_value = []):
     return select({
         "//mace:opencl_enabled": a,
-        "//conditions:default": "",
+        "//conditions:default": default_value,
     })
 def if_quantize_enabled(a):
......
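The `default_value` parameter introduced in mace.bzl above lets one helper serve both list contexts (e.g. `deps`, default `[]`) and string concatenation in a genrule `cmd` (default `""`), which is what made the separate `_str` variants removable. A Python sketch of the select semantics (the `fake_select` resolver is illustrative; real Bazel resolves `select()` at analysis time against `config_setting` targets):

```python
def fake_select(branches, active_condition):
    # Bazel picks the branch whose condition matches, else the default.
    return branches.get(active_condition, branches["//conditions:default"])

def if_opencl_enabled(a, default_value=[]):
    # Mirrors the Starlark helper: returns a resolver instead of a select().
    return lambda active: fake_select({
        "//mace:opencl_enabled": a,
        "//conditions:default": default_value,
    }, active)

# List context (deps) keeps the [] default; string context (cmd) passes "".
deps = if_opencl_enabled(["//mace/ops:opencl_kernels"])
cmd = if_opencl_enabled("$(locations //mace/ops:opencl_kernels) ",
                        default_value="")
print(deps("//conditions:default"))  # []
print(cmd("//conditions:default"))   # (empty string)
```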
@@ -54,37 +54,17 @@ cc_library(
 cc_library(
     name = "testing",
-    srcs = glob(
-        [
-            "testing/*.cc",
-        ],
-    ),
-    hdrs = glob(
-        [
-            "testing/*.h",
-        ],
-    ),
+    hdrs = [
+        "testing/test_utils.h",
+    ],
     copts = [
         "-Werror",
         "-Wextra",
         "-Wno-missing-field-initializers",
-    ] + if_openmp_enabled([
-        "-fopenmp",
-    ]) + if_neon_enabled([
-        "-DMACE_ENABLE_NEON",
-    ]) + if_android_armv7([
-        "-mfpu=neon",
-        "-mfloat-abi=softfp",
-    ]) + if_opencl_enabled([
-        "-DMACE_ENABLE_OPENCL",
-    ]) + if_quantize_enabled([
-        "-DMACE_ENABLE_QUANTIZE",
-    ]) + if_hexagon_enabled([
-        "-DMACE_ENABLE_HEXAGON",
-    ]),
+    ],
     deps = [
         "//mace/core",
-        "@gtest//:gtest",
+        "@gtest",
     ],
 )
@@ -254,7 +234,7 @@ cc_library(
         ":arm_neon_kernels",
         ":ref_kernels",
         ":testing",
-        "@gtest//:gtest",
+        "@gtest",
     ],
     alwayslink = 1,
 )
@@ -289,7 +269,7 @@ cc_library(
         ":opencl_kernels",
         ":ref_kernels",
         ":testing",
-        "@gtest//:gtest",
+        "@gtest",
     ],
     alwayslink = 1,
 )
@@ -329,12 +309,12 @@ cc_library(
             "ops_registry.h",
             "ops_test_util.h",
             "fixpoint.h",
-            "gemmlowp_util.h",
+            "common/gemmlowp_util.h",
             "quantization_util.h",
         ],
     ) + if_quantize_enabled(glob([
         "fixpoint.h",
-        "gemmlowp_util.h",
+        "common/gemmlowp_util.h",
         "quantization_util.h",
     ])),
     copts = [
......
@@ -22,6 +22,7 @@
 #include "mace/ops/opencl/buffer_transformer.h"
 #include "mace/ops/opencl/image/activation.h"
 #endif  // MACE_ENABLE_OPENCL
+#include "mace/utils/memory.h"
 namespace mace {
 namespace ops {
@@ -88,9 +89,8 @@ class ActivationOp<DeviceType::GPU, T> : public Operation {
     MemoryType mem_type;
     if (context->device()->gpu_runtime()->UseImageMemory()) {
       mem_type = MemoryType::GPU_IMAGE;
-      kernel_.reset(
-          new opencl::image::ActivationKernel<T>(type, relux_max_limit,
-                                                 leakyrelu_coefficient));
+      kernel_ = make_unique<opencl::image::ActivationKernel<T>>(
+          type, relux_max_limit, leakyrelu_coefficient);
     } else {
       MACE_NOT_IMPLEMENTED;
     }
......
@@ -24,6 +24,7 @@
 #ifdef MACE_ENABLE_OPENCL
 #include "mace/ops/opencl/image/addn.h"
 #endif  // MACE_ENABLE_OPENCL
+#include "mace/utils/memory.h"
 namespace mace {
 namespace ops {
@@ -107,7 +108,7 @@ class AddNOp<DeviceType::GPU, T> : public Operation {
   explicit AddNOp(OpConstructContext *context)
       : Operation(context) {
     if (context->device()->gpu_runtime()->UseImageMemory()) {
-      kernel_.reset(new opencl::image::AddNKernel<T>);
+      kernel_ = make_unique<opencl::image::AddNKernel<T>>();
     } else {
       MACE_NOT_IMPLEMENTED;
     }
......
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
......
// Copyright 2018 The MACE Authors. All Rights Reserved. // Copyright 2019 The MACE Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
......
// Copyright 2018 The MACE Authors. All Rights Reserved. // Copyright 2019 The MACE Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
......
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_CONV_2D_NEON_H_
#define MACE_OPS_ARM_CONV_2D_NEON_H_
#include "mace/core/types.h"
#include "mace/ops/sgemm.h"
namespace mace {
namespace ops {
void Conv2dNeonK1x1S1(const float *input,
const float *filter,
const index_t batch,
const index_t height,
const index_t width,
const index_t in_channels,
const index_t out_channels,
float *output,
SGemm *sgemm,
ScratchBuffer *scratch_buffer);
void Conv2dNeonK3x3S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK3x3S2(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK5x5S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK1x7S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK7x1S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK7x7S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK7x7S2(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK7x7S3(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK1x15S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Conv2dNeonK15x1S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output);
// accumulate the contribution of one input channel into one output channel
inline void Conv2dCPUKHxKWCalc(const float *in_ptr,
const float *filter_ptr,
const index_t in_width,
const index_t filter_height,
const index_t filter_width,
const index_t out_height,
const index_t out_width,
float *out_ptr,
const int stride) {
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w < out_width; ++w) {
for (int i = 0; i < filter_height; ++i) {
for (int j = 0; j < filter_width; ++j) {
out_ptr[h * out_width + w] +=
in_ptr[(h * stride + i) * in_width + (w * stride + j)] *
filter_ptr[i * filter_width + j];
}
}
}
}
}
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_CONV_2D_NEON_H_
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
#endif
#include "mace/ops/arm/conv_2d_neon.h"
#include "mace/utils/utils.h"
namespace mace {
namespace ops {
inline void Conv2dCPUK15x1Calc(const float *in_ptr,
const float *filter_ptr,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t w,
const index_t tile_width,
const index_t out_image_size,
float *out_ptr,
const index_t io,
const int stride) {
for (index_t ih = 0; ih < out_height; ++ih) {
for (index_t iw = 0; iw < tile_width && w + iw < out_width; ++iw) {
for (int i = 0; i < 15; ++i) {
for (int j = 0; j < 1; ++j) {
out_ptr[io * out_image_size + ih * out_width + w + iw] +=
in_ptr[(ih * stride + i) * in_width + ((w + iw) * stride + j)] *
filter_ptr[io * in_channels * 15 + i * 1 + j];
}
}
}
}
}
// Ho = 4, Wo = 1, Co = 1
void Conv2dNeonK15x1S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
const index_t tile_width =
out_shape[1] < 4 ? RoundUpDiv4(out_shape[3]) : out_shape[3];
#pragma omp parallel for collapse(3) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; ++m) {
for (index_t w = 0; w < out_shape[3]; w += tile_width) {
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_channels = in_shape[1];
const index_t in_width = in_shape[3];
float *out_ptr_base = output + b * out_batch_size + m * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr = filter + m * in_channels * 15 + c * 15;
#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
/* load the 15 filter taps (1 outch x 15 height x 1 width) into 4 vectors */
float32x4_t vf0, vf1, vf2, vf3;
vf0 = vld1q_f32(filter_ptr);
vf1 = vld1q_f32(filter_ptr + 4);
vf2 = vld1q_f32(filter_ptr + 8);
vf3 = vld1q_f32(filter_ptr + 11);  // overlapping load (taps 11-14); lane 0 duplicates vf2 and is unused
for (index_t h = 0; h + 3 < out_height; h += 4) {
for (index_t wt = 0; wt < tile_width && w + wt < out_width; ++wt) {
// load output
index_t out_offset = h * out_width + w + wt;
// output (1 outch x 4 height x 1 width): vo_outch_height
float32x4_t vo = {out_ptr_base[out_offset],
out_ptr_base[out_offset + out_width],
out_ptr_base[out_offset + 2 * out_width],
out_ptr_base[out_offset + 3 * out_width]};
// input offset
index_t in_offset = h * in_width + w + wt;
// input column vectors (4 rows each) covering the 18-row window
float32x4_t vi0 = {in_ptr_base[in_offset],
in_ptr_base[in_offset + in_width],
in_ptr_base[in_offset + 2 * in_width],
in_ptr_base[in_offset + 3 * in_width]};
float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width],
in_ptr_base[in_offset + 5 * in_width],
in_ptr_base[in_offset + 6 * in_width],
in_ptr_base[in_offset + 7 * in_width]};
float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width],
in_ptr_base[in_offset + 9 * in_width],
in_ptr_base[in_offset + 10 * in_width],
in_ptr_base[in_offset + 11 * in_width]};
float32x4_t vi12 = {in_ptr_base[in_offset + 12 * in_width],
in_ptr_base[in_offset + 13 * in_width],
in_ptr_base[in_offset + 14 * in_width],
in_ptr_base[in_offset + 15 * in_width]};
float32x4_t vi16 = {in_ptr_base[in_offset + 16 * in_width],
in_ptr_base[in_offset + 17 * in_width]};
float32x4_t vi1 = vextq_f32(vi0, vi4, 1);
float32x4_t vi2 = vextq_f32(vi0, vi4, 2);
float32x4_t vi3 = vextq_f32(vi0, vi4, 3);
float32x4_t vi5 = vextq_f32(vi4, vi8, 1);
float32x4_t vi6 = vextq_f32(vi4, vi8, 2);
float32x4_t vi7 = vextq_f32(vi4, vi8, 3);
float32x4_t vi9 = vextq_f32(vi8, vi12, 1);
float32x4_t vi10 = vextq_f32(vi8, vi12, 2);
float32x4_t vi11 = vextq_f32(vi8, vi12, 3);
float32x4_t vi13 = vextq_f32(vi12, vi16, 1);
float32x4_t vi14 = vextq_f32(vi12, vi16, 2);
vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf0), 0);
vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1);
vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0);
vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1);
vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0);
vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1);
vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0);
vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1);
vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0);
vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1);
vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0);
vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1);
vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1);
vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0);
vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1);
out_ptr_base[out_offset] = vo[0];
out_ptr_base[out_offset + out_width] = vo[1];
out_ptr_base[out_offset + 2 * out_width] = vo[2];
out_ptr_base[out_offset + 3 * out_width] = vo[3];
} // wt
} // h
#else
Conv2dCPUK15x1Calc(in_ptr_base, filter_ptr, in_width, in_channels,
out_height, out_width, w, tile_width,
out_image_size, out_ptr_base, 0, 1);
#endif
} // c
} // w
} // m
} // b
}
} // namespace ops
} // namespace mace
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
#endif
#include "mace/ops/arm/conv_2d_neon.h"
#include "mace/utils/logging.h"
#include "mace/utils/utils.h"
namespace mace {
namespace ops {
inline void Conv2dCPUK1x15Calc(const float *in_ptr,
const float *filter_ptr,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t h,
const index_t tile_height,
const index_t out_width,
const index_t out_image_size,
float *out_ptr,
const index_t io,
const int stride) {
for (index_t ih = 0; ih < tile_height && h + ih < out_height; ++ih) {
for (index_t iw = 0; iw < out_width; ++iw) {
for (int i = 0; i < 1; ++i) {
for (int j = 0; j < 15; ++j) {
out_ptr[io * out_image_size + (h + ih) * out_width + iw] +=
in_ptr[((h + ih) * stride + i) * in_width + (iw * stride + j)] *
filter_ptr[io * in_channels * 15 + i * 15 + j];
}
}
}
}
}
// Ho = 1, Wo = 4, Co = 1
void Conv2dNeonK1x15S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
const index_t tile_height =
out_shape[1] < 4 ? RoundUpDiv4(out_shape[2]) : out_shape[2];
#pragma omp parallel for collapse(3) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; ++m) {
for (index_t h = 0; h < out_shape[2]; h += tile_height) {
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_channels = in_shape[1];
const index_t in_width = in_shape[3];
float *out_ptr_base = output + b * out_batch_size + m * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr = filter + m * in_channels * 15 + c * 15;
#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
/* load the 15 filter taps (1 outch x 1 height x 15 width) into 4 vectors */
float32x4_t vf0, vf1, vf2, vf3;
vf0 = vld1q_f32(filter_ptr);
vf1 = vld1q_f32(filter_ptr + 4);
vf2 = vld1q_f32(filter_ptr + 8);
vf3 = vld1q_f32(filter_ptr + 11);  // overlapping load (taps 11-14); lane 0 duplicates vf2 and is unused
for (index_t ht = 0; ht < tile_height && h + ht < out_height; ++ht) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// output (1 outch x 1 height x 4 width): vo_outch_height
float32x4_t vo;
// load output
index_t out_offset = (h + ht) * out_width + w;
vo = vld1q_f32(out_ptr_base + out_offset);
// 18-wide input window: 5 aligned loads plus shifted views
float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7, vi8, vi9,
vi10, vi11, vi12, vi13, vi14, vi16;
// input offset
index_t in_offset = (h + ht) * in_width + w;
// load input
vi0 = vld1q_f32(in_ptr_base + in_offset);
vi4 = vld1q_f32(in_ptr_base + in_offset + 4);
vi8 = vld1q_f32(in_ptr_base + in_offset + 8);
vi12 = vld1q_f32(in_ptr_base + in_offset + 12);
vi16 = vld1q_f32(in_ptr_base + in_offset + 16);
vi1 = vextq_f32(vi0, vi4, 1);
vi2 = vextq_f32(vi0, vi4, 2);
vi3 = vextq_f32(vi0, vi4, 3);
vi5 = vextq_f32(vi4, vi8, 1);
vi6 = vextq_f32(vi4, vi8, 2);
vi7 = vextq_f32(vi4, vi8, 3);
vi9 = vextq_f32(vi8, vi12, 1);
vi10 = vextq_f32(vi8, vi12, 2);
vi11 = vextq_f32(vi8, vi12, 3);
vi13 = vextq_f32(vi12, vi16, 1);
vi14 = vextq_f32(vi12, vi16, 2);
vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf0), 0);
vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1);
vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0);
vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1);
vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0);
vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1);
vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0);
vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1);
vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0);
vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1);
vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0);
vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1);
vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1);
vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0);
vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1);
vst1q_f32(out_ptr_base + out_offset, vo);
} // w
} // ht
#else
Conv2dCPUK1x15Calc(in_ptr_base, filter_ptr, in_width, in_channels,
out_height, h, tile_height, out_width,
out_image_size, out_ptr_base, 0, 1);
#endif
} // c
} // h
} // m
} // b
}
} // namespace ops
} // namespace mace
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
#endif
#include "mace/ops/arm/conv_2d_neon.h"
namespace mace {
namespace ops {
// Ho = 1, Wo = 4, Co = 4
void Conv2dNeonK1x7S1(const float *input,
const float *filter,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; m += 4) {
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_channels = in_shape[1];
const index_t in_width = in_shape[3];
if (m + 3 < out_channels) {
float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON)
float *out_ptr1_base =
output + b * out_batch_size + (m + 1) * out_image_size;
float *out_ptr2_base =
output + b * out_batch_size + (m + 2) * out_image_size;
float *out_ptr3_base =
output + b * out_batch_size + (m + 3) * out_image_size;
#endif
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr0 = filter + m * in_channels * 7 + c * 7;
#if defined(MACE_ENABLE_NEON)
const float *filter_ptr1 = filter + (m + 1) * in_channels * 7 + c * 7;
const float *filter_ptr2 = filter + (m + 2) * in_channels * 7 + c * 7;
const float *filter_ptr3 = filter + (m + 3) * in_channels * 7 + c * 7;
/* load filter (4 outch x 1 height x 4 width) */
float32x4_t vf00, vf01;
float32x4_t vf10, vf11;
float32x4_t vf20, vf21;
float32x4_t vf30, vf31;
vf00 = vld1q_f32(filter_ptr0);
vf01 = vld1q_f32(filter_ptr0 + 3);
vf10 = vld1q_f32(filter_ptr1);
vf11 = vld1q_f32(filter_ptr1 + 3);
vf20 = vld1q_f32(filter_ptr2);
vf21 = vld1q_f32(filter_ptr2 + 3);
vf30 = vld1q_f32(filter_ptr3);
vf31 = vld1q_f32(filter_ptr3 + 3);
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// output (4 outch x 1 height x 4 width): vo_outch_height
float32x4_t vo0, vo1, vo2, vo3;
// load output
index_t out_offset = h * out_width + w;
vo0 = vld1q_f32(out_ptr0_base + out_offset);
vo1 = vld1q_f32(out_ptr1_base + out_offset);
vo2 = vld1q_f32(out_ptr2_base + out_offset);
vo3 = vld1q_f32(out_ptr3_base + out_offset);
// 10-wide input window: 3 aligned loads plus shifted views
float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8;
// input offset
index_t in_offset = h * in_width + w;
// load input
vi0 = vld1q_f32(in_ptr_base + in_offset);
vi4 = vld1q_f32(in_ptr_base + in_offset + 4);
vi8 = vld1q_f32(in_ptr_base + in_offset + 8);
vi1 = vextq_f32(vi0, vi4, 1);
vi2 = vextq_f32(vi0, vi4, 2);
vi3 = vextq_f32(vi0, vi4, 3);
vi5 = vextq_f32(vi4, vi8, 1);
vi6 = vextq_f32(vi4, vi8, 2);
#if defined(__aarch64__)
/* outch 0 */
vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0);
vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1);
vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2);
vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3);
vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1);
vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2);
vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3);
/* outch 1 */
vo1 = vfmaq_laneq_f32(vo1, vi0, vf10, 0);
vo1 = vfmaq_laneq_f32(vo1, vi1, vf10, 1);
vo1 = vfmaq_laneq_f32(vo1, vi2, vf10, 2);
vo1 = vfmaq_laneq_f32(vo1, vi3, vf10, 3);
vo1 = vfmaq_laneq_f32(vo1, vi4, vf11, 1);
vo1 = vfmaq_laneq_f32(vo1, vi5, vf11, 2);
vo1 = vfmaq_laneq_f32(vo1, vi6, vf11, 3);
/* outch 2 */
vo2 = vfmaq_laneq_f32(vo2, vi0, vf20, 0);
vo2 = vfmaq_laneq_f32(vo2, vi1, vf20, 1);
vo2 = vfmaq_laneq_f32(vo2, vi2, vf20, 2);
vo2 = vfmaq_laneq_f32(vo2, vi3, vf20, 3);
vo2 = vfmaq_laneq_f32(vo2, vi4, vf21, 1);
vo2 = vfmaq_laneq_f32(vo2, vi5, vf21, 2);
vo2 = vfmaq_laneq_f32(vo2, vi6, vf21, 3);
/* outch 3 */
vo3 = vfmaq_laneq_f32(vo3, vi0, vf30, 0);
vo3 = vfmaq_laneq_f32(vo3, vi1, vf30, 1);
vo3 = vfmaq_laneq_f32(vo3, vi2, vf30, 2);
vo3 = vfmaq_laneq_f32(vo3, vi3, vf30, 3);
vo3 = vfmaq_laneq_f32(vo3, vi4, vf31, 1);
vo3 = vfmaq_laneq_f32(vo3, vi5, vf31, 2);
vo3 = vfmaq_laneq_f32(vo3, vi6, vf31, 3);
#else
/* outch 0 */
vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0);
vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1);
vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0);
vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1);
vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1);
vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0);
vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1);
/* outch 1 */
vo1 = vmlaq_lane_f32(vo1, vi0, vget_low_f32(vf10), 0);
vo1 = vmlaq_lane_f32(vo1, vi1, vget_low_f32(vf10), 1);
vo1 = vmlaq_lane_f32(vo1, vi2, vget_high_f32(vf10), 0);
vo1 = vmlaq_lane_f32(vo1, vi3, vget_high_f32(vf10), 1);
vo1 = vmlaq_lane_f32(vo1, vi4, vget_low_f32(vf11), 1);
vo1 = vmlaq_lane_f32(vo1, vi5, vget_high_f32(vf11), 0);
vo1 = vmlaq_lane_f32(vo1, vi6, vget_high_f32(vf11), 1);
/* outch 2 */
vo2 = vmlaq_lane_f32(vo2, vi0, vget_low_f32(vf20), 0);
vo2 = vmlaq_lane_f32(vo2, vi1, vget_low_f32(vf20), 1);
vo2 = vmlaq_lane_f32(vo2, vi2, vget_high_f32(vf20), 0);
vo2 = vmlaq_lane_f32(vo2, vi3, vget_high_f32(vf20), 1);
vo2 = vmlaq_lane_f32(vo2, vi4, vget_low_f32(vf21), 1);
vo2 = vmlaq_lane_f32(vo2, vi5, vget_high_f32(vf21), 0);
vo2 = vmlaq_lane_f32(vo2, vi6, vget_high_f32(vf21), 1);
/* outch 3 */
vo3 = vmlaq_lane_f32(vo3, vi0, vget_low_f32(vf30), 0);
vo3 = vmlaq_lane_f32(vo3, vi1, vget_low_f32(vf30), 1);
vo3 = vmlaq_lane_f32(vo3, vi2, vget_high_f32(vf30), 0);
vo3 = vmlaq_lane_f32(vo3, vi3, vget_high_f32(vf30), 1);
vo3 = vmlaq_lane_f32(vo3, vi4, vget_low_f32(vf31), 1);
vo3 = vmlaq_lane_f32(vo3, vi5, vget_high_f32(vf31), 0);
vo3 = vmlaq_lane_f32(vo3, vi6, vget_high_f32(vf31), 1);
#endif
vst1q_f32(out_ptr0_base + out_offset, vo0);
vst1q_f32(out_ptr1_base + out_offset, vo1);
vst1q_f32(out_ptr2_base + out_offset, vo2);
vst1q_f32(out_ptr3_base + out_offset, vo3);
} // w
} // h
#else
for (index_t oc = 0; oc < 4; ++oc) {
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0 + oc * in_channels * 7,
in_width, 1, 7, out_height, out_width,
out_ptr0_base + oc * out_image_size, 1);
}
#endif
} // c
} else {
for (index_t mm = m; mm < out_channels; ++mm) {
float *out_ptr0_base =
output + b * out_batch_size + mm * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr0 = filter + mm * in_channels * 7 + c * 7;
#if defined(MACE_ENABLE_NEON)
/* load filter (1 outch x 1 height x 4 width) */
float32x4_t vf00, vf01;
vf00 = vld1q_f32(filter_ptr0);
vf01 = vld1q_f32(filter_ptr0 + 3);
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// output (1 outch x 1 height x 4 width): vo_outch_height
float32x4_t vo0;
// load output
index_t out_offset = h * out_width + w;
vo0 = vld1q_f32(out_ptr0_base + out_offset);
// 10-wide input window: 3 aligned loads plus shifted views
float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8;
// input offset
index_t in_offset = h * in_width + w;
// load input
vi0 = vld1q_f32(in_ptr_base + in_offset);
vi4 = vld1q_f32(in_ptr_base + in_offset + 4);
vi8 = vld1q_f32(in_ptr_base + in_offset + 8);
vi1 = vextq_f32(vi0, vi4, 1);
vi2 = vextq_f32(vi0, vi4, 2);
vi3 = vextq_f32(vi0, vi4, 3);
vi5 = vextq_f32(vi4, vi8, 1);
vi6 = vextq_f32(vi4, vi8, 2);
#if defined(__aarch64__)
vo0 = vfmaq_laneq_f32(vo0, vi0, vf00, 0);
vo0 = vfmaq_laneq_f32(vo0, vi1, vf00, 1);
vo0 = vfmaq_laneq_f32(vo0, vi2, vf00, 2);
vo0 = vfmaq_laneq_f32(vo0, vi3, vf00, 3);
vo0 = vfmaq_laneq_f32(vo0, vi4, vf01, 1);
vo0 = vfmaq_laneq_f32(vo0, vi5, vf01, 2);
vo0 = vfmaq_laneq_f32(vo0, vi6, vf01, 3);
#else
vo0 = vmlaq_lane_f32(vo0, vi0, vget_low_f32(vf00), 0);
vo0 = vmlaq_lane_f32(vo0, vi1, vget_low_f32(vf00), 1);
vo0 = vmlaq_lane_f32(vo0, vi2, vget_high_f32(vf00), 0);
vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1);
vo0 = vmlaq_lane_f32(vo0, vi4, vget_low_f32(vf01), 1);
vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0);
vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1);
#endif
vst1q_f32(out_ptr0_base + out_offset, vo0);
} // w
} // h
#else
Conv2dCPUKHxKWCalc(in_ptr_base, filter_ptr0, in_width, 1, 7,
out_height, out_width, out_ptr0_base, 1);
#endif
} // c
}
} // if
} // m
} // b
}
} // namespace ops
} // namespace mace
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_CONV_WINOGRAD_H_
#define MACE_OPS_ARM_CONV_WINOGRAD_H_
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
#include <arm_neon.h>
#endif
#include "mace/core/types.h"
#include "mace/ops/sgemm.h"
namespace mace {
namespace ops {
void TransformFilter4x4(const float *filter,
const index_t in_channels,
const index_t out_channels,
float *output);
void TransformFilter8x8(const float *filter,
const index_t in_channels,
const index_t out_channels,
float *output);
void WinoGradConv3x3s1(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_channels,
const int out_tile_size,
float *output,
SGemm *sgemm,
ScratchBuffer *scratch_buffer);
void WinoGradConv3x3s1(const float *input,
const float *transformed_filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_channels,
const int out_tile_size,
float *transformed_input,
float *transformed_output,
float *output,
SGemm *sgemm,
ScratchBuffer *scratch_buffer);
void ConvRef3x3s1(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_channels,
float *output);
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_CONV_WINOGRAD_H_
// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <algorithm>
#include <memory>
#include <random>
#include "mace/core/tensor.h"
#include "mace/core/types.h"
#include "mace/ops/arm/conv_winograd.h"
namespace mace {
namespace ops {
TEST(ConvWinogradTest, winograd) {
index_t batch = 1;
index_t in_height = 32;
index_t in_width = 32;
index_t in_channels = 64;
index_t out_channels = 128;
index_t out_height = in_height - 2;
index_t out_width = in_width - 2;
index_t input_size = batch * in_channels * in_height * in_width;
index_t filter_size = 3 * 3 * in_channels * out_channels;
index_t output_size = batch * out_channels * out_height * out_width;
Tensor input(GetCPUAllocator(), DataType::DT_FLOAT);
Tensor filter(GetCPUAllocator(), DataType::DT_FLOAT);
Tensor output(GetCPUAllocator(), DataType::DT_FLOAT);
Tensor output_ref(GetCPUAllocator(), DataType::DT_FLOAT);
input.Resize({batch, in_channels, in_height, in_width});
filter.Resize({out_channels, in_channels, 3, 3});
output.Resize({batch, out_channels, out_height, out_width});
output_ref.Resize({batch, out_channels, out_height, out_width});
float *input_data = input.mutable_data<float>();
float *filter_data = filter.mutable_data<float>();
float *output_data = output.mutable_data<float>();
float *output_data_ref = output_ref.mutable_data<float>();
std::random_device rd;
std::mt19937 gen(rd());
std::normal_distribution<float> nd(0, 1);
std::generate(input_data, input_data + input_size, [&gen, &nd] {
return std::max(-1.0f, std::min(1.0f, nd(gen)));
});
std::generate(filter_data, filter_data + filter_size, [&gen, &nd] {
return std::max(-1.0f, std::min(1.0f, nd(gen)));
});
ops::ConvRef3x3s1(input_data, filter_data, batch, in_height, in_width,
in_channels, out_channels, output_data_ref);
SGemm sgemm;
ops::WinoGradConv3x3s1(input_data, filter_data, batch, in_height,
in_width, in_channels, out_channels, 6,
output_data, &sgemm, nullptr);
// compare Winograd result against the direct-convolution reference
for (index_t i = 0; i < output_size; ++i) {
EXPECT_NEAR(output_data_ref[i], output_data[i], 0.1) << " with index " << i;
}
}
} // namespace ops
} // namespace mace
// Copyright 2018 The MACE Authors. All Rights Reserved. // Copyright 2019 The MACE Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
......
// Copyright 2018 The MACE Authors. All Rights Reserved. // Copyright 2019 The MACE Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/core/macros.h" #include "mace/utils/macros.h"
#include "mace/ops/arm/deconv_2d_neon.h" #include "mace/ops/arm/deconv_2d_neon.h"
namespace mace { namespace mace {
......
// Copyright 2018 The MACE Authors. All Rights Reserved. // Copyright 2019 The MACE Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
-#include "mace/core/macros.h"
+#include "mace/utils/macros.h"
 #include "mace/ops/arm/deconv_2d_neon.h"
 namespace mace {
...
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "mace/core/macros.h"
+#include "mace/utils/macros.h"
 #include "mace/ops/arm/deconv_2d_neon.h"
 namespace mace {
...
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
...
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -16,7 +16,7 @@
 #include <arm_neon.h>
 #endif
-#include "mace/core/macros.h"
+#include "mace/utils/macros.h"
 #include "mace/ops/arm/depthwise_conv2d_neon.h"
 namespace mace {
...
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
...
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "mace/core/macros.h"
+#include "mace/utils/macros.h"
 #include "mace/ops/arm/depthwise_deconv2d_neon.h"
 namespace mace {
...
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "mace/core/macros.h"
+#include "mace/utils/macros.h"
 #include "mace/ops/arm/deconv_2d_neon.h"
 namespace mace {
...
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <utility>
#include <algorithm>
#include "mace/ops/arm/fp32/conv_2d.h"
#include "mace/utils/memory.h"
namespace mace {
namespace ops {
namespace arm {
namespace fp32 {
void Conv2dBase::CalOutputShapeAndPadSize(const Tensor *input,
                                          const Tensor *filter,
                                          const int out_tile_height,
                                          const int out_tile_width,
                                          std::vector<index_t> *output_shape,
                                          std::vector<int> *in_pad_size,
                                          std::vector<int> *out_pad_size) {
  in_pad_size->resize(4);
  out_pad_size->resize(4);
  output_shape->resize(4);
  const index_t in_height = input->dim(2);
  const index_t in_width = input->dim(3);
  const index_t stride_h = strides_[0];
  const index_t stride_w = strides_[1];
  const index_t dilation_h = dilations_[0];
  const index_t dilation_w = dilations_[1];
  const index_t filter_h = filter->dim(2);
  const index_t filter_w = filter->dim(3);
  std::vector<int> paddings(2);
  if (paddings_.empty()) {
    CalcNCHWPaddingAndOutputSize(input->shape().data(),
                                 filter->shape().data(),
                                 dilations_.data(),
                                 strides_.data(),
                                 padding_type_,
                                 output_shape->data(),
                                 paddings.data());
  } else {
    paddings = paddings_;
    CalcNCHWOutputSize(input->shape().data(),
                       filter->shape().data(),
                       paddings_.data(),
                       dilations_.data(),
                       strides_.data(),
                       RoundType::FLOOR,
                       output_shape->data());
  }
  const index_t out_height = (*output_shape)[2];
  const index_t out_width = (*output_shape)[3];
  const index_t
      padded_out_height = RoundUp<index_t>(out_height, out_tile_height);
  const index_t padded_out_width = RoundUp<index_t>(out_width, out_tile_width);
  const index_t padded_in_height =
      std::max(in_height + paddings[0], (padded_out_height - 1) * stride_h
          + (filter_h - 1) * dilation_h + 1);
  const index_t padded_in_width =
      std::max(in_width + paddings[1], (padded_out_width - 1) * stride_w
          + (filter_w - 1) * dilation_w + 1);
  (*in_pad_size)[0] = paddings[0] >> 1;
  (*in_pad_size)[1] =
      static_cast<int>(padded_in_height - in_height - (*in_pad_size)[0]);
  (*in_pad_size)[2] = paddings[1] >> 1;
  (*in_pad_size)[3] =
      static_cast<int>(padded_in_width - in_width - (*in_pad_size)[2]);
  (*out_pad_size)[0] = 0;
  (*out_pad_size)[1] = static_cast<int>(padded_out_height - out_height);
  (*out_pad_size)[2] = 0;
  (*out_pad_size)[3] = static_cast<int>(padded_out_width - out_width);
}
MaceStatus Conv2dBase::ResizeOutAndPadInOut(const OpContext *context,
                                            const Tensor *input,
                                            const Tensor *filter,
                                            Tensor *output,
                                            const int out_tile_height,
                                            const int out_tile_width,
                                            std::unique_ptr<const Tensor>
                                            *padded_input,
                                            std::unique_ptr<Tensor>
                                            *padded_output) {
  std::vector<index_t> output_shape;
  std::vector<int> in_pad_size;
  std::vector<int> out_pad_size;
  CalOutputShapeAndPadSize(input,
                           filter,
                           out_tile_height,
                           out_tile_width,
                           &output_shape,
                           &in_pad_size,
                           &out_pad_size);
  MACE_RETURN_IF_ERROR(output->Resize(output_shape));
  const index_t batch = input->dim(0);
  const index_t in_channels = input->dim(1);
  const index_t in_height = input->dim(2);
  const index_t in_width = input->dim(3);
  const index_t out_channels = output->dim(1);
  const index_t out_height = output->dim(2);
  const index_t out_width = output->dim(3);
  const index_t padded_in_height = in_height + in_pad_size[0] + in_pad_size[1];
  const index_t padded_in_width = in_width + in_pad_size[2] + in_pad_size[3];
  const index_t
      padded_out_height = out_height + out_pad_size[0] + out_pad_size[1];
  const index_t
      padded_out_width = out_width + out_pad_size[2] + out_pad_size[3];
  const bool is_in_padded =
      padded_in_height != in_height || padded_in_width != in_width;
  const bool is_out_padded =
      padded_out_height != out_height || padded_out_width != out_width;
  auto scratch_buffer = context->device()->scratch_buffer();
  const index_t padded_in_size =
      MACE_EXTRA_BUFFER_PAD_SIZE + (is_in_padded ? PadAlignSize(
          sizeof(float) * batch * in_channels * padded_in_height
              * padded_in_width) : 0);
  const index_t padded_out_size = is_out_padded ? PadAlignSize(
      sizeof(float) * batch * out_channels * padded_out_height
          * padded_out_width) : 0;
  scratch_buffer->Rewind();
  scratch_buffer->GrowSize(padded_in_size + padded_out_size);
  if (is_in_padded) {
    std::unique_ptr<Tensor> padded_in =
        make_unique<Tensor>(scratch_buffer->Scratch(padded_in_size),
                            DataType::DT_FLOAT);
    padded_in->Resize({batch, in_channels, padded_in_height, padded_in_width});
    PadInput(*input, in_pad_size[0], in_pad_size[2], padded_in.get());
    *padded_input = std::move(padded_in);
  }
  if (is_out_padded) {
    std::unique_ptr<Tensor> padded_out =
        make_unique<Tensor>(scratch_buffer->Scratch(padded_out_size),
                            DataType::DT_FLOAT);
    padded_out->Resize({batch, out_channels, padded_out_height,
                        padded_out_width});
    *padded_output = std::move(padded_out);
  }
  return MaceStatus::MACE_SUCCESS;
}
void Conv2dBase::PadInput(const Tensor &src,
                          const int pad_top,
                          const int pad_left,
                          mace::Tensor *dst) {
  if (dst == &src) return;
  const index_t batch = src.dim(0);
  const index_t channels = src.dim(1);
  const index_t height = src.dim(2);
  const index_t width = src.dim(3);
  const index_t padded_height = dst->dim(2);
  const index_t padded_width = dst->dim(3);
  const int pad_bottom = static_cast<int>(padded_height - height - pad_top);
  const int pad_right = static_cast<int>(padded_width - width - pad_left);
  auto in_data = src.data<float>();
  auto padded_in_data = dst->mutable_data<float>();
  const index_t img_size = height * width;
  const index_t padded_img_size = padded_height * padded_width;
#pragma omp parallel for collapse(2) schedule(runtime)
  for (index_t b = 0; b < batch; ++b) {
    for (index_t c = 0; c < channels; ++c) {
      const index_t bc = b * channels + c;
      const float *in_base = in_data + bc * img_size;
      float *padded_in_base = padded_in_data + bc * padded_img_size;
      memset(padded_in_base, 0, sizeof(float) * pad_top * padded_width);
      padded_in_base += pad_top * padded_width;
      for (index_t h = 0; h < height; ++h) {
        memset(padded_in_base, 0, sizeof(float) * pad_left);
        memcpy(padded_in_base + pad_left, in_base, sizeof(float) * width);
        memset(padded_in_base + pad_left + width,
               0,
               sizeof(float) * pad_right);
        in_base += width;
        padded_in_base += padded_width;
      }
      memset(padded_in_base, 0, sizeof(float) * pad_bottom * padded_width);
    }
  }
}
void Conv2dBase::UnPadOutput(const mace::Tensor &src, mace::Tensor *dst) {
  if (dst == &src) return;
  const index_t batch = dst->dim(0);
  const index_t channels = dst->dim(1);
  const index_t height = dst->dim(2);
  const index_t width = dst->dim(3);
  const index_t padded_height = src.dim(2);
  const index_t padded_width = src.dim(3);
  auto padded_out_data = src.data<float>();
  auto out_data = dst->mutable_data<float>();
  const index_t img_size = height * width;
  const index_t padded_img_size = padded_height * padded_width;
#pragma omp parallel for collapse(2) schedule(runtime)
  for (index_t b = 0; b < batch; ++b) {
    for (index_t c = 0; c < channels; ++c) {
      const index_t bc = (b * channels + c);
      float *out_base = out_data + bc * img_size;
      const float *padded_out_base = padded_out_data + bc * padded_img_size;
      for (index_t h = 0; h < height; ++h) {
        memcpy(out_base, padded_out_base, sizeof(float) * width);
        out_base += width;
        padded_out_base += padded_width;
      }  // h
    }  // c
  }  // b
}
}  // namespace fp32
}  // namespace arm
}  // namespace ops
}  // namespace mace
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -15,10 +15,14 @@
 #ifndef MACE_OPS_ARM_FP32_CONV_2D_H_
 #define MACE_OPS_ARM_FP32_CONV_2D_H_
+#include <vector>
+#include <memory>
 #include "mace/public/mace.h"
 #include "mace/core/tensor.h"
 #include "mace/core/op_context.h"
 #include "mace/ops/arm/fp32/gemm.h"
+#include "mace/ops/common/conv_pool_2d_util.h"
 namespace mace {
 namespace ops {
@@ -27,13 +31,51 @@ namespace fp32 {
 class Conv2dBase {
  public:
-  Conv2dBase() = default;
+  Conv2dBase(const std::vector<int> strides,
+             const std::vector<int> dilations,
+             const std::vector<int> paddings,
+             const Padding padding_type)
+      : strides_(strides),
+        dilations_(dilations),
+        paddings_(paddings),
+        padding_type_(padding_type) {}
   virtual ~Conv2dBase() = default;
   virtual MaceStatus Compute(
       const OpContext *context,
       const Tensor *input,
       const Tensor *filter,
       Tensor *output) = 0;
+
+ protected:
+  void CalOutputShapeAndPadSize(const Tensor *input,
+                                const Tensor *filter,
+                                const int out_tile_height,
+                                const int out_tile_width,
+                                std::vector<index_t> *output_shape,
+                                std::vector<int> *in_pad_size,
+                                std::vector<int> *out_pad_size);
+
+  MaceStatus ResizeOutAndPadInOut(const OpContext *context,
+                                  const Tensor *input,
+                                  const Tensor *filter,
+                                  Tensor *output,
+                                  const int out_tile_height,
+                                  const int out_tile_width,
+                                  std::unique_ptr<const Tensor> *padded_input,
+                                  std::unique_ptr<Tensor> *padded_output);
+
+  void PadInput(const Tensor &src,
+                const int pad_top,
+                const int pad_left,
+                Tensor *dst);
+
+  void UnPadOutput(const Tensor &src, Tensor *dst);
+
+  const std::vector<int> strides_;
+  const std::vector<int> dilations_;
+  const std::vector<int> paddings_;
+  const Padding padding_type_;
 };
 }  // namespace fp32
...
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "mace/ops/arm/fp32/conv_2d_1x1.h"
 namespace mace {
@@ -25,20 +24,68 @@ MaceStatus Conv2dK1x1::Compute(const OpContext *context,
                               const Tensor *filter,
                               Tensor *output) {
   index_t batch = input->dim(0);
-  index_t height = input->dim(2);
-  index_t width = input->dim(3);
+  index_t in_height = input->dim(2);
+  index_t in_width = input->dim(3);
   index_t in_channels = input->dim(1);
-  index_t out_channels = filter->dim(0);
-  MACE_RETURN_IF_ERROR(output->Resize({batch, out_channels, height, width}));
-  context->device()->scratch_buffer()->Rewind();
+  std::vector<index_t> output_shape;
+  std::vector<int> in_pad_size;
+  std::vector<int> out_pad_size;
+  CalOutputShapeAndPadSize(input,
+                           filter,
+                           1,
+                           1,
+                           &output_shape,
+                           &in_pad_size,
+                           &out_pad_size);
+  MACE_RETURN_IF_ERROR(output->Resize(output_shape));
+  const index_t out_channels = output_shape[1];
+  const index_t out_height = output_shape[2];
+  const index_t out_width = output_shape[3];
+  const index_t padded_in_height = in_height + in_pad_size[0] + in_pad_size[1];
+  const index_t padded_in_width = in_width + in_pad_size[2] + in_pad_size[3];
+
+  // pad input and transform input
+  const bool is_in_padded =
+      in_height != padded_in_height || in_width != padded_in_width;
+  auto scratch_buffer = context->device()->scratch_buffer();
+  const index_t padded_in_size = is_in_padded ? PadAlignSize(
+      sizeof(float) * batch * in_channels * padded_in_height
+          * padded_in_width) : 0;
+  const index_t pack_filter_size =
+      PadAlignSize(sizeof(float) * out_channels * in_channels);
+  const index_t pack_input_size =
+      PadAlignSize(
+          sizeof(float) * in_channels * padded_in_height * padded_in_width);
+  const index_t pack_output_size =
+      PadAlignSize(
+          sizeof(float) * out_channels * padded_in_height * padded_in_width);
+  const index_t gemm_pack_size =
+      pack_filter_size + pack_input_size + pack_output_size;
+  scratch_buffer->Rewind();
+  scratch_buffer->GrowSize(padded_in_size + gemm_pack_size);
+
+  const Tensor *padded_in = input;
+  Tensor tmp_padded_in
+      (scratch_buffer->Scratch(padded_in_size), DataType::DT_FLOAT);
+  if (is_in_padded) {
+    tmp_padded_in.Resize({batch, in_channels, padded_in_height,
+                          padded_in_width});
+    PadInput(*input, in_pad_size[0], in_pad_size[2], &tmp_padded_in);
+    padded_in = &tmp_padded_in;
+  }
+
   return gemm_.Compute(context,
                        filter,
-                       input,
+                       padded_in,
                        batch,
                        out_channels,
                        in_channels,
                        in_channels,
-                       height * width,
+                       out_height * out_width,
                        false,
                        false,
                        false,
...
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -15,6 +15,7 @@
 #ifndef MACE_OPS_ARM_FP32_CONV_2D_1X1_H_
 #define MACE_OPS_ARM_FP32_CONV_2D_1X1_H_
+#include <vector>
 #include "mace/public/mace.h"
 #include "mace/core/tensor.h"
 #include "mace/core/op_context.h"
@@ -28,7 +29,8 @@ namespace fp32 {
 class Conv2dK1x1 : public Conv2dBase {
  public:
-  Conv2dK1x1() : gemm_(true) {}
+  Conv2dK1x1(const std::vector<int> paddings, const Padding padding_type)
+      : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
   virtual ~Conv2dK1x1() {}
   MaceStatus Compute(
...
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_FP32_CONV_2D_3X3_H_
#define MACE_OPS_ARM_FP32_CONV_2D_3X3_H_
#include <vector>
#include "mace/public/mace.h"
#include "mace/core/tensor.h"
#include "mace/core/op_context.h"
#include "mace/ops/arm/fp32/conv_2d.h"
namespace mace {
namespace ops {
namespace arm {
namespace fp32 {
class Conv2dK3x3S1 : public Conv2dBase {
 public:
  Conv2dK3x3S1(const std::vector<int> paddings, const Padding padding_type)
      : Conv2dBase({1, 1}, {1, 1}, paddings, padding_type) {}
  virtual ~Conv2dK3x3S1() {}
  MaceStatus Compute(
      const OpContext *context,
      const Tensor *input,
      const Tensor *filter,
      Tensor *output);
};
class Conv2dK3x3S2 : public Conv2dBase {
 public:
  Conv2dK3x3S2(const std::vector<int> paddings, const Padding padding_type)
      : Conv2dBase({2, 2}, {1, 1}, paddings, padding_type) {}
  virtual ~Conv2dK3x3S2() {}
  MaceStatus Compute(
      const OpContext *context,
      const Tensor *input,
      const Tensor *filter,
      Tensor *output);
};
}  // namespace fp32
}  // namespace arm
}  // namespace ops
}  // namespace mace
#endif  // MACE_OPS_ARM_FP32_CONV_2D_3X3_H_
-// Copyright 2018 The MACE Authors. All Rights Reserved.
+// Copyright 2019 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
...